diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..426af4eb6909e3c5e7c1343472d1051a0155f727 Binary files /dev/null and b/.DS_Store differ diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..a6df65ab7f71ac324ffc87aaebbb8fdf3e5e6df8 --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189f16e272512604f417acb61041d13ea8cf5491faaf3775a73fa8eb7acfcba9 +size 3695848 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7a9eaeace23d23fb0876312d6086768bdbb6d42 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7446647508a3f1e193d8002f3abea6912b4f0b6763c7dc576d37e298b2eaf9 +size 7433594 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..16771faa3f088413763e4ed8c5ff3fe5cb439d23 --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b6b7e6cf0b505cf781760539befb21856622708465aec629481b220a020626 +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1b090ccb725bac0f6385b5ebc76d447866e6428 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:798d14d73ca2d10bac9b41826264fce38de23cc8ba8f11a956f55eaab401ae24 +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..290af034d0b7672d59c0aeb629bd2c38e117de5a --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,99 @@ +{ + "best_metric": 2.126384735107422, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-100", + "epoch": 0.17777777777777778, + "eval_steps": 100, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 8.247898064093184e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e8fcc3c306544852763bc92d68bbeed6030b9b64 --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7916de9ea406d8a03c10557143738d35307ad45987482ebb9e93d9db165ad52 +size 3695848 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d87715c66afd2b4d5f523f9ead37f0ebefd3f745 --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd862fdf8905f2cd906d896bfa4adb91e72c8b2a9e37933d9818163f89b1855e +size 7433594 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0624c55dbf111a545343527344831df83b9e31a --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e16eaed4b8b35384ad183eb82126dc2f08137ade07d11c29930267932cc1ea1 +size 14244 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fba7abf5b78ebb770295701b05eb8e07f4c4edb --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53349943fc3bda7b7686dfb1b7835d2ed6729aa6a46db4b4e53d5c6f633658c9 +size 1064 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8f3eb22b0270d8b0d2a2bfcf161b9fc4bf96df87 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,801 @@ +{ + "best_metric": 1.5386379957199097, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1000", + "epoch": 1.7777777777777777, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 7.861412473287475e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1100/README.md b/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1100/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1100/adapter_config.json b/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1100/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1100/adapter_model.safetensors b/checkpoint-1100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..20324857492decc395a1b92e283e21507504c3cf --- /dev/null +++ b/checkpoint-1100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b9eb06e71d7973f8a07ec27b6a40c58f903b3a4ba384017bc0f6e2a2196c4b +size 3695848 diff --git a/checkpoint-1100/optimizer.pt b/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d0916b7af282fe5bf99d6607ec06d794b16b7ef --- /dev/null +++ b/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:725db80f8de0a9d1939950241fb60493a3d4a4da625c46af710b20696067ba64 +size 7433594 diff --git a/checkpoint-1100/rng_state.pth b/checkpoint-1100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3e25cff94241d1d51da6ec770b900487240d0503 --- /dev/null +++ b/checkpoint-1100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:664509c403435e02712a1f6ee4f0fe06f1cdce5a2a4e368b12808975f9d67973 +size 14244 diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..db156dd70fb7b899a4fe2d7d51de608c3e5f4ced --- /dev/null +++ b/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddf8f94804871a990551a025205f7d78b0c82ddb68268b867708cefdd898eeb3 +size 1064 diff --git a/checkpoint-1100/special_tokens_map.json b/checkpoint-1100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1100/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1100/tokenizer.model b/checkpoint-1100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1100/tokenizer_config.json b/checkpoint-1100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1100/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..185d3b92150a992a9f7646349a3b099973f53e06 --- /dev/null +++ b/checkpoint-1100/trainer_state.json @@ -0,0 +1,879 @@ +{ + "best_metric": 1.5198711156845093, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1100", + "epoch": 1.9555555555555557, + "eval_steps": 100, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 8.600909737867346e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1200/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1200/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1200/adapter_model.safetensors b/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5ccfea1bcb22bda64943df38e447c45056a8d400 --- /dev/null +++ b/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ffbe8de08fe4d834e15ba4cb1361370d23f5218e15f7836f3f1863cbfd4b84 +size 3695848 diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4508df3094ed72add16d9163e28df44d9520f2b --- /dev/null +++ b/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d14447e20d16512af5af2f3d5355e9061f5d86b9f45fdd6e3de7f65af2f7e6ec +size 7433594 diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..4b31c5601a7f39fdb5f7c7b205ce6b1b838aca11 --- /dev/null +++ b/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad7b75b55e4564c8897b2e74676b56608a7a54046d34b7140d36e3fecaa54d17 +size 14244 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f37fc99403e2438d01888f45bad1610cfbbd6e42 --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c16774d623c8977b5af4f890ada5f121527f570596390b865e9450991873920b +size 1064 diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1200/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f3c97c1c9faee7962f48b3c1bef222f508e11d43 --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,957 @@ +{ + "best_metric": 1.5145190954208374, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1200", + "epoch": 2.1333333333333333, + "eval_steps": 100, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 9.345143605029765e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1300/README.md b/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1300/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1300/adapter_config.json b/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1300/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1300/adapter_model.safetensors b/checkpoint-1300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..51e366d242f788d8a7fddfa8f75f10c5aaea593a --- /dev/null +++ b/checkpoint-1300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd707712dd82358da79a33ee9f1c986f6405353e769651f7ac4db92b2079d77 +size 3695848 diff --git a/checkpoint-1300/optimizer.pt b/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..27669f3a0e0681bd0144e201651e338001917032 --- /dev/null +++ b/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71434438c6b8408bde4caf0128d81bbeb25b13b24e638b72f5195085c2e81e47 +size 7433594 diff --git a/checkpoint-1300/rng_state.pth b/checkpoint-1300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..986a56f5712aeaae6e1215bde30448d4d9db638d --- /dev/null +++ b/checkpoint-1300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdcb35c24b5daeb5cb101341c361e48eb368eb2034e53248c5f637b582676b7a +size 14244 diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..58e58fce20e078ddc45b92aca0c5f670daa19fc5 --- /dev/null +++ b/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c165513408538ccddcaa8244e302fcd0ee9c15713dd0c456621a1c0c3d17aec0 +size 1064 diff --git a/checkpoint-1300/special_tokens_map.json b/checkpoint-1300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1300/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1300/tokenizer.model b/checkpoint-1300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1300/tokenizer_config.json b/checkpoint-1300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1300/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a7408217e92c6df9676a753d263b9b0ec6e23469 --- /dev/null +++ b/checkpoint-1300/trainer_state.json @@ -0,0 +1,1035 @@ +{ + "best_metric": 1.504623532295227, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1300", + "epoch": 2.311111111111111, + "eval_steps": 100, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.010572452744659e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1300/training_args.bin b/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1400/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1400/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1400/adapter_model.safetensors b/checkpoint-1400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..13827582981f91b9d2fe89ec1543a804cae9f5d2 --- /dev/null +++ b/checkpoint-1400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c214b62247a52aa4942afe49e726bbfffc8ef04f492c747323bcbdb317f0aa +size 3695848 diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b587a86066aa9819b46081fb6bc9aa6bbcdbd682 --- /dev/null +++ b/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f948275636df9d282451ca91bcb81adf0ae49ad0a8ca11735782fbbc7b5447b2 +size 7433594 diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cfd4c8122f37f963437cdd7519832c075f8941b6 --- /dev/null +++ b/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9248879f882b1ddec0c0bf83dda9481bdf345a65bb26a9a0ca7a4fad7a6a52d +size 14244 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..417c8f49d2aad3440f71c484bde6bef44d208d45 --- /dev/null +++ b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd20179f117c92cee3e8119d3f69bc4babcc8c74f8384e502c07940a1e5b1528 +size 1064 diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1400/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1400/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..da0258aacb9b11024abf033834468ba0ca431608 --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,1113 @@ +{ + "best_metric": 1.498180627822876, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1400", + "epoch": 2.488888888888889, + "eval_steps": 100, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.0866907152427254e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1500/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c58f92f9282867083740399e2cb04690e77a4152 --- /dev/null +++ b/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c29f3e1f1182e346950cbf796e51d4023338b936e020e1970ef13e67ae30cc8a +size 3695848 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c48e3d2840af32965852b4f58b06f5c2403ed0d6 --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6ccf96b1459f5d6795b645817dc45059dfb077f42b2acdceaa20b0b67be88b +size 7433594 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..54688aa49f444f096ca34a005ad8ae3326bc2187 --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a7e6f6110044ba16125efcc10e09e6a75d8e8e1308c4939cf51fddad6624a54 +size 14244 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..020d2165b03e9c8f7dda028e8bf3f6dbdfe953cf --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16d14b00a11f0ce9fda4d09d4f83450cf1a6468eea32573e43c47e34aed9fbc +size 1064 diff --git a/checkpoint-1500/special_tokens_map.json b/checkpoint-1500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1500/tokenizer.model b/checkpoint-1500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a268aa501066666d6297378a45c8bfc62c7ba75c --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,1191 @@ +{ + "best_metric": 1.498180627822876, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1400", + "epoch": 2.6666666666666665, + "eval_steps": 100, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.1665294250335273e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1600/README.md b/checkpoint-1600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1600/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1600/adapter_config.json b/checkpoint-1600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1600/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1600/adapter_model.safetensors b/checkpoint-1600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..60307a4282c6ac66b63e79b42dbf951f751952db --- /dev/null +++ b/checkpoint-1600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5997cd1e1ccec7645a35d86f8fb3e685494291279d220450fde774953970c028 +size 3695848 diff --git a/checkpoint-1600/optimizer.pt b/checkpoint-1600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6930d510ab71a672f095b199a270d3056f4a7b60 --- /dev/null +++ b/checkpoint-1600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46a3d7f9f91db412b6ce432714182422202e4a858c868a464e530597d63bc934 +size 7433594 diff --git a/checkpoint-1600/rng_state.pth b/checkpoint-1600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..df2a013b58dccbd0d383abcb10944297e155f58c --- /dev/null +++ b/checkpoint-1600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da10fd0dc4281cc8906f40adc00a6ebd1867abdeadf3602640e25ca4449c6630 +size 14244 diff --git a/checkpoint-1600/scheduler.pt b/checkpoint-1600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4a71877e40d824ed6f34c9e1dc6df004a4da727 --- /dev/null +++ b/checkpoint-1600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f808986c8e680a605b465bc14eeb4fba94c0c8f4e9ad8bd8334ede3febe21b4 +size 1064 diff --git a/checkpoint-1600/special_tokens_map.json b/checkpoint-1600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1600/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1600/tokenizer.model b/checkpoint-1600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1600/tokenizer_config.json b/checkpoint-1600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1600/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1600/trainer_state.json b/checkpoint-1600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6582f7e50f660bfb79fbd8fcae5203cea9853113 --- /dev/null +++ b/checkpoint-1600/trainer_state.json @@ -0,0 +1,1269 @@ +{ + "best_metric": 1.4853577613830566, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1600", + "epoch": 2.8444444444444446, + "eval_steps": 100, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.2453933766709084e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1600/training_args.bin b/checkpoint-1600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1700/README.md b/checkpoint-1700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1700/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1700/adapter_config.json b/checkpoint-1700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1700/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1700/adapter_model.safetensors b/checkpoint-1700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..77df282344e3816f312621cd121f4dd2ede731eb --- /dev/null +++ b/checkpoint-1700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e6dbff24086204d0d9d21e15cf4610f602c5dc5f8c0c7c27c825b9dd3b944d1 +size 3695848 diff --git a/checkpoint-1700/optimizer.pt b/checkpoint-1700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce4d0fc5b5c572e596b983cd4664418ab581c3ea --- /dev/null +++ b/checkpoint-1700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09320c0a6d3dc01e2a102435328998e5b8c897d519fe03076893261d774f9c92 +size 7433594 diff --git a/checkpoint-1700/rng_state.pth b/checkpoint-1700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..e19ae8362b9d3c7a143e8c04778152446d252b1b --- /dev/null +++ b/checkpoint-1700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:852f3fd5f1617fef0127cb60fc7f76662bb7f88dccf08de78bbe18396f9a5465 +size 14244 diff --git a/checkpoint-1700/scheduler.pt b/checkpoint-1700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..14d49e8d5035bde5b5860b1241687a60cf510f59 --- /dev/null +++ b/checkpoint-1700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f34ed12275ce8cb3775cded317a2ea6bc16fa2d373c2282f4903eede5b8afe4 +size 1064 diff --git a/checkpoint-1700/special_tokens_map.json b/checkpoint-1700/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1700/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1700/tokenizer.model b/checkpoint-1700/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1700/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1700/tokenizer_config.json b/checkpoint-1700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1700/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1700/trainer_state.json b/checkpoint-1700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..488586eabb14abc0c3d8e11773d05cb8e7e2ae10 --- /dev/null +++ b/checkpoint-1700/trainer_state.json @@ -0,0 +1,1347 @@ +{ + "best_metric": 1.478974461555481, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1700", + "epoch": 3.022222222222222, + "eval_steps": 100, + "global_step": 1700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + }, + { + "epoch": 2.86, + "grad_norm": 13.811309814453125, + "learning_rate": 4.054037404994516e-05, + "loss": 1.4839, + "step": 1610 + }, + { + "epoch": 2.88, + "grad_norm": 5.560975074768066, + "learning_rate": 4.043066199242762e-05, + "loss": 1.4765, + "step": 1620 + }, + { + "epoch": 2.9, + "grad_norm": 35.27302551269531, + "learning_rate": 4.032046775341247e-05, + "loss": 1.4105, + "step": 1630 + }, + { + "epoch": 2.92, + "grad_norm": 4.9896745681762695, + "learning_rate": 4.020979477627907e-05, + "loss": 1.5688, + "step": 1640 + }, + { + "epoch": 2.93, + "grad_norm": 3.5250892639160156, + "learning_rate": 4.0098646519366534e-05, + "loss": 1.4484, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 5.281729698181152, + "learning_rate": 3.998702645586565e-05, + "loss": 1.6017, + "step": 1660 + }, + { + "epoch": 2.97, + "grad_norm": 4.667525768280029, + "learning_rate": 3.9874938073710336e-05, + "loss": 1.5006, + "step": 1670 + }, + { + "epoch": 2.99, + "grad_norm": 4.294438362121582, + "learning_rate": 3.976238487546864e-05, + "loss": 1.4218, + "step": 1680 + }, + { + "epoch": 3.0, + "grad_norm": 4.070734977722168, + "learning_rate": 3.9649370378233365e-05, + "loss": 1.6569, + "step": 1690 + }, + { + "epoch": 3.02, + "grad_norm": 4.640359878540039, + "learning_rate": 3.953589811351204e-05, + "loss": 1.5635, + "step": 1700 + }, + { + "epoch": 3.02, + "eval_loss": 1.478974461555481, + "eval_runtime": 124.6852, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 1700 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.3237080942080164e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1700/training_args.bin b/checkpoint-1700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1800/README.md b/checkpoint-1800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1800/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1800/adapter_config.json b/checkpoint-1800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1800/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1800/adapter_model.safetensors b/checkpoint-1800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..37765c01612761bfa18b5cc486d50f5a80bfdf97 --- /dev/null +++ b/checkpoint-1800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd843fe29d28ffe48a4058c2eb56cf10f2e4def7ace9ed8ba244389ab6da424d +size 3695848 diff --git a/checkpoint-1800/optimizer.pt b/checkpoint-1800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..589a12b10bec59692fd40fec64d429acb0141bd9 --- /dev/null +++ b/checkpoint-1800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b339d3da14343ee9b9148bdf6adcf4efaa5b6e5bb21aa29f51877eb2cf9216 +size 7433594 diff --git a/checkpoint-1800/rng_state.pth b/checkpoint-1800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..64f10103443a944533eac35c97bdab8689d0cf78 --- /dev/null +++ b/checkpoint-1800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfa9e4828b11bf3e5db5b676ee88c74008c62baa76f1779d97fad40fad2af7dc +size 14244 diff --git a/checkpoint-1800/scheduler.pt b/checkpoint-1800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..23a3a937f7ff479251b776f1c96682906e2f8ec1 --- /dev/null +++ b/checkpoint-1800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12ee1bbef9189a4d6d68b66bb79821f46d27258850903ab7df408b4a372d7f34 +size 1064 diff --git a/checkpoint-1800/special_tokens_map.json b/checkpoint-1800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1800/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1800/tokenizer.model b/checkpoint-1800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1800/tokenizer_config.json b/checkpoint-1800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1800/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1800/trainer_state.json b/checkpoint-1800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9971f0a80c41633637137ddb0751e73335b56883 --- /dev/null +++ b/checkpoint-1800/trainer_state.json @@ -0,0 +1,1425 @@ +{ + "best_metric": 1.4779504537582397, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1800", + "epoch": 3.2, + "eval_steps": 100, + "global_step": 1800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + }, + { + "epoch": 2.86, + "grad_norm": 13.811309814453125, + "learning_rate": 4.054037404994516e-05, + "loss": 1.4839, + "step": 1610 + }, + { + "epoch": 2.88, + "grad_norm": 5.560975074768066, + "learning_rate": 4.043066199242762e-05, + "loss": 1.4765, + "step": 1620 + }, + { + "epoch": 2.9, + "grad_norm": 35.27302551269531, + "learning_rate": 4.032046775341247e-05, + "loss": 1.4105, + "step": 1630 + }, + { + "epoch": 2.92, + "grad_norm": 4.9896745681762695, + "learning_rate": 4.020979477627907e-05, + "loss": 1.5688, + "step": 1640 + }, + { + "epoch": 2.93, + "grad_norm": 3.5250892639160156, + "learning_rate": 4.0098646519366534e-05, + "loss": 1.4484, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 5.281729698181152, + "learning_rate": 3.998702645586565e-05, + "loss": 1.6017, + "step": 1660 + }, + { + "epoch": 2.97, + "grad_norm": 4.667525768280029, + "learning_rate": 3.9874938073710336e-05, + "loss": 1.5006, + "step": 1670 + }, + { + "epoch": 2.99, + "grad_norm": 4.294438362121582, + "learning_rate": 3.976238487546864e-05, + "loss": 1.4218, + "step": 1680 + }, + { + "epoch": 3.0, + "grad_norm": 4.070734977722168, + "learning_rate": 3.9649370378233365e-05, + "loss": 1.6569, + "step": 1690 + }, + { + "epoch": 3.02, + "grad_norm": 4.640359878540039, + "learning_rate": 3.953589811351204e-05, + "loss": 1.5635, + "step": 1700 + }, + { + "epoch": 3.02, + "eval_loss": 1.478974461555481, + "eval_runtime": 124.6852, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 1700 + }, + { + "epoch": 3.04, + "grad_norm": 4.43009090423584, + "learning_rate": 3.94219716271167e-05, + "loss": 1.3304, + "step": 1710 + }, + { + "epoch": 3.06, + "grad_norm": 4.001712799072266, + "learning_rate": 3.930759447905298e-05, + "loss": 1.3534, + "step": 1720 + }, + { + "epoch": 3.08, + "grad_norm": 4.664085388183594, + "learning_rate": 3.919277024340891e-05, + "loss": 1.368, + "step": 1730 + }, + { + "epoch": 3.09, + "grad_norm": 4.42681360244751, + "learning_rate": 3.907750250824327e-05, + "loss": 1.4164, + "step": 1740 + }, + { + "epoch": 3.11, + "grad_norm": 7.331808567047119, + "learning_rate": 3.8961794875473394e-05, + "loss": 1.4333, + "step": 1750 + }, + { + "epoch": 3.13, + "grad_norm": 5.612239837646484, + "learning_rate": 3.884565096076269e-05, + "loss": 1.5754, + "step": 1760 + }, + { + "epoch": 3.15, + "grad_norm": 5.236481666564941, + "learning_rate": 3.872907439340758e-05, + "loss": 1.4017, + "step": 1770 + }, + { + "epoch": 3.16, + "grad_norm": 4.995403289794922, + "learning_rate": 3.861206881622419e-05, + "loss": 1.5011, + "step": 1780 + }, + { + "epoch": 3.18, + "grad_norm": 41.0167236328125, + "learning_rate": 3.8494637885434396e-05, + "loss": 1.4472, + "step": 1790 + }, + { + "epoch": 3.2, + "grad_norm": 5.136650562286377, + "learning_rate": 3.837678527055168e-05, + "loss": 1.3939, + "step": 1800 + }, + { + "epoch": 3.2, + "eval_loss": 1.4779504537582397, + "eval_runtime": 124.6728, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 1800 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.4031463107723264e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1800/training_args.bin b/checkpoint-1800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-1900/README.md b/checkpoint-1900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-1900/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1900/adapter_config.json b/checkpoint-1900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-1900/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1900/adapter_model.safetensors b/checkpoint-1900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aaf94f6146a9ac40025436b6af78f2aaf8d23d63 --- /dev/null +++ b/checkpoint-1900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5c8176c4bf9cc1494f5db12726e6962dc5bd6899855d36e1a88e187fa0f41e +size 3695848 diff --git a/checkpoint-1900/optimizer.pt b/checkpoint-1900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0fc890fff8e5481259044266e236ba8aad8b57c3 --- /dev/null +++ b/checkpoint-1900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55409586050dc946264e51e79838642d3e1367cee298211e7922245f5dd8c331 +size 7433594 diff --git a/checkpoint-1900/rng_state.pth b/checkpoint-1900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8fec1b8ead84658fd34d030dbea89657122e87e4 --- /dev/null +++ b/checkpoint-1900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ec95f4c38d7d08b93c7ccf1ae68fe2d234a875add515be76cb3882595dff2c +size 14244 diff --git a/checkpoint-1900/scheduler.pt b/checkpoint-1900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..78d01dad912b67e9dada64bbca5f3da5066b4022 --- /dev/null +++ b/checkpoint-1900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a97f86040ef74c987109342a3078b56769fa70b7735a911f2cdf1c4f4e7da4c0 +size 1064 diff --git a/checkpoint-1900/special_tokens_map.json b/checkpoint-1900/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-1900/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1900/tokenizer.model b/checkpoint-1900/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-1900/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-1900/tokenizer_config.json b/checkpoint-1900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-1900/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-1900/trainer_state.json b/checkpoint-1900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..befd3ff8d0106b1efc372047c151703931646b52 --- /dev/null +++ b/checkpoint-1900/trainer_state.json @@ -0,0 +1,1503 @@ +{ + "best_metric": 1.4685039520263672, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-1900", + "epoch": 3.3777777777777778, + "eval_steps": 100, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + }, + { + "epoch": 2.86, + "grad_norm": 13.811309814453125, + "learning_rate": 4.054037404994516e-05, + "loss": 1.4839, + "step": 1610 + }, + { + "epoch": 2.88, + "grad_norm": 5.560975074768066, + "learning_rate": 4.043066199242762e-05, + "loss": 1.4765, + "step": 1620 + }, + { + "epoch": 2.9, + "grad_norm": 35.27302551269531, + "learning_rate": 4.032046775341247e-05, + "loss": 1.4105, + "step": 1630 + }, + { + "epoch": 2.92, + "grad_norm": 4.9896745681762695, + "learning_rate": 4.020979477627907e-05, + "loss": 1.5688, + "step": 1640 + }, + { + "epoch": 2.93, + "grad_norm": 3.5250892639160156, + "learning_rate": 4.0098646519366534e-05, + "loss": 1.4484, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 5.281729698181152, + "learning_rate": 3.998702645586565e-05, + "loss": 1.6017, + "step": 1660 + }, + { + "epoch": 2.97, + "grad_norm": 4.667525768280029, + "learning_rate": 3.9874938073710336e-05, + "loss": 1.5006, + "step": 1670 + }, + { + "epoch": 2.99, + "grad_norm": 4.294438362121582, + "learning_rate": 3.976238487546864e-05, + "loss": 1.4218, + "step": 1680 + }, + { + "epoch": 3.0, + "grad_norm": 4.070734977722168, + "learning_rate": 3.9649370378233365e-05, + "loss": 1.6569, + "step": 1690 + }, + { + "epoch": 3.02, + "grad_norm": 4.640359878540039, + "learning_rate": 3.953589811351204e-05, + "loss": 1.5635, + "step": 1700 + }, + { + "epoch": 3.02, + "eval_loss": 1.478974461555481, + "eval_runtime": 124.6852, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 1700 + }, + { + "epoch": 3.04, + "grad_norm": 4.43009090423584, + "learning_rate": 3.94219716271167e-05, + "loss": 1.3304, + "step": 1710 + }, + { + "epoch": 3.06, + "grad_norm": 4.001712799072266, + "learning_rate": 3.930759447905298e-05, + "loss": 1.3534, + "step": 1720 + }, + { + "epoch": 3.08, + "grad_norm": 4.664085388183594, + "learning_rate": 3.919277024340891e-05, + "loss": 1.368, + "step": 1730 + }, + { + "epoch": 3.09, + "grad_norm": 4.42681360244751, + "learning_rate": 3.907750250824327e-05, + "loss": 1.4164, + "step": 1740 + }, + { + "epoch": 3.11, + "grad_norm": 7.331808567047119, + "learning_rate": 3.8961794875473394e-05, + "loss": 1.4333, + "step": 1750 + }, + { + "epoch": 3.13, + "grad_norm": 5.612239837646484, + "learning_rate": 3.884565096076269e-05, + "loss": 1.5754, + "step": 1760 + }, + { + "epoch": 3.15, + "grad_norm": 5.236481666564941, + "learning_rate": 3.872907439340758e-05, + "loss": 1.4017, + "step": 1770 + }, + { + "epoch": 3.16, + "grad_norm": 4.995403289794922, + "learning_rate": 3.861206881622419e-05, + "loss": 1.5011, + "step": 1780 + }, + { + "epoch": 3.18, + "grad_norm": 41.0167236328125, + "learning_rate": 3.8494637885434396e-05, + "loss": 1.4472, + "step": 1790 + }, + { + "epoch": 3.2, + "grad_norm": 5.136650562286377, + "learning_rate": 3.837678527055168e-05, + "loss": 1.3939, + "step": 1800 + }, + { + "epoch": 3.2, + "eval_loss": 1.4779504537582397, + "eval_runtime": 124.6728, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 1800 + }, + { + "epoch": 3.22, + "grad_norm": 5.178096294403076, + "learning_rate": 3.8258514654266434e-05, + "loss": 1.5265, + "step": 1810 + }, + { + "epoch": 3.24, + "grad_norm": 6.949739456176758, + "learning_rate": 3.813982973233083e-05, + "loss": 1.3674, + "step": 1820 + }, + { + "epoch": 3.25, + "grad_norm": 3.84801983833313, + "learning_rate": 3.802073421344339e-05, + "loss": 1.4305, + "step": 1830 + }, + { + "epoch": 3.27, + "grad_norm": 3.5803613662719727, + "learning_rate": 3.7901231819133105e-05, + "loss": 1.557, + "step": 1840 + }, + { + "epoch": 3.29, + "grad_norm": 3.509099245071411, + "learning_rate": 3.7781326283643085e-05, + "loss": 1.3611, + "step": 1850 + }, + { + "epoch": 3.31, + "grad_norm": 41.185279846191406, + "learning_rate": 3.766102135381393e-05, + "loss": 1.3944, + "step": 1860 + }, + { + "epoch": 3.32, + "grad_norm": 3.797672748565674, + "learning_rate": 3.75403207889666e-05, + "loss": 1.2557, + "step": 1870 + }, + { + "epoch": 3.34, + "grad_norm": 4.237602233886719, + "learning_rate": 3.741922836078499e-05, + "loss": 1.3583, + "step": 1880 + }, + { + "epoch": 3.36, + "grad_norm": 4.5037312507629395, + "learning_rate": 3.729774785319801e-05, + "loss": 1.4619, + "step": 1890 + }, + { + "epoch": 3.38, + "grad_norm": 4.292742729187012, + "learning_rate": 3.717588306226143e-05, + "loss": 1.3986, + "step": 1900 + }, + { + "epoch": 3.38, + "eval_loss": 1.4685039520263672, + "eval_runtime": 124.7126, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.005, + "step": 1900 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.4782532316010906e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1900/training_args.bin b/checkpoint-1900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-1900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0ededaa496b9da879d66b42fb81654306f2ef70 --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f47a66591776a9b27423c2d590268a4c1459646318aa8fd05788e737092ff4e +size 3695848 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..1ea9609212aabd8f64e9eac7492d220ccdb65879 --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76c402a3caca01a40b86e20628c7085372b9f06b9ad905962d0ae3e46afa5b7d +size 7433594 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f117a9ad349640db82853b066f18de7c76959ba5 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f25dae3a1acf89a39570fafa2dbe3168373e743792e659c1c98646a426968d3c +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8c24fcb68a3ab42506f78b055ff7780d68049a44 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0138d0ffd6b4d8f6ded7f39531698ac8be82688a7f3f9e5696c883e0b215f9f2 +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..17a92d70c32a4b91e35394c87947a4938402b90c --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,177 @@ +{ + "best_metric": 1.8619400262832642, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-200", + "epoch": 0.35555555555555557, + "eval_steps": 100, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.5694713402792346e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-2000/README.md b/checkpoint-2000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-2000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-2000/adapter_config.json b/checkpoint-2000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-2000/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2000/adapter_model.safetensors b/checkpoint-2000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c7ee1f8cd2c711f999d6787d9dea5288c280a3a3 --- /dev/null +++ b/checkpoint-2000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49cbee4dd4e55f624520dd6d6ce72e1ebb246fb2cad3777a8fdb8c4dfd08838a +size 3695848 diff --git a/checkpoint-2000/optimizer.pt b/checkpoint-2000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..cdefc4c22f67fe24f9e9d0b494eacf4bca48baab --- /dev/null +++ b/checkpoint-2000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ea2c1be3bc67de972e9e0ad95457b8459b5296753a9c15b9659fc6c6e2acacf +size 7433594 diff --git a/checkpoint-2000/rng_state.pth b/checkpoint-2000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a7c8891a134cb419fd6419b4ddb3f353c4aaa4af --- /dev/null +++ b/checkpoint-2000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cfbca8f6aded8567dbf52663fad88626e9ff81399d30f9b4324b100df00d2e8 +size 14244 diff --git a/checkpoint-2000/scheduler.pt b/checkpoint-2000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..687cb76d309cbe40e66529e32e946c78f8bdb487 --- /dev/null +++ b/checkpoint-2000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfba70f1616c3f7d458e303ed4473e7aa0762778705b8ca6aad7e7472d25014b +size 1064 diff --git a/checkpoint-2000/special_tokens_map.json b/checkpoint-2000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-2000/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2000/tokenizer.model b/checkpoint-2000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-2000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-2000/tokenizer_config.json b/checkpoint-2000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-2000/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-2000/trainer_state.json b/checkpoint-2000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1ca5d0e4bfb57828c3eab9fb172d0877267c8117 --- /dev/null +++ b/checkpoint-2000/trainer_state.json @@ -0,0 +1,1581 @@ +{ + "best_metric": 1.4598569869995117, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-2000", + "epoch": 3.5555555555555554, + "eval_steps": 100, + "global_step": 2000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + }, + { + "epoch": 2.86, + "grad_norm": 13.811309814453125, + "learning_rate": 4.054037404994516e-05, + "loss": 1.4839, + "step": 1610 + }, + { + "epoch": 2.88, + "grad_norm": 5.560975074768066, + "learning_rate": 4.043066199242762e-05, + "loss": 1.4765, + "step": 1620 + }, + { + "epoch": 2.9, + "grad_norm": 35.27302551269531, + "learning_rate": 4.032046775341247e-05, + "loss": 1.4105, + "step": 1630 + }, + { + "epoch": 2.92, + "grad_norm": 4.9896745681762695, + "learning_rate": 4.020979477627907e-05, + "loss": 1.5688, + "step": 1640 + }, + { + "epoch": 2.93, + "grad_norm": 3.5250892639160156, + "learning_rate": 4.0098646519366534e-05, + "loss": 1.4484, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 5.281729698181152, + "learning_rate": 3.998702645586565e-05, + "loss": 1.6017, + "step": 1660 + }, + { + "epoch": 2.97, + "grad_norm": 4.667525768280029, + "learning_rate": 3.9874938073710336e-05, + "loss": 1.5006, + "step": 1670 + }, + { + "epoch": 2.99, + "grad_norm": 4.294438362121582, + "learning_rate": 3.976238487546864e-05, + "loss": 1.4218, + "step": 1680 + }, + { + "epoch": 3.0, + "grad_norm": 4.070734977722168, + "learning_rate": 3.9649370378233365e-05, + "loss": 1.6569, + "step": 1690 + }, + { + "epoch": 3.02, + "grad_norm": 4.640359878540039, + "learning_rate": 3.953589811351204e-05, + "loss": 1.5635, + "step": 1700 + }, + { + "epoch": 3.02, + "eval_loss": 1.478974461555481, + "eval_runtime": 124.6852, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 1700 + }, + { + "epoch": 3.04, + "grad_norm": 4.43009090423584, + "learning_rate": 3.94219716271167e-05, + "loss": 1.3304, + "step": 1710 + }, + { + "epoch": 3.06, + "grad_norm": 4.001712799072266, + "learning_rate": 3.930759447905298e-05, + "loss": 1.3534, + "step": 1720 + }, + { + "epoch": 3.08, + "grad_norm": 4.664085388183594, + "learning_rate": 3.919277024340891e-05, + "loss": 1.368, + "step": 1730 + }, + { + "epoch": 3.09, + "grad_norm": 4.42681360244751, + "learning_rate": 3.907750250824327e-05, + "loss": 1.4164, + "step": 1740 + }, + { + "epoch": 3.11, + "grad_norm": 7.331808567047119, + "learning_rate": 3.8961794875473394e-05, + "loss": 1.4333, + "step": 1750 + }, + { + "epoch": 3.13, + "grad_norm": 5.612239837646484, + "learning_rate": 3.884565096076269e-05, + "loss": 1.5754, + "step": 1760 + }, + { + "epoch": 3.15, + "grad_norm": 5.236481666564941, + "learning_rate": 3.872907439340758e-05, + "loss": 1.4017, + "step": 1770 + }, + { + "epoch": 3.16, + "grad_norm": 4.995403289794922, + "learning_rate": 3.861206881622419e-05, + "loss": 1.5011, + "step": 1780 + }, + { + "epoch": 3.18, + "grad_norm": 41.0167236328125, + "learning_rate": 3.8494637885434396e-05, + "loss": 1.4472, + "step": 1790 + }, + { + "epoch": 3.2, + "grad_norm": 5.136650562286377, + "learning_rate": 3.837678527055168e-05, + "loss": 1.3939, + "step": 1800 + }, + { + "epoch": 3.2, + "eval_loss": 1.4779504537582397, + "eval_runtime": 124.6728, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 1800 + }, + { + "epoch": 3.22, + "grad_norm": 5.178096294403076, + "learning_rate": 3.8258514654266434e-05, + "loss": 1.5265, + "step": 1810 + }, + { + "epoch": 3.24, + "grad_norm": 6.949739456176758, + "learning_rate": 3.813982973233083e-05, + "loss": 1.3674, + "step": 1820 + }, + { + "epoch": 3.25, + "grad_norm": 3.84801983833313, + "learning_rate": 3.802073421344339e-05, + "loss": 1.4305, + "step": 1830 + }, + { + "epoch": 3.27, + "grad_norm": 3.5803613662719727, + "learning_rate": 3.7901231819133105e-05, + "loss": 1.557, + "step": 1840 + }, + { + "epoch": 3.29, + "grad_norm": 3.509099245071411, + "learning_rate": 3.7781326283643085e-05, + "loss": 1.3611, + "step": 1850 + }, + { + "epoch": 3.31, + "grad_norm": 41.185279846191406, + "learning_rate": 3.766102135381393e-05, + "loss": 1.3944, + "step": 1860 + }, + { + "epoch": 3.32, + "grad_norm": 3.797672748565674, + "learning_rate": 3.75403207889666e-05, + "loss": 1.2557, + "step": 1870 + }, + { + "epoch": 3.34, + "grad_norm": 4.237602233886719, + "learning_rate": 3.741922836078499e-05, + "loss": 1.3583, + "step": 1880 + }, + { + "epoch": 3.36, + "grad_norm": 4.5037312507629395, + "learning_rate": 3.729774785319801e-05, + "loss": 1.4619, + "step": 1890 + }, + { + "epoch": 3.38, + "grad_norm": 4.292742729187012, + "learning_rate": 3.717588306226143e-05, + "loss": 1.3986, + "step": 1900 + }, + { + "epoch": 3.38, + "eval_loss": 1.4685039520263672, + "eval_runtime": 124.7126, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.005, + "step": 1900 + }, + { + "epoch": 3.4, + "grad_norm": 5.198277950286865, + "learning_rate": 3.705363779603917e-05, + "loss": 1.349, + "step": 1910 + }, + { + "epoch": 3.41, + "grad_norm": 3.86722469329834, + "learning_rate": 3.693101587448436e-05, + "loss": 1.5053, + "step": 1920 + }, + { + "epoch": 3.43, + "grad_norm": 8.68099594116211, + "learning_rate": 3.680802112931996e-05, + "loss": 1.3899, + "step": 1930 + }, + { + "epoch": 3.45, + "grad_norm": 3.8347198963165283, + "learning_rate": 3.6684657403919005e-05, + "loss": 1.4519, + "step": 1940 + }, + { + "epoch": 3.47, + "grad_norm": 9.875212669372559, + "learning_rate": 3.6560928553184554e-05, + "loss": 1.4788, + "step": 1950 + }, + { + "epoch": 3.48, + "grad_norm": 8.638535499572754, + "learning_rate": 3.6436838443429175e-05, + "loss": 1.3777, + "step": 1960 + }, + { + "epoch": 3.5, + "grad_norm": 3.73545503616333, + "learning_rate": 3.631239095225417e-05, + "loss": 1.4962, + "step": 1970 + }, + { + "epoch": 3.52, + "grad_norm": 8.485962867736816, + "learning_rate": 3.618758996842839e-05, + "loss": 1.4377, + "step": 1980 + }, + { + "epoch": 3.54, + "grad_norm": 4.3264055252075195, + "learning_rate": 3.60624393917667e-05, + "loss": 1.5695, + "step": 1990 + }, + { + "epoch": 3.56, + "grad_norm": 3.979128837585449, + "learning_rate": 3.5936943133008183e-05, + "loss": 1.2959, + "step": 2000 + }, + { + "epoch": 3.56, + "eval_loss": 1.4598569869995117, + "eval_runtime": 124.8036, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 2000 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.558141521683153e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2000/training_args.bin b/checkpoint-2000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-2000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-2100/README.md b/checkpoint-2100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-2100/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-2100/adapter_config.json b/checkpoint-2100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-2100/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2100/adapter_model.safetensors b/checkpoint-2100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..28d39b2250045331a552389758d776972565a55a --- /dev/null +++ b/checkpoint-2100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b004da8334249609efdffe2262b682ded1a8d2becb9c74047796455c2409ff6 +size 3695848 diff --git a/checkpoint-2100/optimizer.pt b/checkpoint-2100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f684714ec16dee5ed3d89c71791ebb385784330d --- /dev/null +++ b/checkpoint-2100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df4df76ca3b66c695125bee6844320ea7ff57edeab757c5c7f29352c9c04494e +size 7433594 diff --git a/checkpoint-2100/rng_state.pth b/checkpoint-2100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..281716a99e271d14df76be4655ee8ca5c529b42b --- /dev/null +++ b/checkpoint-2100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03f086cae8ff126f7717bbbbd3e116e9f2b7d1d450451adf89edaa599c3499bd +size 14244 diff --git a/checkpoint-2100/scheduler.pt b/checkpoint-2100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c09c3d669471aedd2a184337b878e09b413021a3 --- /dev/null +++ b/checkpoint-2100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4302be746dbdc6f036e55f2f42bca31244286c53ab241fbecfbcb971d62c9a2c +size 1064 diff --git a/checkpoint-2100/special_tokens_map.json b/checkpoint-2100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-2100/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2100/tokenizer.model b/checkpoint-2100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-2100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-2100/tokenizer_config.json b/checkpoint-2100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-2100/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-2100/trainer_state.json b/checkpoint-2100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d02884e6e215971e9b2ab41a898df8230e3ea831 --- /dev/null +++ b/checkpoint-2100/trainer_state.json @@ -0,0 +1,1659 @@ +{ + "best_metric": 1.4545879364013672, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-2100", + "epoch": 3.7333333333333334, + "eval_steps": 100, + "global_step": 2100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + }, + { + "epoch": 2.86, + "grad_norm": 13.811309814453125, + "learning_rate": 4.054037404994516e-05, + "loss": 1.4839, + "step": 1610 + }, + { + "epoch": 2.88, + "grad_norm": 5.560975074768066, + "learning_rate": 4.043066199242762e-05, + "loss": 1.4765, + "step": 1620 + }, + { + "epoch": 2.9, + "grad_norm": 35.27302551269531, + "learning_rate": 4.032046775341247e-05, + "loss": 1.4105, + "step": 1630 + }, + { + "epoch": 2.92, + "grad_norm": 4.9896745681762695, + "learning_rate": 4.020979477627907e-05, + "loss": 1.5688, + "step": 1640 + }, + { + "epoch": 2.93, + "grad_norm": 3.5250892639160156, + "learning_rate": 4.0098646519366534e-05, + "loss": 1.4484, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 5.281729698181152, + "learning_rate": 3.998702645586565e-05, + "loss": 1.6017, + "step": 1660 + }, + { + "epoch": 2.97, + "grad_norm": 4.667525768280029, + "learning_rate": 3.9874938073710336e-05, + "loss": 1.5006, + "step": 1670 + }, + { + "epoch": 2.99, + "grad_norm": 4.294438362121582, + "learning_rate": 3.976238487546864e-05, + "loss": 1.4218, + "step": 1680 + }, + { + "epoch": 3.0, + "grad_norm": 4.070734977722168, + "learning_rate": 3.9649370378233365e-05, + "loss": 1.6569, + "step": 1690 + }, + { + "epoch": 3.02, + "grad_norm": 4.640359878540039, + "learning_rate": 3.953589811351204e-05, + "loss": 1.5635, + "step": 1700 + }, + { + "epoch": 3.02, + "eval_loss": 1.478974461555481, + "eval_runtime": 124.6852, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 1700 + }, + { + "epoch": 3.04, + "grad_norm": 4.43009090423584, + "learning_rate": 3.94219716271167e-05, + "loss": 1.3304, + "step": 1710 + }, + { + "epoch": 3.06, + "grad_norm": 4.001712799072266, + "learning_rate": 3.930759447905298e-05, + "loss": 1.3534, + "step": 1720 + }, + { + "epoch": 3.08, + "grad_norm": 4.664085388183594, + "learning_rate": 3.919277024340891e-05, + "loss": 1.368, + "step": 1730 + }, + { + "epoch": 3.09, + "grad_norm": 4.42681360244751, + "learning_rate": 3.907750250824327e-05, + "loss": 1.4164, + "step": 1740 + }, + { + "epoch": 3.11, + "grad_norm": 7.331808567047119, + "learning_rate": 3.8961794875473394e-05, + "loss": 1.4333, + "step": 1750 + }, + { + "epoch": 3.13, + "grad_norm": 5.612239837646484, + "learning_rate": 3.884565096076269e-05, + "loss": 1.5754, + "step": 1760 + }, + { + "epoch": 3.15, + "grad_norm": 5.236481666564941, + "learning_rate": 3.872907439340758e-05, + "loss": 1.4017, + "step": 1770 + }, + { + "epoch": 3.16, + "grad_norm": 4.995403289794922, + "learning_rate": 3.861206881622419e-05, + "loss": 1.5011, + "step": 1780 + }, + { + "epoch": 3.18, + "grad_norm": 41.0167236328125, + "learning_rate": 3.8494637885434396e-05, + "loss": 1.4472, + "step": 1790 + }, + { + "epoch": 3.2, + "grad_norm": 5.136650562286377, + "learning_rate": 3.837678527055168e-05, + "loss": 1.3939, + "step": 1800 + }, + { + "epoch": 3.2, + "eval_loss": 1.4779504537582397, + "eval_runtime": 124.6728, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 1800 + }, + { + "epoch": 3.22, + "grad_norm": 5.178096294403076, + "learning_rate": 3.8258514654266434e-05, + "loss": 1.5265, + "step": 1810 + }, + { + "epoch": 3.24, + "grad_norm": 6.949739456176758, + "learning_rate": 3.813982973233083e-05, + "loss": 1.3674, + "step": 1820 + }, + { + "epoch": 3.25, + "grad_norm": 3.84801983833313, + "learning_rate": 3.802073421344339e-05, + "loss": 1.4305, + "step": 1830 + }, + { + "epoch": 3.27, + "grad_norm": 3.5803613662719727, + "learning_rate": 3.7901231819133105e-05, + "loss": 1.557, + "step": 1840 + }, + { + "epoch": 3.29, + "grad_norm": 3.509099245071411, + "learning_rate": 3.7781326283643085e-05, + "loss": 1.3611, + "step": 1850 + }, + { + "epoch": 3.31, + "grad_norm": 41.185279846191406, + "learning_rate": 3.766102135381393e-05, + "loss": 1.3944, + "step": 1860 + }, + { + "epoch": 3.32, + "grad_norm": 3.797672748565674, + "learning_rate": 3.75403207889666e-05, + "loss": 1.2557, + "step": 1870 + }, + { + "epoch": 3.34, + "grad_norm": 4.237602233886719, + "learning_rate": 3.741922836078499e-05, + "loss": 1.3583, + "step": 1880 + }, + { + "epoch": 3.36, + "grad_norm": 4.5037312507629395, + "learning_rate": 3.729774785319801e-05, + "loss": 1.4619, + "step": 1890 + }, + { + "epoch": 3.38, + "grad_norm": 4.292742729187012, + "learning_rate": 3.717588306226143e-05, + "loss": 1.3986, + "step": 1900 + }, + { + "epoch": 3.38, + "eval_loss": 1.4685039520263672, + "eval_runtime": 124.7126, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.005, + "step": 1900 + }, + { + "epoch": 3.4, + "grad_norm": 5.198277950286865, + "learning_rate": 3.705363779603917e-05, + "loss": 1.349, + "step": 1910 + }, + { + "epoch": 3.41, + "grad_norm": 3.86722469329834, + "learning_rate": 3.693101587448436e-05, + "loss": 1.5053, + "step": 1920 + }, + { + "epoch": 3.43, + "grad_norm": 8.68099594116211, + "learning_rate": 3.680802112931996e-05, + "loss": 1.3899, + "step": 1930 + }, + { + "epoch": 3.45, + "grad_norm": 3.8347198963165283, + "learning_rate": 3.6684657403919005e-05, + "loss": 1.4519, + "step": 1940 + }, + { + "epoch": 3.47, + "grad_norm": 9.875212669372559, + "learning_rate": 3.6560928553184554e-05, + "loss": 1.4788, + "step": 1950 + }, + { + "epoch": 3.48, + "grad_norm": 8.638535499572754, + "learning_rate": 3.6436838443429175e-05, + "loss": 1.3777, + "step": 1960 + }, + { + "epoch": 3.5, + "grad_norm": 3.73545503616333, + "learning_rate": 3.631239095225417e-05, + "loss": 1.4962, + "step": 1970 + }, + { + "epoch": 3.52, + "grad_norm": 8.485962867736816, + "learning_rate": 3.618758996842839e-05, + "loss": 1.4377, + "step": 1980 + }, + { + "epoch": 3.54, + "grad_norm": 4.3264055252075195, + "learning_rate": 3.60624393917667e-05, + "loss": 1.5695, + "step": 1990 + }, + { + "epoch": 3.56, + "grad_norm": 3.979128837585449, + "learning_rate": 3.5936943133008183e-05, + "loss": 1.2959, + "step": 2000 + }, + { + "epoch": 3.56, + "eval_loss": 1.4598569869995117, + "eval_runtime": 124.8036, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 2000 + }, + { + "epoch": 3.57, + "grad_norm": 7.0864338874816895, + "learning_rate": 3.581110511369384e-05, + "loss": 1.4301, + "step": 2010 + }, + { + "epoch": 3.59, + "grad_norm": 4.597893714904785, + "learning_rate": 3.568492926604412e-05, + "loss": 1.2962, + "step": 2020 + }, + { + "epoch": 3.61, + "grad_norm": 3.7413573265075684, + "learning_rate": 3.555841953283603e-05, + "loss": 1.3708, + "step": 2030 + }, + { + "epoch": 3.63, + "grad_norm": 4.206364631652832, + "learning_rate": 3.5431579867279905e-05, + "loss": 1.4758, + "step": 2040 + }, + { + "epoch": 3.64, + "grad_norm": 5.203850269317627, + "learning_rate": 3.530441423289591e-05, + "loss": 1.4563, + "step": 2050 + }, + { + "epoch": 3.66, + "grad_norm": 3.0671565532684326, + "learning_rate": 3.517692660339018e-05, + "loss": 1.322, + "step": 2060 + }, + { + "epoch": 3.68, + "grad_norm": 4.655951499938965, + "learning_rate": 3.504912096253061e-05, + "loss": 1.4868, + "step": 2070 + }, + { + "epoch": 3.7, + "grad_norm": 3.5286195278167725, + "learning_rate": 3.492100130402242e-05, + "loss": 1.3379, + "step": 2080 + }, + { + "epoch": 3.72, + "grad_norm": 3.526078701019287, + "learning_rate": 3.479257163138334e-05, + "loss": 1.4864, + "step": 2090 + }, + { + "epoch": 3.73, + "grad_norm": 4.072696208953857, + "learning_rate": 3.4663835957818515e-05, + "loss": 1.4187, + "step": 2100 + }, + { + "epoch": 3.73, + "eval_loss": 1.4545879364013672, + "eval_runtime": 124.6694, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 2100 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.6370752708179395e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2100/training_args.bin b/checkpoint-2100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-2100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-2200/README.md b/checkpoint-2200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-2200/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-2200/adapter_config.json b/checkpoint-2200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-2200/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-2200/adapter_model.safetensors b/checkpoint-2200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8651fe97ee81a004df96ae95db9ce4d24986ab7a --- /dev/null +++ b/checkpoint-2200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61c0821969cba83fb4ddfe3e06c4f9b790b26ed38f568a4fdcb10d728880fa89 +size 3695848 diff --git a/checkpoint-2200/optimizer.pt b/checkpoint-2200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..43804723f02033ac47be532983b43b3b590c35c9 --- /dev/null +++ b/checkpoint-2200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11261f4953e1e480fe038bc9455756601f7957d51e38997af816f1e735e46cb4 +size 7433594 diff --git a/checkpoint-2200/rng_state.pth b/checkpoint-2200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..8dfec0828a7402797682cd12944ac94c10f781d0 --- /dev/null +++ b/checkpoint-2200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1b45396351f9be4812299398594de84d4ac97cdb52933bc1794e751252fcab +size 14244 diff --git a/checkpoint-2200/scheduler.pt b/checkpoint-2200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..84b3806e5c6261722acd7af77ac5a1b89c846c9f --- /dev/null +++ b/checkpoint-2200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a23d8473e9043fe4710673fa5c72b13b7436a0f6a39c0d85ebd8b538a79f02 +size 1064 diff --git a/checkpoint-2200/special_tokens_map.json b/checkpoint-2200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-2200/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-2200/tokenizer.model b/checkpoint-2200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-2200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-2200/tokenizer_config.json b/checkpoint-2200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-2200/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-2200/trainer_state.json b/checkpoint-2200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..025d8f96a4bb52378ab725344d2fee8874935148 --- /dev/null +++ b/checkpoint-2200/trainer_state.json @@ -0,0 +1,1737 @@ +{ + "best_metric": 1.4545879364013672, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-2100", + "epoch": 3.911111111111111, + "eval_steps": 100, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + }, + { + "epoch": 1.62, + "grad_norm": 4.806954383850098, + "learning_rate": 4.683455682293863e-05, + "loss": 1.6824, + "step": 910 + }, + { + "epoch": 1.64, + "grad_norm": 5.980200290679932, + "learning_rate": 4.676615257088776e-05, + "loss": 1.5989, + "step": 920 + }, + { + "epoch": 1.65, + "grad_norm": 4.3645429611206055, + "learning_rate": 4.6697068164216896e-05, + "loss": 1.6469, + "step": 930 + }, + { + "epoch": 1.67, + "grad_norm": 3.2400012016296387, + "learning_rate": 4.662730576169423e-05, + "loss": 1.568, + "step": 940 + }, + { + "epoch": 1.69, + "grad_norm": 4.331827640533447, + "learning_rate": 4.6556867543274184e-05, + "loss": 1.5236, + "step": 950 + }, + { + "epoch": 1.71, + "grad_norm": 3.3798201084136963, + "learning_rate": 4.6485755710029256e-05, + "loss": 1.5046, + "step": 960 + }, + { + "epoch": 1.72, + "grad_norm": 5.440864086151123, + "learning_rate": 4.6413972484081216e-05, + "loss": 1.5816, + "step": 970 + }, + { + "epoch": 1.74, + "grad_norm": 5.852995872497559, + "learning_rate": 4.6341520108531746e-05, + "loss": 1.4193, + "step": 980 + }, + { + "epoch": 1.76, + "grad_norm": 4.2782206535339355, + "learning_rate": 4.626840084739224e-05, + "loss": 1.5457, + "step": 990 + }, + { + "epoch": 1.78, + "grad_norm": 8.631403923034668, + "learning_rate": 4.619461698551315e-05, + "loss": 1.652, + "step": 1000 + }, + { + "epoch": 1.78, + "eval_loss": 1.5386379957199097, + "eval_runtime": 124.8384, + "eval_samples_per_second": 8.01, + "eval_steps_per_second": 2.003, + "step": 1000 + }, + { + "epoch": 1.8, + "grad_norm": 4.581122875213623, + "learning_rate": 4.612017082851253e-05, + "loss": 1.5746, + "step": 1010 + }, + { + "epoch": 1.81, + "grad_norm": 3.0373165607452393, + "learning_rate": 4.604506470270403e-05, + "loss": 1.6038, + "step": 1020 + }, + { + "epoch": 1.83, + "grad_norm": 3.5066914558410645, + "learning_rate": 4.5969300955024167e-05, + "loss": 1.5725, + "step": 1030 + }, + { + "epoch": 1.85, + "grad_norm": 4.402235507965088, + "learning_rate": 4.589288195295901e-05, + "loss": 1.5469, + "step": 1040 + }, + { + "epoch": 1.87, + "grad_norm": 4.844370365142822, + "learning_rate": 4.58158100844702e-05, + "loss": 1.5424, + "step": 1050 + }, + { + "epoch": 1.88, + "grad_norm": 4.146657943725586, + "learning_rate": 4.573808775792033e-05, + "loss": 1.4878, + "step": 1060 + }, + { + "epoch": 1.9, + "grad_norm": 3.210528612136841, + "learning_rate": 4.5659717401997655e-05, + "loss": 1.6077, + "step": 1070 + }, + { + "epoch": 1.92, + "grad_norm": 5.2232818603515625, + "learning_rate": 4.5580701465640254e-05, + "loss": 1.4824, + "step": 1080 + }, + { + "epoch": 1.94, + "grad_norm": 2.8741068840026855, + "learning_rate": 4.550104241795946e-05, + "loss": 1.6172, + "step": 1090 + }, + { + "epoch": 1.96, + "grad_norm": 8.092519760131836, + "learning_rate": 4.5420742748162734e-05, + "loss": 1.3659, + "step": 1100 + }, + { + "epoch": 1.96, + "eval_loss": 1.5198711156845093, + "eval_runtime": 124.8546, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1100 + }, + { + "epoch": 1.97, + "grad_norm": 5.068336009979248, + "learning_rate": 4.5339804965475875e-05, + "loss": 1.4661, + "step": 1110 + }, + { + "epoch": 1.99, + "grad_norm": 13.167552947998047, + "learning_rate": 4.525823159906459e-05, + "loss": 1.411, + "step": 1120 + }, + { + "epoch": 2.01, + "grad_norm": 4.712369918823242, + "learning_rate": 4.5176025197955494e-05, + "loss": 1.3309, + "step": 1130 + }, + { + "epoch": 2.03, + "grad_norm": 7.261610507965088, + "learning_rate": 4.509318833095642e-05, + "loss": 1.3892, + "step": 1140 + }, + { + "epoch": 2.04, + "grad_norm": 3.8006956577301025, + "learning_rate": 4.500972358657618e-05, + "loss": 1.3927, + "step": 1150 + }, + { + "epoch": 2.06, + "grad_norm": 3.6301958560943604, + "learning_rate": 4.492563357294369e-05, + "loss": 1.4629, + "step": 1160 + }, + { + "epoch": 2.08, + "grad_norm": 4.353027820587158, + "learning_rate": 4.4840920917726426e-05, + "loss": 1.352, + "step": 1170 + }, + { + "epoch": 2.1, + "grad_norm": 3.375173807144165, + "learning_rate": 4.475558826804833e-05, + "loss": 1.4096, + "step": 1180 + }, + { + "epoch": 2.12, + "grad_norm": 6.289668560028076, + "learning_rate": 4.466963829040712e-05, + "loss": 1.4834, + "step": 1190 + }, + { + "epoch": 2.13, + "grad_norm": 4.517002582550049, + "learning_rate": 4.458307367059092e-05, + "loss": 1.4746, + "step": 1200 + }, + { + "epoch": 2.13, + "eval_loss": 1.5145190954208374, + "eval_runtime": 124.8898, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 1200 + }, + { + "epoch": 2.15, + "grad_norm": 3.195769786834717, + "learning_rate": 4.449589711359438e-05, + "loss": 1.4149, + "step": 1210 + }, + { + "epoch": 2.17, + "grad_norm": 3.751405715942383, + "learning_rate": 4.440811134353412e-05, + "loss": 1.5501, + "step": 1220 + }, + { + "epoch": 2.19, + "grad_norm": 4.148709774017334, + "learning_rate": 4.431971910356363e-05, + "loss": 1.5253, + "step": 1230 + }, + { + "epoch": 2.2, + "grad_norm": 20.003253936767578, + "learning_rate": 4.42307231557875e-05, + "loss": 1.6413, + "step": 1240 + }, + { + "epoch": 2.22, + "grad_norm": 4.721023082733154, + "learning_rate": 4.414112628117517e-05, + "loss": 1.5608, + "step": 1250 + }, + { + "epoch": 2.24, + "grad_norm": 4.672358989715576, + "learning_rate": 4.4050931279474015e-05, + "loss": 1.3646, + "step": 1260 + }, + { + "epoch": 2.26, + "grad_norm": 4.073034286499023, + "learning_rate": 4.396014096912182e-05, + "loss": 1.3499, + "step": 1270 + }, + { + "epoch": 2.28, + "grad_norm": 3.2312991619110107, + "learning_rate": 4.386875818715874e-05, + "loss": 1.4648, + "step": 1280 + }, + { + "epoch": 2.29, + "grad_norm": 18.92267417907715, + "learning_rate": 4.3776785789138675e-05, + "loss": 1.4919, + "step": 1290 + }, + { + "epoch": 2.31, + "grad_norm": 5.677367687225342, + "learning_rate": 4.368422664903997e-05, + "loss": 1.2891, + "step": 1300 + }, + { + "epoch": 2.31, + "eval_loss": 1.504623532295227, + "eval_runtime": 124.8541, + "eval_samples_per_second": 8.009, + "eval_steps_per_second": 2.002, + "step": 1300 + }, + { + "epoch": 2.33, + "grad_norm": 5.031940460205078, + "learning_rate": 4.359108365917565e-05, + "loss": 1.4939, + "step": 1310 + }, + { + "epoch": 2.35, + "grad_norm": 7.701929092407227, + "learning_rate": 4.349735973010305e-05, + "loss": 1.28, + "step": 1320 + }, + { + "epoch": 2.36, + "grad_norm": 5.7498040199279785, + "learning_rate": 4.3403057790532855e-05, + "loss": 1.4584, + "step": 1330 + }, + { + "epoch": 2.38, + "grad_norm": 8.7277193069458, + "learning_rate": 4.330818078723755e-05, + "loss": 1.5871, + "step": 1340 + }, + { + "epoch": 2.4, + "grad_norm": 13.915125846862793, + "learning_rate": 4.32127316849594e-05, + "loss": 1.3794, + "step": 1350 + }, + { + "epoch": 2.42, + "grad_norm": 2.949733018875122, + "learning_rate": 4.311671346631774e-05, + "loss": 1.3543, + "step": 1360 + }, + { + "epoch": 2.44, + "grad_norm": 5.377658843994141, + "learning_rate": 4.302012913171584e-05, + "loss": 1.3695, + "step": 1370 + }, + { + "epoch": 2.45, + "grad_norm": 16.94107437133789, + "learning_rate": 4.292298169924709e-05, + "loss": 1.5168, + "step": 1380 + }, + { + "epoch": 2.47, + "grad_norm": 4.190367221832275, + "learning_rate": 4.282527420460072e-05, + "loss": 1.4058, + "step": 1390 + }, + { + "epoch": 2.49, + "grad_norm": 9.269573211669922, + "learning_rate": 4.272700970096696e-05, + "loss": 1.5794, + "step": 1400 + }, + { + "epoch": 2.49, + "eval_loss": 1.498180627822876, + "eval_runtime": 124.7222, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.004, + "step": 1400 + }, + { + "epoch": 2.51, + "grad_norm": 3.951293468475342, + "learning_rate": 4.262819125894156e-05, + "loss": 1.56, + "step": 1410 + }, + { + "epoch": 2.52, + "grad_norm": 3.8725697994232178, + "learning_rate": 4.252882196642992e-05, + "loss": 1.5159, + "step": 1420 + }, + { + "epoch": 2.54, + "grad_norm": 3.898501396179199, + "learning_rate": 4.242890492855056e-05, + "loss": 1.4659, + "step": 1430 + }, + { + "epoch": 2.56, + "grad_norm": 5.807662487030029, + "learning_rate": 4.23284432675381e-05, + "loss": 1.5736, + "step": 1440 + }, + { + "epoch": 2.58, + "grad_norm": 3.529371500015259, + "learning_rate": 4.222744012264566e-05, + "loss": 1.5011, + "step": 1450 + }, + { + "epoch": 2.6, + "grad_norm": 6.336548805236816, + "learning_rate": 4.212589865004684e-05, + "loss": 1.6629, + "step": 1460 + }, + { + "epoch": 2.61, + "grad_norm": 6.222330093383789, + "learning_rate": 4.2023822022737016e-05, + "loss": 1.5573, + "step": 1470 + }, + { + "epoch": 2.63, + "grad_norm": 4.25172233581543, + "learning_rate": 4.192121343043424e-05, + "loss": 1.3817, + "step": 1480 + }, + { + "epoch": 2.65, + "grad_norm": 4.487111568450928, + "learning_rate": 4.181807607947954e-05, + "loss": 1.5323, + "step": 1490 + }, + { + "epoch": 2.67, + "grad_norm": 4.656155109405518, + "learning_rate": 4.1714413192736754e-05, + "loss": 1.3678, + "step": 1500 + }, + { + "epoch": 2.67, + "eval_loss": 1.5049968957901, + "eval_runtime": 124.7803, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.004, + "step": 1500 + }, + { + "epoch": 2.68, + "grad_norm": 4.431355953216553, + "learning_rate": 4.161022800949177e-05, + "loss": 1.486, + "step": 1510 + }, + { + "epoch": 2.7, + "grad_norm": 18.211524963378906, + "learning_rate": 4.150552378535137e-05, + "loss": 1.4498, + "step": 1520 + }, + { + "epoch": 2.72, + "grad_norm": 5.3755292892456055, + "learning_rate": 4.140030379214147e-05, + "loss": 1.4421, + "step": 1530 + }, + { + "epoch": 2.74, + "grad_norm": 6.626212120056152, + "learning_rate": 4.1294571317804854e-05, + "loss": 1.4322, + "step": 1540 + }, + { + "epoch": 2.76, + "grad_norm": 4.030793190002441, + "learning_rate": 4.1188329666298464e-05, + "loss": 1.3433, + "step": 1550 + }, + { + "epoch": 2.77, + "grad_norm": 6.53309440612793, + "learning_rate": 4.108158215749014e-05, + "loss": 1.5604, + "step": 1560 + }, + { + "epoch": 2.79, + "grad_norm": 3.76047420501709, + "learning_rate": 4.0974332127054914e-05, + "loss": 1.3259, + "step": 1570 + }, + { + "epoch": 2.81, + "grad_norm": 4.58742094039917, + "learning_rate": 4.0866582926370725e-05, + "loss": 1.4228, + "step": 1580 + }, + { + "epoch": 2.83, + "grad_norm": 4.566816806793213, + "learning_rate": 4.0758337922413716e-05, + "loss": 1.3013, + "step": 1590 + }, + { + "epoch": 2.84, + "grad_norm": 6.218478202819824, + "learning_rate": 4.064960049765304e-05, + "loss": 1.5061, + "step": 1600 + }, + { + "epoch": 2.84, + "eval_loss": 1.4853577613830566, + "eval_runtime": 124.7889, + "eval_samples_per_second": 8.014, + "eval_steps_per_second": 2.003, + "step": 1600 + }, + { + "epoch": 2.86, + "grad_norm": 13.811309814453125, + "learning_rate": 4.054037404994516e-05, + "loss": 1.4839, + "step": 1610 + }, + { + "epoch": 2.88, + "grad_norm": 5.560975074768066, + "learning_rate": 4.043066199242762e-05, + "loss": 1.4765, + "step": 1620 + }, + { + "epoch": 2.9, + "grad_norm": 35.27302551269531, + "learning_rate": 4.032046775341247e-05, + "loss": 1.4105, + "step": 1630 + }, + { + "epoch": 2.92, + "grad_norm": 4.9896745681762695, + "learning_rate": 4.020979477627907e-05, + "loss": 1.5688, + "step": 1640 + }, + { + "epoch": 2.93, + "grad_norm": 3.5250892639160156, + "learning_rate": 4.0098646519366534e-05, + "loss": 1.4484, + "step": 1650 + }, + { + "epoch": 2.95, + "grad_norm": 5.281729698181152, + "learning_rate": 3.998702645586565e-05, + "loss": 1.6017, + "step": 1660 + }, + { + "epoch": 2.97, + "grad_norm": 4.667525768280029, + "learning_rate": 3.9874938073710336e-05, + "loss": 1.5006, + "step": 1670 + }, + { + "epoch": 2.99, + "grad_norm": 4.294438362121582, + "learning_rate": 3.976238487546864e-05, + "loss": 1.4218, + "step": 1680 + }, + { + "epoch": 3.0, + "grad_norm": 4.070734977722168, + "learning_rate": 3.9649370378233365e-05, + "loss": 1.6569, + "step": 1690 + }, + { + "epoch": 3.02, + "grad_norm": 4.640359878540039, + "learning_rate": 3.953589811351204e-05, + "loss": 1.5635, + "step": 1700 + }, + { + "epoch": 3.02, + "eval_loss": 1.478974461555481, + "eval_runtime": 124.6852, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 1700 + }, + { + "epoch": 3.04, + "grad_norm": 4.43009090423584, + "learning_rate": 3.94219716271167e-05, + "loss": 1.3304, + "step": 1710 + }, + { + "epoch": 3.06, + "grad_norm": 4.001712799072266, + "learning_rate": 3.930759447905298e-05, + "loss": 1.3534, + "step": 1720 + }, + { + "epoch": 3.08, + "grad_norm": 4.664085388183594, + "learning_rate": 3.919277024340891e-05, + "loss": 1.368, + "step": 1730 + }, + { + "epoch": 3.09, + "grad_norm": 4.42681360244751, + "learning_rate": 3.907750250824327e-05, + "loss": 1.4164, + "step": 1740 + }, + { + "epoch": 3.11, + "grad_norm": 7.331808567047119, + "learning_rate": 3.8961794875473394e-05, + "loss": 1.4333, + "step": 1750 + }, + { + "epoch": 3.13, + "grad_norm": 5.612239837646484, + "learning_rate": 3.884565096076269e-05, + "loss": 1.5754, + "step": 1760 + }, + { + "epoch": 3.15, + "grad_norm": 5.236481666564941, + "learning_rate": 3.872907439340758e-05, + "loss": 1.4017, + "step": 1770 + }, + { + "epoch": 3.16, + "grad_norm": 4.995403289794922, + "learning_rate": 3.861206881622419e-05, + "loss": 1.5011, + "step": 1780 + }, + { + "epoch": 3.18, + "grad_norm": 41.0167236328125, + "learning_rate": 3.8494637885434396e-05, + "loss": 1.4472, + "step": 1790 + }, + { + "epoch": 3.2, + "grad_norm": 5.136650562286377, + "learning_rate": 3.837678527055168e-05, + "loss": 1.3939, + "step": 1800 + }, + { + "epoch": 3.2, + "eval_loss": 1.4779504537582397, + "eval_runtime": 124.6728, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 1800 + }, + { + "epoch": 3.22, + "grad_norm": 5.178096294403076, + "learning_rate": 3.8258514654266434e-05, + "loss": 1.5265, + "step": 1810 + }, + { + "epoch": 3.24, + "grad_norm": 6.949739456176758, + "learning_rate": 3.813982973233083e-05, + "loss": 1.3674, + "step": 1820 + }, + { + "epoch": 3.25, + "grad_norm": 3.84801983833313, + "learning_rate": 3.802073421344339e-05, + "loss": 1.4305, + "step": 1830 + }, + { + "epoch": 3.27, + "grad_norm": 3.5803613662719727, + "learning_rate": 3.7901231819133105e-05, + "loss": 1.557, + "step": 1840 + }, + { + "epoch": 3.29, + "grad_norm": 3.509099245071411, + "learning_rate": 3.7781326283643085e-05, + "loss": 1.3611, + "step": 1850 + }, + { + "epoch": 3.31, + "grad_norm": 41.185279846191406, + "learning_rate": 3.766102135381393e-05, + "loss": 1.3944, + "step": 1860 + }, + { + "epoch": 3.32, + "grad_norm": 3.797672748565674, + "learning_rate": 3.75403207889666e-05, + "loss": 1.2557, + "step": 1870 + }, + { + "epoch": 3.34, + "grad_norm": 4.237602233886719, + "learning_rate": 3.741922836078499e-05, + "loss": 1.3583, + "step": 1880 + }, + { + "epoch": 3.36, + "grad_norm": 4.5037312507629395, + "learning_rate": 3.729774785319801e-05, + "loss": 1.4619, + "step": 1890 + }, + { + "epoch": 3.38, + "grad_norm": 4.292742729187012, + "learning_rate": 3.717588306226143e-05, + "loss": 1.3986, + "step": 1900 + }, + { + "epoch": 3.38, + "eval_loss": 1.4685039520263672, + "eval_runtime": 124.7126, + "eval_samples_per_second": 8.018, + "eval_steps_per_second": 2.005, + "step": 1900 + }, + { + "epoch": 3.4, + "grad_norm": 5.198277950286865, + "learning_rate": 3.705363779603917e-05, + "loss": 1.349, + "step": 1910 + }, + { + "epoch": 3.41, + "grad_norm": 3.86722469329834, + "learning_rate": 3.693101587448436e-05, + "loss": 1.5053, + "step": 1920 + }, + { + "epoch": 3.43, + "grad_norm": 8.68099594116211, + "learning_rate": 3.680802112931996e-05, + "loss": 1.3899, + "step": 1930 + }, + { + "epoch": 3.45, + "grad_norm": 3.8347198963165283, + "learning_rate": 3.6684657403919005e-05, + "loss": 1.4519, + "step": 1940 + }, + { + "epoch": 3.47, + "grad_norm": 9.875212669372559, + "learning_rate": 3.6560928553184554e-05, + "loss": 1.4788, + "step": 1950 + }, + { + "epoch": 3.48, + "grad_norm": 8.638535499572754, + "learning_rate": 3.6436838443429175e-05, + "loss": 1.3777, + "step": 1960 + }, + { + "epoch": 3.5, + "grad_norm": 3.73545503616333, + "learning_rate": 3.631239095225417e-05, + "loss": 1.4962, + "step": 1970 + }, + { + "epoch": 3.52, + "grad_norm": 8.485962867736816, + "learning_rate": 3.618758996842839e-05, + "loss": 1.4377, + "step": 1980 + }, + { + "epoch": 3.54, + "grad_norm": 4.3264055252075195, + "learning_rate": 3.60624393917667e-05, + "loss": 1.5695, + "step": 1990 + }, + { + "epoch": 3.56, + "grad_norm": 3.979128837585449, + "learning_rate": 3.5936943133008183e-05, + "loss": 1.2959, + "step": 2000 + }, + { + "epoch": 3.56, + "eval_loss": 1.4598569869995117, + "eval_runtime": 124.8036, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 2000 + }, + { + "epoch": 3.57, + "grad_norm": 7.0864338874816895, + "learning_rate": 3.581110511369384e-05, + "loss": 1.4301, + "step": 2010 + }, + { + "epoch": 3.59, + "grad_norm": 4.597893714904785, + "learning_rate": 3.568492926604412e-05, + "loss": 1.2962, + "step": 2020 + }, + { + "epoch": 3.61, + "grad_norm": 3.7413573265075684, + "learning_rate": 3.555841953283603e-05, + "loss": 1.3708, + "step": 2030 + }, + { + "epoch": 3.63, + "grad_norm": 4.206364631652832, + "learning_rate": 3.5431579867279905e-05, + "loss": 1.4758, + "step": 2040 + }, + { + "epoch": 3.64, + "grad_norm": 5.203850269317627, + "learning_rate": 3.530441423289591e-05, + "loss": 1.4563, + "step": 2050 + }, + { + "epoch": 3.66, + "grad_norm": 3.0671565532684326, + "learning_rate": 3.517692660339018e-05, + "loss": 1.322, + "step": 2060 + }, + { + "epoch": 3.68, + "grad_norm": 4.655951499938965, + "learning_rate": 3.504912096253061e-05, + "loss": 1.4868, + "step": 2070 + }, + { + "epoch": 3.7, + "grad_norm": 3.5286195278167725, + "learning_rate": 3.492100130402242e-05, + "loss": 1.3379, + "step": 2080 + }, + { + "epoch": 3.72, + "grad_norm": 3.526078701019287, + "learning_rate": 3.479257163138334e-05, + "loss": 1.4864, + "step": 2090 + }, + { + "epoch": 3.73, + "grad_norm": 4.072696208953857, + "learning_rate": 3.4663835957818515e-05, + "loss": 1.4187, + "step": 2100 + }, + { + "epoch": 3.73, + "eval_loss": 1.4545879364013672, + "eval_runtime": 124.6694, + "eval_samples_per_second": 8.021, + "eval_steps_per_second": 2.005, + "step": 2100 + }, + { + "epoch": 3.75, + "grad_norm": 6.851144313812256, + "learning_rate": 3.453479830609505e-05, + "loss": 1.5079, + "step": 2110 + }, + { + "epoch": 3.77, + "grad_norm": 3.958353042602539, + "learning_rate": 3.440546270841639e-05, + "loss": 1.3578, + "step": 2120 + }, + { + "epoch": 3.79, + "grad_norm": 5.926538944244385, + "learning_rate": 3.427583320629626e-05, + "loss": 1.3127, + "step": 2130 + }, + { + "epoch": 3.8, + "grad_norm": 6.969961643218994, + "learning_rate": 3.414591385043237e-05, + "loss": 1.4254, + "step": 2140 + }, + { + "epoch": 3.82, + "grad_norm": 5.364356994628906, + "learning_rate": 3.401570870057989e-05, + "loss": 1.3755, + "step": 2150 + }, + { + "epoch": 3.84, + "grad_norm": 6.824484825134277, + "learning_rate": 3.3885221825424537e-05, + "loss": 1.544, + "step": 2160 + }, + { + "epoch": 3.86, + "grad_norm": 7.165542125701904, + "learning_rate": 3.375445730245546e-05, + "loss": 1.3792, + "step": 2170 + }, + { + "epoch": 3.88, + "grad_norm": 5.511162757873535, + "learning_rate": 3.362341921783784e-05, + "loss": 1.3733, + "step": 2180 + }, + { + "epoch": 3.89, + "grad_norm": 3.903485059738159, + "learning_rate": 3.349211166628515e-05, + "loss": 1.3905, + "step": 2190 + }, + { + "epoch": 3.91, + "grad_norm": 7.315738677978516, + "learning_rate": 3.336053875093128e-05, + "loss": 1.3279, + "step": 2200 + }, + { + "epoch": 3.91, + "eval_loss": 1.4570436477661133, + "eval_runtime": 124.7476, + "eval_samples_per_second": 8.016, + "eval_steps_per_second": 2.004, + "step": 2200 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 1.714511502611841e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-2200/training_args.bin b/checkpoint-2200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-2200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..1eac524e71150fb686c857b3ecde8b9d60c55a7e --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62a8d28271ab68808ee44f71aa2b4f2dcc08dac87d51374fe444dde09e4ba73 +size 3695848 diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ee8e6e316a63ec5a00bf0bc1a7d81ad59028fef --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b724a509912c69f119c06fa3b99c6bc7e4feaad1af0d525b2d8636ab5c1df84 +size 7433594 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c3cd38d64e5d7c3a674069e9029d353f1f2a9639 --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e91b4ea1ac4122d75cee55ecf1437282eb9805106878d0036e55b683f7e445d7 +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b7435ee8a97a7b70807fd2523a70ed9ce2ecbca --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8175ddae8fb9e4cb48cfee8d71a165eafcac69c750c31f6222ec6ff2ad52eacb +size 1064 diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-300/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-300/tokenizer.model b/checkpoint-300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..290a130b25513b4cb72814b1c2b7e84aedcb8ced --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,255 @@ +{ + "best_metric": 1.7350378036499023, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-300", + "epoch": 0.5333333333333333, + "eval_steps": 100, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 2.3293542877220045e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2bee83805c9cb1bef35d80ca26ca65206d91cdb5 --- /dev/null +++ b/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab271a4b70a5c35c29d5fb695355d2e3b920f7b05098071de97f7ef6d9855a8 +size 3695848 diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..599f07d920d574ace9c503363a3f61f1d1656144 --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be0fc3086d5e2016399f03fcb7c65f3a04e0e302f677254d801505f38c726d52 +size 7433594 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cb825955c7b6f4d1916a925b98dd80e4a9b21daf --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d3cbca4c3fd3552f9312d9c0b0118f85c1d03a64923a22ccb32945c38a1de3 +size 14244 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..657c0ae6e493b810cd9ffb38058b26c3f9e5dbd8 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a8e7f2a2b05604957d96a1a4c87d00ccd9fbbe98833f40440a985146176688b +size 1064 diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-400/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c28baafebe04a60284ae71eb06353f8a2eaadf6b --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,333 @@ +{ + "best_metric": 1.6840696334838867, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-400", + "epoch": 0.7111111111111111, + "eval_steps": 100, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 3.103976541168599e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af01df7433c902621b6eabe1bb9f27ad72728df5 --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a52ba5043422f3484785817facbbc0ae6be05c9bf53fdaca9723495ef5a1535 +size 3695848 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..d4639645ff0f2cbff8c8a76a7c22278b4d4d9565 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b70b4f8532779697ca80bf262658a8b18802dd4f451d5ea1399caccf6545c81 +size 7433594 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..74e05692af8f6b030acb27251818991b0f73c563 --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52731161244c121a3b62356a5e8bf0f8cc2e15fefd2cd720673b8ae854a4ccf5 +size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d067ef917e5fe0dba941a255af54f14e7391263b --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91a57e18fb18db14878b8a155e12ea4b0ffa8b708eab5a5e3b3010da34e109ee +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-500/tokenizer.model b/checkpoint-500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9732139d797460906e649392f8561201d1655895 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,411 @@ +{ + "best_metric": 1.6494970321655273, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-500", + "epoch": 0.8888888888888888, + "eval_steps": 100, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 3.900024219508408e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-600/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model.safetensors b/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6a65599373aefc016cea66fb3a4e85a7aaab2b03 --- /dev/null +++ b/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e3694054c6a2131e54f6caf4b2c0e27ae1a3c2b535cf180bb934e5b32cffaf3 +size 3695848 diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..332c49ff96e7a94dd30c43b70164ba72b95efb73 --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8159c9c508d0ef1f0b6bca3438b8ea65456ce1188997bb18e689b9ee29f17e1d +size 7433594 diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..cace1b1f61a0e508e2d18b1f78140ffcf2c0aa17 --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08175c45b878379b425d20603ff8117b6914cf3fa61640e2e083737ac207a756 +size 14244 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..de80603803404cc8c6a8a259cfa6e7cee7e31963 --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6532af35d88fbf684b13484cb01697ea6ea2a722c45a22fe3bcaa3a376d99541 +size 1064 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..454317f8cdc695a9ab26192b562dced727f19145 --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,489 @@ +{ + "best_metric": 1.6121476888656616, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-600", + "epoch": 1.0666666666666667, + "eval_steps": 100, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 4.721237505878262e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-700/README.md b/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-700/adapter_config.json b/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-700/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-700/adapter_model.safetensors b/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3ef5564f72f99dc5f25f8aba9ac8611d46915e6 --- /dev/null +++ b/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f379a39138a824d1913075128eebd0f886157f78aa4e236d8e844b2c99c87a8 +size 3695848 diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..9de044b9d8067aadfc9def06bba341babf4ac85c --- /dev/null +++ b/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97f5f27581f89051d59d7296a395fe7508f965b9a67d2371530cd2f70b2b3155 +size 7433594 diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..bfcfe87cf506e8b7d471b048c4789bccdf54579c --- /dev/null +++ b/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3374e4382848f81495cebce1074c1d945df4b41896ba0064e25dee533589aa42 +size 14244 diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9c3c37237c39c2c1c406f6fa3fa578bbecf9cea9 --- /dev/null +++ b/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b77ac663dcaaf91a448ae8668c4d5946cd0cae79c85108863b5890d7b4b1a76 +size 1064 diff --git a/checkpoint-700/special_tokens_map.json b/checkpoint-700/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-700/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-700/tokenizer.model b/checkpoint-700/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-700/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-700/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0a57eea159da63de17fe5ba1c24c80c89ddf95af --- /dev/null +++ b/checkpoint-700/trainer_state.json @@ -0,0 +1,567 @@ +{ + "best_metric": 1.5970302820205688, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-700", + "epoch": 1.2444444444444445, + "eval_steps": 100, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 5.4696929182285824e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-800/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model.safetensors b/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..faef3866440d2bc76614eaec75524cf6529adb5f --- /dev/null +++ b/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbb6fe1f205290e90a1b0d201ead217df8f3a6213df58a439af1854bfbcbb390 +size 3695848 diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f85aa4a90efef7f99dd43a73af7c86b5078996d --- /dev/null +++ b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:663daa8b311e642a97e4d5564c1e1bc1fbc95702ab052d6860cdc621ab1d14d1 +size 7433594 diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a26604bb05f029e8c445a2d7b4639f6a2ebd34ed --- /dev/null +++ b/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84a4879b22d336e78e06b3337d056dda27b09f85240912d9f0bb7cd11e3be34 +size 14244 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..3df93f3bb8f515740934b0599e7bf93799506ba9 --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eb6260e0865cd6fd54f15a4c3ca863963507aa707d0b43c4907093999453f25 +size 1064 diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-800/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-800/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b26641d4cd027204ea871b078a84786ac0636d47 --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,645 @@ +{ + "best_metric": 1.5639870166778564, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-800", + "epoch": 1.4222222222222223, + "eval_steps": 100, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 6.264551632302244e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/checkpoint-900/README.md b/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a367c446e3a5f9a585222f3bda62c8b677d2b --- /dev/null +++ b/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: google/gemma-2b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-900/adapter_config.json b/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6acfc8290c87b51d6dc715b6eab7cf151b0652fb --- /dev/null +++ b/checkpoint-900/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "google/gemma-2b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": "unsloth", + "target_modules": [ + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-900/adapter_model.safetensors b/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57d5d4ca9e5cb14f57bb44e11378c52b58860875 --- /dev/null +++ b/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4009e77e0868b3fe29c9eca2dd493a8d2362c0f828115f5cba4a3d1c9853588e +size 3695848 diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6873b372af0dde73829dbe74a750dd286ae6a554 --- /dev/null +++ b/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:facfc39a156152327521e946dbf042307e946af7dcef94e6ae2ec430d3e5fef4 +size 7433594 diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..c3ce56c7eb9ca092520c460db9e87c601284eb72 --- /dev/null +++ b/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92e9a581ca85a141ade05a3dc7e0010c5a1cb23bfe7f026419952e74bb6df42d +size 14244 diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6f42f81266d7af99c1174c3f52e283997fd064e --- /dev/null +++ b/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be9cbba91548afe366b31f98f77877763ec337b14ec80fd4d1b111caad3b7936 +size 1064 diff --git a/checkpoint-900/special_tokens_map.json b/checkpoint-900/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..f6119589e367b2de0fc8cbd2f1217667532e3174 --- /dev/null +++ b/checkpoint-900/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-900/tokenizer.model b/checkpoint-900/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..796efe9ab515c15e146ce7588e6d7b9b8134dbf8 --- /dev/null +++ b/checkpoint-900/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a7b147390c64585d6c3543dd6fc636906c9af3865a5548f27f31aee1d4c8e2 +size 4241003 diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f08ad0abe7fe305103ce90b38e6f11a84d917681 --- /dev/null +++ b/checkpoint-900/tokenizer_config.json @@ -0,0 +1,51 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "3": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": null, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "GemmaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..615e80baee5bb4949e642e242db434802c825c0b --- /dev/null +++ b/checkpoint-900/trainer_state.json @@ -0,0 +1,723 @@ +{ + "best_metric": 1.5639870166778564, + "best_model_checkpoint": "saves/Gemma-2B/lora/train_2024-03-01-04-36-32/checkpoint-800", + "epoch": 1.6, + "eval_steps": 100, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.02, + "grad_norm": 6.6079277992248535, + "learning_rate": 4.999960939662063e-05, + "loss": 3.747, + "step": 10 + }, + { + "epoch": 0.04, + "grad_norm": 3.2283411026000977, + "learning_rate": 4.999843759868819e-05, + "loss": 3.5789, + "step": 20 + }, + { + "epoch": 0.05, + "grad_norm": 41.573001861572266, + "learning_rate": 4.999648464281934e-05, + "loss": 3.1683, + "step": 30 + }, + { + "epoch": 0.07, + "grad_norm": 4.080965518951416, + "learning_rate": 4.9993750590040575e-05, + "loss": 2.8275, + "step": 40 + }, + { + "epoch": 0.09, + "grad_norm": 4.576275825500488, + "learning_rate": 4.999023552578632e-05, + "loss": 2.6758, + "step": 50 + }, + { + "epoch": 0.11, + "grad_norm": 18.012842178344727, + "learning_rate": 4.998593955989626e-05, + "loss": 2.6287, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 5.738934516906738, + "learning_rate": 4.9980862826611875e-05, + "loss": 2.5284, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 3.353776216506958, + "learning_rate": 4.9975005484572305e-05, + "loss": 2.2608, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 4.6298699378967285, + "learning_rate": 4.9968367716809374e-05, + "loss": 2.2475, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 50.594207763671875, + "learning_rate": 4.996094973074183e-05, + "loss": 2.2007, + "step": 100 + }, + { + "epoch": 0.18, + "eval_loss": 2.126384735107422, + "eval_runtime": 124.9221, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 10.225520133972168, + "learning_rate": 4.995275175816891e-05, + "loss": 1.9414, + "step": 110 + }, + { + "epoch": 0.21, + "grad_norm": 4.777626991271973, + "learning_rate": 4.994377405526308e-05, + "loss": 1.9729, + "step": 120 + }, + { + "epoch": 0.23, + "grad_norm": 6.133576393127441, + "learning_rate": 4.993401690256203e-05, + "loss": 2.0237, + "step": 130 + }, + { + "epoch": 0.25, + "grad_norm": 5.396271228790283, + "learning_rate": 4.992348060495989e-05, + "loss": 2.009, + "step": 140 + }, + { + "epoch": 0.27, + "grad_norm": 3.4974453449249268, + "learning_rate": 4.991216549169776e-05, + "loss": 2.032, + "step": 150 + }, + { + "epoch": 0.28, + "grad_norm": 12.256199836730957, + "learning_rate": 4.990007191635334e-05, + "loss": 1.9548, + "step": 160 + }, + { + "epoch": 0.3, + "grad_norm": 7.5634379386901855, + "learning_rate": 4.988720025682995e-05, + "loss": 1.8164, + "step": 170 + }, + { + "epoch": 0.32, + "grad_norm": 14.023727416992188, + "learning_rate": 4.987355091534468e-05, + "loss": 1.8517, + "step": 180 + }, + { + "epoch": 0.34, + "grad_norm": 4.622091293334961, + "learning_rate": 4.985912431841584e-05, + "loss": 2.0255, + "step": 190 + }, + { + "epoch": 0.36, + "grad_norm": 3.9935083389282227, + "learning_rate": 4.9843920916849645e-05, + "loss": 1.8777, + "step": 200 + }, + { + "epoch": 0.36, + "eval_loss": 1.8619400262832642, + "eval_runtime": 124.8712, + "eval_samples_per_second": 8.008, + "eval_steps_per_second": 2.002, + "step": 200 + }, + { + "epoch": 0.37, + "grad_norm": 6.256485939025879, + "learning_rate": 4.982794118572609e-05, + "loss": 1.8885, + "step": 210 + }, + { + "epoch": 0.39, + "grad_norm": 13.212824821472168, + "learning_rate": 4.981118562438414e-05, + "loss": 1.7744, + "step": 220 + }, + { + "epoch": 0.41, + "grad_norm": 4.2626118659973145, + "learning_rate": 4.9793654756406085e-05, + "loss": 1.7545, + "step": 230 + }, + { + "epoch": 0.43, + "grad_norm": 4.217405796051025, + "learning_rate": 4.9775349129601243e-05, + "loss": 1.5633, + "step": 240 + }, + { + "epoch": 0.44, + "grad_norm": 22.393404006958008, + "learning_rate": 4.9756269315988804e-05, + "loss": 1.8871, + "step": 250 + }, + { + "epoch": 0.46, + "grad_norm": 3.6576473712921143, + "learning_rate": 4.973641591177991e-05, + "loss": 1.7037, + "step": 260 + }, + { + "epoch": 0.48, + "grad_norm": 4.2433271408081055, + "learning_rate": 4.971578953735912e-05, + "loss": 1.7631, + "step": 270 + }, + { + "epoch": 0.5, + "grad_norm": 3.7399721145629883, + "learning_rate": 4.969439083726496e-05, + "loss": 1.7714, + "step": 280 + }, + { + "epoch": 0.52, + "grad_norm": 4.575680255889893, + "learning_rate": 4.967222048016979e-05, + "loss": 1.8699, + "step": 290 + }, + { + "epoch": 0.53, + "grad_norm": 7.729683876037598, + "learning_rate": 4.964927915885893e-05, + "loss": 1.6566, + "step": 300 + }, + { + "epoch": 0.53, + "eval_loss": 1.7350378036499023, + "eval_runtime": 124.9278, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 300 + }, + { + "epoch": 0.55, + "grad_norm": 2.755899667739868, + "learning_rate": 4.962556759020898e-05, + "loss": 1.7193, + "step": 310 + }, + { + "epoch": 0.57, + "grad_norm": 3.513024091720581, + "learning_rate": 4.960108651516545e-05, + "loss": 1.852, + "step": 320 + }, + { + "epoch": 0.59, + "grad_norm": 3.7794790267944336, + "learning_rate": 4.9575836698719605e-05, + "loss": 1.6785, + "step": 330 + }, + { + "epoch": 0.6, + "grad_norm": 3.2256739139556885, + "learning_rate": 4.954981892988451e-05, + "loss": 1.6648, + "step": 340 + }, + { + "epoch": 0.62, + "grad_norm": 2.8756954669952393, + "learning_rate": 4.952303402167047e-05, + "loss": 1.6399, + "step": 350 + }, + { + "epoch": 0.64, + "grad_norm": 7.057961463928223, + "learning_rate": 4.949548281105951e-05, + "loss": 1.5875, + "step": 360 + }, + { + "epoch": 0.66, + "grad_norm": 4.63081169128418, + "learning_rate": 4.946716615897932e-05, + "loss": 1.6708, + "step": 370 + }, + { + "epoch": 0.68, + "grad_norm": 8.755204200744629, + "learning_rate": 4.943808495027631e-05, + "loss": 1.636, + "step": 380 + }, + { + "epoch": 0.69, + "grad_norm": 10.21866226196289, + "learning_rate": 4.940824009368793e-05, + "loss": 1.5714, + "step": 390 + }, + { + "epoch": 0.71, + "grad_norm": 5.44133186340332, + "learning_rate": 4.937763252181434e-05, + "loss": 1.4084, + "step": 400 + }, + { + "epoch": 0.71, + "eval_loss": 1.6840696334838867, + "eval_runtime": 124.8851, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 2.002, + "step": 400 + }, + { + "epoch": 0.73, + "grad_norm": 3.056345224380493, + "learning_rate": 4.934626319108923e-05, + "loss": 1.7233, + "step": 410 + }, + { + "epoch": 0.75, + "grad_norm": 4.303133487701416, + "learning_rate": 4.93141330817499e-05, + "loss": 1.5374, + "step": 420 + }, + { + "epoch": 0.76, + "grad_norm": 5.2246623039245605, + "learning_rate": 4.9281243197806726e-05, + "loss": 1.8547, + "step": 430 + }, + { + "epoch": 0.78, + "grad_norm": 3.8070685863494873, + "learning_rate": 4.924759456701167e-05, + "loss": 1.5721, + "step": 440 + }, + { + "epoch": 0.8, + "grad_norm": 3.243337392807007, + "learning_rate": 4.9213188240826245e-05, + "loss": 1.4322, + "step": 450 + }, + { + "epoch": 0.82, + "grad_norm": 4.166132926940918, + "learning_rate": 4.917802529438864e-05, + "loss": 1.6621, + "step": 460 + }, + { + "epoch": 0.84, + "grad_norm": 4.54414701461792, + "learning_rate": 4.9142106826480114e-05, + "loss": 1.6088, + "step": 470 + }, + { + "epoch": 0.85, + "grad_norm": 9.983458518981934, + "learning_rate": 4.910543395949067e-05, + "loss": 1.6152, + "step": 480 + }, + { + "epoch": 0.87, + "grad_norm": 6.45111608505249, + "learning_rate": 4.9068007839383946e-05, + "loss": 1.6361, + "step": 490 + }, + { + "epoch": 0.89, + "grad_norm": 108.82310485839844, + "learning_rate": 4.9029829635661475e-05, + "loss": 1.7045, + "step": 500 + }, + { + "epoch": 0.89, + "eval_loss": 1.6494970321655273, + "eval_runtime": 124.6904, + "eval_samples_per_second": 8.02, + "eval_steps_per_second": 2.005, + "step": 500 + }, + { + "epoch": 0.91, + "grad_norm": 5.705786228179932, + "learning_rate": 4.899090054132609e-05, + "loss": 1.738, + "step": 510 + }, + { + "epoch": 0.92, + "grad_norm": 4.800131320953369, + "learning_rate": 4.895122177284465e-05, + "loss": 1.6218, + "step": 520 + }, + { + "epoch": 0.94, + "grad_norm": 10.11057186126709, + "learning_rate": 4.891079457011005e-05, + "loss": 1.5169, + "step": 530 + }, + { + "epoch": 0.96, + "grad_norm": 9.329095840454102, + "learning_rate": 4.8869620196402436e-05, + "loss": 1.7979, + "step": 540 + }, + { + "epoch": 0.98, + "grad_norm": 3.9115641117095947, + "learning_rate": 4.882769993834978e-05, + "loss": 1.7073, + "step": 550 + }, + { + "epoch": 1.0, + "grad_norm": 4.80266809463501, + "learning_rate": 4.878503510588765e-05, + "loss": 1.6541, + "step": 560 + }, + { + "epoch": 1.01, + "grad_norm": 9.07653522491455, + "learning_rate": 4.874162703221823e-05, + "loss": 1.6888, + "step": 570 + }, + { + "epoch": 1.03, + "grad_norm": 4.492751598358154, + "learning_rate": 4.8697477073768766e-05, + "loss": 1.6448, + "step": 580 + }, + { + "epoch": 1.05, + "grad_norm": 13.852599143981934, + "learning_rate": 4.8652586610149095e-05, + "loss": 1.6236, + "step": 590 + }, + { + "epoch": 1.07, + "grad_norm": 5.424524307250977, + "learning_rate": 4.8606957044108556e-05, + "loss": 1.4969, + "step": 600 + }, + { + "epoch": 1.07, + "eval_loss": 1.6121476888656616, + "eval_runtime": 124.7413, + "eval_samples_per_second": 8.017, + "eval_steps_per_second": 2.004, + "step": 600 + }, + { + "epoch": 1.08, + "grad_norm": 3.611617088317871, + "learning_rate": 4.856058980149216e-05, + "loss": 1.4571, + "step": 610 + }, + { + "epoch": 1.1, + "grad_norm": 4.210519313812256, + "learning_rate": 4.851348633119606e-05, + "loss": 1.63, + "step": 620 + }, + { + "epoch": 1.12, + "grad_norm": 95.43629455566406, + "learning_rate": 4.84656481051222e-05, + "loss": 1.6034, + "step": 630 + }, + { + "epoch": 1.14, + "grad_norm": 4.3693528175354, + "learning_rate": 4.8417076618132426e-05, + "loss": 1.5791, + "step": 640 + }, + { + "epoch": 1.16, + "grad_norm": 3.691178321838379, + "learning_rate": 4.836777338800168e-05, + "loss": 1.5327, + "step": 650 + }, + { + "epoch": 1.17, + "grad_norm": 3.547637939453125, + "learning_rate": 4.8317739955370636e-05, + "loss": 1.4278, + "step": 660 + }, + { + "epoch": 1.19, + "grad_norm": 3.426717519760132, + "learning_rate": 4.8266977883697515e-05, + "loss": 1.5317, + "step": 670 + }, + { + "epoch": 1.21, + "grad_norm": 3.004473924636841, + "learning_rate": 4.821548875920927e-05, + "loss": 1.6848, + "step": 680 + }, + { + "epoch": 1.23, + "grad_norm": 3.686044931411743, + "learning_rate": 4.816327419085196e-05, + "loss": 1.6079, + "step": 690 + }, + { + "epoch": 1.24, + "grad_norm": 4.130298137664795, + "learning_rate": 4.811033581024056e-05, + "loss": 1.5998, + "step": 700 + }, + { + "epoch": 1.24, + "eval_loss": 1.5970302820205688, + "eval_runtime": 124.9388, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 2.001, + "step": 700 + }, + { + "epoch": 1.26, + "grad_norm": 6.1143059730529785, + "learning_rate": 4.805667527160788e-05, + "loss": 1.554, + "step": 710 + }, + { + "epoch": 1.28, + "grad_norm": 31.27813148498535, + "learning_rate": 4.800229425175294e-05, + "loss": 1.5824, + "step": 720 + }, + { + "epoch": 1.3, + "grad_norm": 9.035768508911133, + "learning_rate": 4.7947194449988555e-05, + "loss": 1.547, + "step": 730 + }, + { + "epoch": 1.32, + "grad_norm": 39.38993835449219, + "learning_rate": 4.7891377588088223e-05, + "loss": 1.5795, + "step": 740 + }, + { + "epoch": 1.33, + "grad_norm": 7.738800048828125, + "learning_rate": 4.7834845410232356e-05, + "loss": 1.5761, + "step": 750 + }, + { + "epoch": 1.35, + "grad_norm": 3.3933961391448975, + "learning_rate": 4.777759968295369e-05, + "loss": 1.6293, + "step": 760 + }, + { + "epoch": 1.37, + "grad_norm": 4.511744022369385, + "learning_rate": 4.771964219508222e-05, + "loss": 1.4761, + "step": 770 + }, + { + "epoch": 1.39, + "grad_norm": 3.566397190093994, + "learning_rate": 4.766097475768919e-05, + "loss": 1.5707, + "step": 780 + }, + { + "epoch": 1.4, + "grad_norm": 9.365654945373535, + "learning_rate": 4.7601599204030544e-05, + "loss": 1.3932, + "step": 790 + }, + { + "epoch": 1.42, + "grad_norm": 3.3254847526550293, + "learning_rate": 4.754151738948962e-05, + "loss": 1.6041, + "step": 800 + }, + { + "epoch": 1.42, + "eval_loss": 1.5639870166778564, + "eval_runtime": 124.923, + "eval_samples_per_second": 8.005, + "eval_steps_per_second": 2.001, + "step": 800 + }, + { + "epoch": 1.44, + "grad_norm": 3.520264148712158, + "learning_rate": 4.7480731191519224e-05, + "loss": 1.4991, + "step": 810 + }, + { + "epoch": 1.46, + "grad_norm": 5.3987531661987305, + "learning_rate": 4.741924250958289e-05, + "loss": 1.6856, + "step": 820 + }, + { + "epoch": 1.48, + "grad_norm": 12.352794647216797, + "learning_rate": 4.7357053265095575e-05, + "loss": 1.4509, + "step": 830 + }, + { + "epoch": 1.49, + "grad_norm": 9.825531005859375, + "learning_rate": 4.729416540136361e-05, + "loss": 1.6168, + "step": 840 + }, + { + "epoch": 1.51, + "grad_norm": 10.881526947021484, + "learning_rate": 4.723058088352395e-05, + "loss": 1.5783, + "step": 850 + }, + { + "epoch": 1.53, + "grad_norm": 6.232407093048096, + "learning_rate": 4.7166301698482815e-05, + "loss": 1.4556, + "step": 860 + }, + { + "epoch": 1.55, + "grad_norm": 3.3216302394866943, + "learning_rate": 4.710132985485355e-05, + "loss": 1.593, + "step": 870 + }, + { + "epoch": 1.56, + "grad_norm": 5.219264984130859, + "learning_rate": 4.703566738289389e-05, + "loss": 1.5131, + "step": 880 + }, + { + "epoch": 1.58, + "grad_norm": 7.875769138336182, + "learning_rate": 4.696931633444251e-05, + "loss": 1.5667, + "step": 890 + }, + { + "epoch": 1.6, + "grad_norm": 5.77959680557251, + "learning_rate": 4.69022787828549e-05, + "loss": 1.5211, + "step": 900 + }, + { + "epoch": 1.6, + "eval_loss": 1.5731443166732788, + "eval_runtime": 124.8025, + "eval_samples_per_second": 8.013, + "eval_steps_per_second": 2.003, + "step": 900 + } + ], + "logging_steps": 10, + "max_steps": 5620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "total_flos": 7.080392918182134e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..4a9f141004eacb993ed2644ad1df0d132a6a81d8 --- /dev/null +++ b/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3882dfa1e7d13fcab0815293b9ac841a1a275771a1fdadce3f013196b52e019b +size 5048 diff --git a/runs/.DS_Store b/runs/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..8afaa88626997669157da5e19b44fcf5718e3b5a Binary files /dev/null and b/runs/.DS_Store differ diff --git a/runs/events.out.tfevents.1709267884.cca2c738afde b/runs/events.out.tfevents.1709267884.cca2c738afde new file mode 100644 index 0000000000000000000000000000000000000000..a96834e25b6e250c36a2b4a3e5ba7bfd4b070c11 --- /dev/null +++ b/runs/events.out.tfevents.1709267884.cca2c738afde @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e56c2ec018bb543cc1d5215c48ac5d5ad5d95b670c099eeab972d04eed30268 +size 57768