diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17729c873634bbfb1699f787a721c2f034265b15 --- /dev/null +++ b/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ubuntu/Apps/DataInf/models/model +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b6984d46d6583eed5953a32eacf9f9ce36613d --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ubuntu/Apps/DataInf/models/model", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b8569b1b01e4815c728777d4e1f28ddf15949f2f --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c618c18055bfce72d306d6b635cd0ecbd60120c3067688e0d526ab340b6b02 +size 26235704 diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17729c873634bbfb1699f787a721c2f034265b15 --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ubuntu/Apps/DataInf/models/model +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations 
of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b6984d46d6583eed5953a32eacf9f9ce36613d --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ubuntu/Apps/DataInf/models/model", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..921290275a586ea3dab33fe7762bc7224f3f3d3a --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3471e7f6f9007da19571b83b2c2ffc8c2b14cebd1f4f0edafaf9d5308fe78d79 +size 26235704 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 
0000000000000000000000000000000000000000..2cdfb6f99d82112a43d15d23ffc5b71b7ceee9f3 --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6aabca352b9775a966b16d5684f7f41606975c89d075033dcf2ac50fd4b63e4 +size 52563258 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ec7f0bc9183e6d8f5e6e48d33aa3cd13068989ee --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a16561ec857cdb55d8ca0062103fa8db84597e98bbcbc54a602f1c5d7574907a +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..80b5f49597962dd5e855774dfd8e32005cf4598d --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de4aa167f166f0cccc89350e9848a4fe7936d18a758a5fb69167c22863ba1414 +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca9f0b39df7b30b561a2070b66bf0059e2aa9c8 --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if 
message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5178c6e405040eb29dd31eadebfca97b4ff24b6e --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,733 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.2, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.032, + "grad_norm": 0.3297976851463318, + "learning_rate": 0.0002990322580645161, + "loss": 1.0389, + "step": 1 + }, + { + "epoch": 0.064, + "grad_norm": 0.4069916307926178, + "learning_rate": 0.0002980645161290322, + "loss": 1.3377, + "step": 2 + }, + { + "epoch": 0.096, + "grad_norm": 0.42084500193595886, + "learning_rate": 0.00029709677419354836, + "loss": 0.9366, + "step": 3 + }, + { + "epoch": 0.128, + "grad_norm": 0.4641948938369751, + "learning_rate": 0.0002961290322580645, + "loss": 1.0086, + "step": 4 + }, + { + "epoch": 0.16, + "grad_norm": 0.3840750455856323, + "learning_rate": 0.00029516129032258065, + "loss": 0.8333, + "step": 5 + }, + { + "epoch": 0.192, + "grad_norm": 0.4263865053653717, + "learning_rate": 0.00029419354838709674, + "loss": 0.854, + "step": 6 + }, + { + "epoch": 0.224, + "grad_norm": 0.48615148663520813, + "learning_rate": 0.0002932258064516129, + "loss": 0.9548, + "step": 7 + }, + { + "epoch": 0.256, + "grad_norm": 0.44419369101524353, + "learning_rate": 0.00029225806451612903, + "loss": 0.8482, + "step": 8 + }, + { + "epoch": 0.288, + "grad_norm": 0.5317733883857727, + "learning_rate": 0.0002912903225806451, + "loss": 0.9426, + "step": 9 + }, + { + "epoch": 0.32, + "grad_norm": 0.47260937094688416, + "learning_rate": 0.00029032258064516127, + "loss": 0.9816, + "step": 10 + }, + { + "epoch": 0.352, + "grad_norm": 0.39063283801078796, + "learning_rate": 0.00028935483870967736, + "loss": 0.84, + "step": 11 + }, + { + "epoch": 0.384, + "grad_norm": 0.39234670996665955, + "learning_rate": 0.0002883870967741935, + "loss": 0.7476, + "step": 12 + }, + { + "epoch": 0.416, + "grad_norm": 0.40661805868148804, + "learning_rate": 0.00028741935483870965, + "loss": 0.9282, + "step": 13 + }, + { + "epoch": 0.448, + "grad_norm": 0.42970865964889526, + "learning_rate": 0.0002864516129032258, + "loss": 0.7858, + "step": 14 + }, + { + "epoch": 0.48, + "grad_norm": 0.3780193626880646, + "learning_rate": 0.00028548387096774194, + "loss": 0.7968, + "step": 15 + }, + { + "epoch": 0.512, + "grad_norm": 0.37006014585494995, + "learning_rate": 0.00028451612903225803, + "loss": 0.6801, + "step": 16 + }, + { + "epoch": 0.544, + "grad_norm": 0.3660840392112732, + "learning_rate": 0.0002835483870967742, + "loss": 0.5914, + "step": 17 + }, + { + "epoch": 0.576, + "grad_norm": 0.3270975351333618, + "learning_rate": 0.00028258064516129027, + "loss": 0.6449, + "step": 18 + }, + { + "epoch": 0.608, + "grad_norm": 0.3859024941921234, + "learning_rate": 
0.0002816129032258064, + "loss": 0.8144, + "step": 19 + }, + { + "epoch": 0.64, + "grad_norm": 0.37092071771621704, + "learning_rate": 0.00028064516129032256, + "loss": 0.7667, + "step": 20 + }, + { + "epoch": 0.672, + "grad_norm": 0.37667015194892883, + "learning_rate": 0.0002796774193548387, + "loss": 0.7751, + "step": 21 + }, + { + "epoch": 0.704, + "grad_norm": 0.3832458555698395, + "learning_rate": 0.0002787096774193548, + "loss": 0.755, + "step": 22 + }, + { + "epoch": 0.736, + "grad_norm": 0.327288419008255, + "learning_rate": 0.00027774193548387095, + "loss": 0.7178, + "step": 23 + }, + { + "epoch": 0.768, + "grad_norm": 0.34552687406539917, + "learning_rate": 0.0002767741935483871, + "loss": 0.7057, + "step": 24 + }, + { + "epoch": 0.8, + "grad_norm": 0.3611259460449219, + "learning_rate": 0.0002758064516129032, + "loss": 0.8159, + "step": 25 + }, + { + "epoch": 0.832, + "grad_norm": 0.3345054090023041, + "learning_rate": 0.00027483870967741933, + "loss": 0.7208, + "step": 26 + }, + { + "epoch": 0.864, + "grad_norm": 0.3697254955768585, + "learning_rate": 0.0002738709677419355, + "loss": 0.8964, + "step": 27 + }, + { + "epoch": 0.896, + "grad_norm": 0.3905017375946045, + "learning_rate": 0.00027290322580645157, + "loss": 0.7794, + "step": 28 + }, + { + "epoch": 0.928, + "grad_norm": 0.3715725243091583, + "learning_rate": 0.0002719354838709677, + "loss": 0.6966, + "step": 29 + }, + { + "epoch": 0.96, + "grad_norm": 0.3650343120098114, + "learning_rate": 0.00027096774193548386, + "loss": 0.5761, + "step": 30 + }, + { + "epoch": 0.992, + "grad_norm": 0.33932459354400635, + "learning_rate": 0.00027, + "loss": 0.556, + "step": 31 + }, + { + "epoch": 1.024, + "grad_norm": 0.6371742486953735, + "learning_rate": 0.0002690322580645161, + "loss": 0.847, + "step": 32 + }, + { + "epoch": 1.056, + "grad_norm": 0.37499895691871643, + "learning_rate": 0.00026806451612903224, + "loss": 0.8419, + "step": 33 + }, + { + "epoch": 1.088, + "grad_norm": 0.33221954107284546, + "learning_rate": 0.0002670967741935484, + "loss": 0.6011, + "step": 34 + }, + { + "epoch": 1.12, + "grad_norm": 0.344096839427948, + "learning_rate": 0.0002661290322580645, + "loss": 0.6501, + "step": 35 + }, + { + "epoch": 1.152, + "grad_norm": 0.38429391384124756, + "learning_rate": 0.0002651612903225806, + "loss": 0.8091, + "step": 36 + }, + { + "epoch": 1.184, + "grad_norm": 0.38014867901802063, + "learning_rate": 0.00026419354838709677, + "loss": 0.7668, + "step": 37 + }, + { + "epoch": 1.216, + "grad_norm": 0.3352573812007904, + "learning_rate": 0.00026322580645161286, + "loss": 0.5444, + "step": 38 + }, + { + "epoch": 1.248, + "grad_norm": 0.33811062574386597, + "learning_rate": 0.000262258064516129, + "loss": 0.512, + "step": 39 + }, + { + "epoch": 1.28, + "grad_norm": 0.3998416066169739, + "learning_rate": 0.00026129032258064515, + "loss": 0.6315, + "step": 40 + }, + { + "epoch": 1.312, + "grad_norm": 0.3983341157436371, + "learning_rate": 0.0002603225806451613, + "loss": 0.5882, + "step": 41 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4585898816585541, + "learning_rate": 0.0002593548387096774, + "loss": 0.761, + "step": 42 + }, + { + "epoch": 1.376, + "grad_norm": 0.4080730080604553, + "learning_rate": 0.00025838709677419354, + "loss": 0.6716, + "step": 43 + }, + { + "epoch": 1.408, + "grad_norm": 0.4068273901939392, + "learning_rate": 0.0002574193548387096, + "loss": 0.6376, + "step": 44 + }, + { + "epoch": 1.44, + "grad_norm": 0.4406949579715729, + "learning_rate": 0.00025645161290322577, + "loss": 0.4594, 
+ "step": 45 + }, + { + "epoch": 1.472, + "grad_norm": 0.34500986337661743, + "learning_rate": 0.0002554838709677419, + "loss": 0.3672, + "step": 46 + }, + { + "epoch": 1.504, + "grad_norm": 0.4760681390762329, + "learning_rate": 0.00025451612903225806, + "loss": 0.6331, + "step": 47 + }, + { + "epoch": 1.536, + "grad_norm": 0.39281558990478516, + "learning_rate": 0.0002535483870967742, + "loss": 0.5845, + "step": 48 + }, + { + "epoch": 1.568, + "grad_norm": 0.4265002906322479, + "learning_rate": 0.0002525806451612903, + "loss": 0.4461, + "step": 49 + }, + { + "epoch": 1.6, + "grad_norm": 0.40967294573783875, + "learning_rate": 0.00025161290322580645, + "loss": 0.7011, + "step": 50 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.4288088381290436, + "learning_rate": 0.00025064516129032254, + "loss": 0.6928, + "step": 51 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.4356289803981781, + "learning_rate": 0.0002496774193548387, + "loss": 0.7972, + "step": 52 + }, + { + "epoch": 1.696, + "grad_norm": 0.3827487826347351, + "learning_rate": 0.0002487096774193548, + "loss": 0.2991, + "step": 53 + }, + { + "epoch": 1.728, + "grad_norm": 0.40093398094177246, + "learning_rate": 0.0002477419354838709, + "loss": 0.416, + "step": 54 + }, + { + "epoch": 1.76, + "grad_norm": 0.41548973321914673, + "learning_rate": 0.00024677419354838707, + "loss": 0.5501, + "step": 55 + }, + { + "epoch": 1.792, + "grad_norm": 0.4093388617038727, + "learning_rate": 0.0002458064516129032, + "loss": 0.5557, + "step": 56 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.3934040665626526, + "learning_rate": 0.00024483870967741936, + "loss": 0.602, + "step": 57 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.42221033573150635, + "learning_rate": 0.00024387096774193545, + "loss": 0.6421, + "step": 58 + }, + { + "epoch": 1.888, + "grad_norm": 0.4351339340209961, + "learning_rate": 0.0002429032258064516, + "loss": 0.5615, + "step": 59 + }, + { + "epoch": 1.92, + "grad_norm": 0.4319838881492615, + "learning_rate": 0.00024193548387096771, + "loss": 0.6804, + "step": 60 + }, + { + "epoch": 1.952, + "grad_norm": 0.40016525983810425, + "learning_rate": 0.00024096774193548386, + "loss": 0.5432, + "step": 61 + }, + { + "epoch": 1.984, + "grad_norm": 0.3905942440032959, + "learning_rate": 0.00023999999999999998, + "loss": 0.4187, + "step": 62 + }, + { + "epoch": 2.016, + "grad_norm": 0.8056382536888123, + "learning_rate": 0.0002390322580645161, + "loss": 1.0174, + "step": 63 + }, + { + "epoch": 2.048, + "grad_norm": 0.3835236430168152, + "learning_rate": 0.00023806451612903224, + "loss": 0.5992, + "step": 64 + }, + { + "epoch": 2.08, + "grad_norm": 0.41092216968536377, + "learning_rate": 0.00023709677419354836, + "loss": 0.4746, + "step": 65 + }, + { + "epoch": 2.112, + "grad_norm": 0.39536622166633606, + "learning_rate": 0.0002361290322580645, + "loss": 0.3946, + "step": 66 + }, + { + "epoch": 2.144, + "grad_norm": 0.3927665948867798, + "learning_rate": 0.0002351612903225806, + "loss": 0.5187, + "step": 67 + }, + { + "epoch": 2.176, + "grad_norm": 0.39792704582214355, + "learning_rate": 0.00023419354838709674, + "loss": 0.4568, + "step": 68 + }, + { + "epoch": 2.208, + "grad_norm": 0.5023652911186218, + "learning_rate": 0.0002332258064516129, + "loss": 0.6166, + "step": 69 + }, + { + "epoch": 2.24, + "grad_norm": 0.425017774105072, + "learning_rate": 0.000232258064516129, + "loss": 0.42, + "step": 70 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.46458110213279724, + "learning_rate": 
0.00023129032258064516, + "loss": 0.4613, + "step": 71 + }, + { + "epoch": 2.304, + "grad_norm": 0.49037960171699524, + "learning_rate": 0.00023032258064516125, + "loss": 0.5509, + "step": 72 + }, + { + "epoch": 2.336, + "grad_norm": 0.5233697891235352, + "learning_rate": 0.0002293548387096774, + "loss": 0.6396, + "step": 73 + }, + { + "epoch": 2.368, + "grad_norm": 0.4720582962036133, + "learning_rate": 0.0002283870967741935, + "loss": 0.5076, + "step": 74 + }, + { + "epoch": 2.4, + "grad_norm": 0.4900650382041931, + "learning_rate": 0.00022741935483870966, + "loss": 0.4794, + "step": 75 + }, + { + "epoch": 2.432, + "grad_norm": 0.6321704983711243, + "learning_rate": 0.0002264516129032258, + "loss": 0.6677, + "step": 76 + }, + { + "epoch": 2.464, + "grad_norm": 0.5305324792861938, + "learning_rate": 0.00022548387096774192, + "loss": 0.5102, + "step": 77 + }, + { + "epoch": 2.496, + "grad_norm": 0.5799248218536377, + "learning_rate": 0.00022451612903225804, + "loss": 0.5274, + "step": 78 + }, + { + "epoch": 2.528, + "grad_norm": 0.4990101456642151, + "learning_rate": 0.00022354838709677416, + "loss": 0.5407, + "step": 79 + }, + { + "epoch": 2.56, + "grad_norm": 0.4779827296733856, + "learning_rate": 0.0002225806451612903, + "loss": 0.5166, + "step": 80 + }, + { + "epoch": 2.592, + "grad_norm": 0.5140111446380615, + "learning_rate": 0.00022161290322580645, + "loss": 0.3288, + "step": 81 + }, + { + "epoch": 2.624, + "grad_norm": 0.5674853920936584, + "learning_rate": 0.00022064516129032257, + "loss": 0.666, + "step": 82 + }, + { + "epoch": 2.656, + "grad_norm": 0.5277597308158875, + "learning_rate": 0.00021967741935483871, + "loss": 0.5335, + "step": 83 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.6029439568519592, + "learning_rate": 0.0002187096774193548, + "loss": 0.693, + "step": 84 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.5039327144622803, + "learning_rate": 0.00021774193548387095, + "loss": 0.5728, + "step": 85 + }, + { + "epoch": 2.752, + "grad_norm": 0.5564692616462708, + "learning_rate": 0.00021677419354838707, + "loss": 0.4734, + "step": 86 + }, + { + "epoch": 2.784, + "grad_norm": 0.5278319120407104, + "learning_rate": 0.00021580645161290322, + "loss": 0.5834, + "step": 87 + }, + { + "epoch": 2.816, + "grad_norm": 0.5445135831832886, + "learning_rate": 0.00021483870967741936, + "loss": 0.4642, + "step": 88 + }, + { + "epoch": 2.848, + "grad_norm": 0.5394749045372009, + "learning_rate": 0.00021387096774193545, + "loss": 0.4779, + "step": 89 + }, + { + "epoch": 2.88, + "grad_norm": 0.5756134390830994, + "learning_rate": 0.0002129032258064516, + "loss": 0.5607, + "step": 90 + }, + { + "epoch": 2.912, + "grad_norm": 0.48361241817474365, + "learning_rate": 0.00021193548387096772, + "loss": 0.4278, + "step": 91 + }, + { + "epoch": 2.944, + "grad_norm": 0.5017121434211731, + "learning_rate": 0.00021096774193548386, + "loss": 0.4834, + "step": 92 + }, + { + "epoch": 2.976, + "grad_norm": 0.4741989076137543, + "learning_rate": 0.00020999999999999998, + "loss": 0.468, + "step": 93 + }, + { + "epoch": 3.008, + "grad_norm": 1.003368854522705, + "learning_rate": 0.0002090322580645161, + "loss": 0.8614, + "step": 94 + }, + { + "epoch": 3.04, + "grad_norm": 0.4782228469848633, + "learning_rate": 0.00020806451612903225, + "loss": 0.4111, + "step": 95 + }, + { + "epoch": 3.072, + "grad_norm": 0.4558674395084381, + "learning_rate": 0.00020709677419354836, + "loss": 0.3463, + "step": 96 + }, + { + "epoch": 3.104, + "grad_norm": 0.4409371316432953, + "learning_rate": 
0.0002061290322580645, + "loss": 0.2571, + "step": 97 + }, + { + "epoch": 3.136, + "grad_norm": 0.5415034890174866, + "learning_rate": 0.00020516129032258063, + "loss": 0.5707, + "step": 98 + }, + { + "epoch": 3.168, + "grad_norm": 0.6157724857330322, + "learning_rate": 0.00020419354838709677, + "loss": 0.5692, + "step": 99 + }, + { + "epoch": 3.2, + "grad_norm": 0.4855688810348511, + "learning_rate": 0.00020322580645161287, + "loss": 0.3311, + "step": 100 + } + ], + "logging_steps": 1, + "max_steps": 310, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.3700930822144e+16, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..01723982396407692a903d785c60e57fcabfa0c4 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457c697b05fd5daa3c83df8920300c4940c26fb78ace5b5428b7c95d133a0ef4 +size 5560 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17729c873634bbfb1699f787a721c2f034265b15 --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ubuntu/Apps/DataInf/models/model +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
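Until the card is filled in, here is a minimal loading sketch, not code shipped with this repo: the repository only contains a PEFT LoRA adapter (see `adapter_config.json`: r=8, lora_alpha=32, targeting `q_proj` and `v_proj`), so it has to be applied on top of the base model named in `base_model_name_or_path`. The adapter path below is a placeholder, the base-model path is copied from `adapter_config.json`, and `torch`, `transformers`, and `peft` are assumed to be installed.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Illustrative paths: the base-model path comes from adapter_config.json,
# the adapter path is wherever this repository was cloned or downloaded.
base_model_path = "/home/ubuntu/Apps/DataInf/models/model"
adapter_path = "path/to/this/adapter"  # placeholder

tokenizer = AutoTokenizer.from_pretrained(base_model_path)
# Use a GPU for float16, or drop torch_dtype for a CPU-only run.
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16)

# Attach the LoRA weights (q_proj/v_proj, rank 8) on top of the frozen base model.
model = PeftModel.from_pretrained(base_model, adapter_path)
model.eval()

# Llama-2-style [INST] prompt, matching the chat template stored with the checkpoints.
prompt = "[INST] Summarize what a LoRA adapter is. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

For deployment, `model.merge_and_unload()` can fold the adapter into the base weights so that inference no longer requires the PEFT dependency.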
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b6984d46d6583eed5953a32eacf9f9ce36613d --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ubuntu/Apps/DataInf/models/model", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ea2344d2c114ffd0940f97c13bbea058f279d643 --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76034bd12f617b62f87d1878710f04e43736a4161233cbd240f6ea4c24e35dc7 +size 26235704 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..21f5a6a917e4c49591c4b5bd0a2ae0bfaf0280da --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:a8e377a77f5c23ff7f423e5dbb2c0bb98a05cbe4a1ce433ebe86185e505f3d95 +size 52563258 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..ba2a153441a74f7d4f28a7eefafa080c29286592 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ec44201e5ce901f0bed6a4979735328ac972b7f817413219ecec88ecb8ce9c0 +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d47de238d025ea40daae2e188b96dc453a788457 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3048d808fac665a3eab1331732df75d683f7b1c8261cab5545365b6834a4e34 +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca9f0b39df7b30b561a2070b66bf0059e2aa9c8 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + 
"clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..4c831e658a4ea3a61bf00df65cb3e312dc3067bd --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,1433 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 6.4, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.032, + "grad_norm": 0.3297976851463318, + "learning_rate": 0.0002990322580645161, + "loss": 1.0389, + "step": 1 + }, + { + "epoch": 0.064, + "grad_norm": 0.4069916307926178, + "learning_rate": 0.0002980645161290322, + "loss": 1.3377, + "step": 2 + }, + { + "epoch": 0.096, + "grad_norm": 0.42084500193595886, + "learning_rate": 0.00029709677419354836, + "loss": 0.9366, + "step": 3 + }, + { + "epoch": 0.128, + "grad_norm": 0.4641948938369751, + "learning_rate": 0.0002961290322580645, + "loss": 1.0086, + "step": 4 + }, + { + "epoch": 0.16, + "grad_norm": 0.3840750455856323, + "learning_rate": 0.00029516129032258065, + "loss": 0.8333, + "step": 5 + }, + { + "epoch": 0.192, + "grad_norm": 0.4263865053653717, + "learning_rate": 0.00029419354838709674, + "loss": 0.854, + "step": 6 + }, + { + "epoch": 0.224, + "grad_norm": 0.48615148663520813, + "learning_rate": 0.0002932258064516129, + "loss": 0.9548, + "step": 7 + }, + { + "epoch": 0.256, + "grad_norm": 0.44419369101524353, + "learning_rate": 0.00029225806451612903, + "loss": 0.8482, + "step": 8 + }, + { + "epoch": 0.288, + "grad_norm": 0.5317733883857727, + "learning_rate": 0.0002912903225806451, + "loss": 0.9426, + "step": 9 + }, + { + "epoch": 0.32, + "grad_norm": 0.47260937094688416, + "learning_rate": 0.00029032258064516127, + "loss": 0.9816, + "step": 10 + }, + { + "epoch": 0.352, + "grad_norm": 0.39063283801078796, + "learning_rate": 0.00028935483870967736, + "loss": 0.84, + "step": 11 + }, + { + "epoch": 0.384, + "grad_norm": 0.39234670996665955, + "learning_rate": 0.0002883870967741935, + "loss": 0.7476, + "step": 12 + }, + { + "epoch": 0.416, + "grad_norm": 0.40661805868148804, + "learning_rate": 0.00028741935483870965, + "loss": 0.9282, + "step": 13 + }, + { + "epoch": 0.448, + "grad_norm": 0.42970865964889526, + "learning_rate": 0.0002864516129032258, + "loss": 0.7858, + "step": 14 + }, + { + "epoch": 0.48, + "grad_norm": 0.3780193626880646, + "learning_rate": 0.00028548387096774194, + "loss": 0.7968, + "step": 15 + }, + { + "epoch": 0.512, + "grad_norm": 0.37006014585494995, + "learning_rate": 0.00028451612903225803, + "loss": 0.6801, + "step": 16 + }, + { + "epoch": 0.544, + "grad_norm": 0.3660840392112732, + "learning_rate": 0.0002835483870967742, + "loss": 0.5914, + "step": 17 + }, + { + "epoch": 0.576, + "grad_norm": 0.3270975351333618, + "learning_rate": 0.00028258064516129027, + "loss": 0.6449, + "step": 18 + }, + { + "epoch": 0.608, + "grad_norm": 0.3859024941921234, + "learning_rate": 0.0002816129032258064, + "loss": 0.8144, + "step": 19 + }, + { + "epoch": 0.64, + "grad_norm": 0.37092071771621704, + "learning_rate": 0.00028064516129032256, + "loss": 0.7667, + "step": 20 + }, + { 
+ "epoch": 0.672, + "grad_norm": 0.37667015194892883, + "learning_rate": 0.0002796774193548387, + "loss": 0.7751, + "step": 21 + }, + { + "epoch": 0.704, + "grad_norm": 0.3832458555698395, + "learning_rate": 0.0002787096774193548, + "loss": 0.755, + "step": 22 + }, + { + "epoch": 0.736, + "grad_norm": 0.327288419008255, + "learning_rate": 0.00027774193548387095, + "loss": 0.7178, + "step": 23 + }, + { + "epoch": 0.768, + "grad_norm": 0.34552687406539917, + "learning_rate": 0.0002767741935483871, + "loss": 0.7057, + "step": 24 + }, + { + "epoch": 0.8, + "grad_norm": 0.3611259460449219, + "learning_rate": 0.0002758064516129032, + "loss": 0.8159, + "step": 25 + }, + { + "epoch": 0.832, + "grad_norm": 0.3345054090023041, + "learning_rate": 0.00027483870967741933, + "loss": 0.7208, + "step": 26 + }, + { + "epoch": 0.864, + "grad_norm": 0.3697254955768585, + "learning_rate": 0.0002738709677419355, + "loss": 0.8964, + "step": 27 + }, + { + "epoch": 0.896, + "grad_norm": 0.3905017375946045, + "learning_rate": 0.00027290322580645157, + "loss": 0.7794, + "step": 28 + }, + { + "epoch": 0.928, + "grad_norm": 0.3715725243091583, + "learning_rate": 0.0002719354838709677, + "loss": 0.6966, + "step": 29 + }, + { + "epoch": 0.96, + "grad_norm": 0.3650343120098114, + "learning_rate": 0.00027096774193548386, + "loss": 0.5761, + "step": 30 + }, + { + "epoch": 0.992, + "grad_norm": 0.33932459354400635, + "learning_rate": 0.00027, + "loss": 0.556, + "step": 31 + }, + { + "epoch": 1.024, + "grad_norm": 0.6371742486953735, + "learning_rate": 0.0002690322580645161, + "loss": 0.847, + "step": 32 + }, + { + "epoch": 1.056, + "grad_norm": 0.37499895691871643, + "learning_rate": 0.00026806451612903224, + "loss": 0.8419, + "step": 33 + }, + { + "epoch": 1.088, + "grad_norm": 0.33221954107284546, + "learning_rate": 0.0002670967741935484, + "loss": 0.6011, + "step": 34 + }, + { + "epoch": 1.12, + "grad_norm": 0.344096839427948, + "learning_rate": 0.0002661290322580645, + "loss": 0.6501, + "step": 35 + }, + { + "epoch": 1.152, + "grad_norm": 0.38429391384124756, + "learning_rate": 0.0002651612903225806, + "loss": 0.8091, + "step": 36 + }, + { + "epoch": 1.184, + "grad_norm": 0.38014867901802063, + "learning_rate": 0.00026419354838709677, + "loss": 0.7668, + "step": 37 + }, + { + "epoch": 1.216, + "grad_norm": 0.3352573812007904, + "learning_rate": 0.00026322580645161286, + "loss": 0.5444, + "step": 38 + }, + { + "epoch": 1.248, + "grad_norm": 0.33811062574386597, + "learning_rate": 0.000262258064516129, + "loss": 0.512, + "step": 39 + }, + { + "epoch": 1.28, + "grad_norm": 0.3998416066169739, + "learning_rate": 0.00026129032258064515, + "loss": 0.6315, + "step": 40 + }, + { + "epoch": 1.312, + "grad_norm": 0.3983341157436371, + "learning_rate": 0.0002603225806451613, + "loss": 0.5882, + "step": 41 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4585898816585541, + "learning_rate": 0.0002593548387096774, + "loss": 0.761, + "step": 42 + }, + { + "epoch": 1.376, + "grad_norm": 0.4080730080604553, + "learning_rate": 0.00025838709677419354, + "loss": 0.6716, + "step": 43 + }, + { + "epoch": 1.408, + "grad_norm": 0.4068273901939392, + "learning_rate": 0.0002574193548387096, + "loss": 0.6376, + "step": 44 + }, + { + "epoch": 1.44, + "grad_norm": 0.4406949579715729, + "learning_rate": 0.00025645161290322577, + "loss": 0.4594, + "step": 45 + }, + { + "epoch": 1.472, + "grad_norm": 0.34500986337661743, + "learning_rate": 0.0002554838709677419, + "loss": 0.3672, + "step": 46 + }, + { + "epoch": 1.504, + "grad_norm": 
0.4760681390762329, + "learning_rate": 0.00025451612903225806, + "loss": 0.6331, + "step": 47 + }, + { + "epoch": 1.536, + "grad_norm": 0.39281558990478516, + "learning_rate": 0.0002535483870967742, + "loss": 0.5845, + "step": 48 + }, + { + "epoch": 1.568, + "grad_norm": 0.4265002906322479, + "learning_rate": 0.0002525806451612903, + "loss": 0.4461, + "step": 49 + }, + { + "epoch": 1.6, + "grad_norm": 0.40967294573783875, + "learning_rate": 0.00025161290322580645, + "loss": 0.7011, + "step": 50 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.4288088381290436, + "learning_rate": 0.00025064516129032254, + "loss": 0.6928, + "step": 51 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.4356289803981781, + "learning_rate": 0.0002496774193548387, + "loss": 0.7972, + "step": 52 + }, + { + "epoch": 1.696, + "grad_norm": 0.3827487826347351, + "learning_rate": 0.0002487096774193548, + "loss": 0.2991, + "step": 53 + }, + { + "epoch": 1.728, + "grad_norm": 0.40093398094177246, + "learning_rate": 0.0002477419354838709, + "loss": 0.416, + "step": 54 + }, + { + "epoch": 1.76, + "grad_norm": 0.41548973321914673, + "learning_rate": 0.00024677419354838707, + "loss": 0.5501, + "step": 55 + }, + { + "epoch": 1.792, + "grad_norm": 0.4093388617038727, + "learning_rate": 0.0002458064516129032, + "loss": 0.5557, + "step": 56 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.3934040665626526, + "learning_rate": 0.00024483870967741936, + "loss": 0.602, + "step": 57 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.42221033573150635, + "learning_rate": 0.00024387096774193545, + "loss": 0.6421, + "step": 58 + }, + { + "epoch": 1.888, + "grad_norm": 0.4351339340209961, + "learning_rate": 0.0002429032258064516, + "loss": 0.5615, + "step": 59 + }, + { + "epoch": 1.92, + "grad_norm": 0.4319838881492615, + "learning_rate": 0.00024193548387096771, + "loss": 0.6804, + "step": 60 + }, + { + "epoch": 1.952, + "grad_norm": 0.40016525983810425, + "learning_rate": 0.00024096774193548386, + "loss": 0.5432, + "step": 61 + }, + { + "epoch": 1.984, + "grad_norm": 0.3905942440032959, + "learning_rate": 0.00023999999999999998, + "loss": 0.4187, + "step": 62 + }, + { + "epoch": 2.016, + "grad_norm": 0.8056382536888123, + "learning_rate": 0.0002390322580645161, + "loss": 1.0174, + "step": 63 + }, + { + "epoch": 2.048, + "grad_norm": 0.3835236430168152, + "learning_rate": 0.00023806451612903224, + "loss": 0.5992, + "step": 64 + }, + { + "epoch": 2.08, + "grad_norm": 0.41092216968536377, + "learning_rate": 0.00023709677419354836, + "loss": 0.4746, + "step": 65 + }, + { + "epoch": 2.112, + "grad_norm": 0.39536622166633606, + "learning_rate": 0.0002361290322580645, + "loss": 0.3946, + "step": 66 + }, + { + "epoch": 2.144, + "grad_norm": 0.3927665948867798, + "learning_rate": 0.0002351612903225806, + "loss": 0.5187, + "step": 67 + }, + { + "epoch": 2.176, + "grad_norm": 0.39792704582214355, + "learning_rate": 0.00023419354838709674, + "loss": 0.4568, + "step": 68 + }, + { + "epoch": 2.208, + "grad_norm": 0.5023652911186218, + "learning_rate": 0.0002332258064516129, + "loss": 0.6166, + "step": 69 + }, + { + "epoch": 2.24, + "grad_norm": 0.425017774105072, + "learning_rate": 0.000232258064516129, + "loss": 0.42, + "step": 70 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.46458110213279724, + "learning_rate": 0.00023129032258064516, + "loss": 0.4613, + "step": 71 + }, + { + "epoch": 2.304, + "grad_norm": 0.49037960171699524, + "learning_rate": 0.00023032258064516125, + "loss": 0.5509, + "step": 72 + }, + { 
+ "epoch": 2.336, + "grad_norm": 0.5233697891235352, + "learning_rate": 0.0002293548387096774, + "loss": 0.6396, + "step": 73 + }, + { + "epoch": 2.368, + "grad_norm": 0.4720582962036133, + "learning_rate": 0.0002283870967741935, + "loss": 0.5076, + "step": 74 + }, + { + "epoch": 2.4, + "grad_norm": 0.4900650382041931, + "learning_rate": 0.00022741935483870966, + "loss": 0.4794, + "step": 75 + }, + { + "epoch": 2.432, + "grad_norm": 0.6321704983711243, + "learning_rate": 0.0002264516129032258, + "loss": 0.6677, + "step": 76 + }, + { + "epoch": 2.464, + "grad_norm": 0.5305324792861938, + "learning_rate": 0.00022548387096774192, + "loss": 0.5102, + "step": 77 + }, + { + "epoch": 2.496, + "grad_norm": 0.5799248218536377, + "learning_rate": 0.00022451612903225804, + "loss": 0.5274, + "step": 78 + }, + { + "epoch": 2.528, + "grad_norm": 0.4990101456642151, + "learning_rate": 0.00022354838709677416, + "loss": 0.5407, + "step": 79 + }, + { + "epoch": 2.56, + "grad_norm": 0.4779827296733856, + "learning_rate": 0.0002225806451612903, + "loss": 0.5166, + "step": 80 + }, + { + "epoch": 2.592, + "grad_norm": 0.5140111446380615, + "learning_rate": 0.00022161290322580645, + "loss": 0.3288, + "step": 81 + }, + { + "epoch": 2.624, + "grad_norm": 0.5674853920936584, + "learning_rate": 0.00022064516129032257, + "loss": 0.666, + "step": 82 + }, + { + "epoch": 2.656, + "grad_norm": 0.5277597308158875, + "learning_rate": 0.00021967741935483871, + "loss": 0.5335, + "step": 83 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.6029439568519592, + "learning_rate": 0.0002187096774193548, + "loss": 0.693, + "step": 84 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.5039327144622803, + "learning_rate": 0.00021774193548387095, + "loss": 0.5728, + "step": 85 + }, + { + "epoch": 2.752, + "grad_norm": 0.5564692616462708, + "learning_rate": 0.00021677419354838707, + "loss": 0.4734, + "step": 86 + }, + { + "epoch": 2.784, + "grad_norm": 0.5278319120407104, + "learning_rate": 0.00021580645161290322, + "loss": 0.5834, + "step": 87 + }, + { + "epoch": 2.816, + "grad_norm": 0.5445135831832886, + "learning_rate": 0.00021483870967741936, + "loss": 0.4642, + "step": 88 + }, + { + "epoch": 2.848, + "grad_norm": 0.5394749045372009, + "learning_rate": 0.00021387096774193545, + "loss": 0.4779, + "step": 89 + }, + { + "epoch": 2.88, + "grad_norm": 0.5756134390830994, + "learning_rate": 0.0002129032258064516, + "loss": 0.5607, + "step": 90 + }, + { + "epoch": 2.912, + "grad_norm": 0.48361241817474365, + "learning_rate": 0.00021193548387096772, + "loss": 0.4278, + "step": 91 + }, + { + "epoch": 2.944, + "grad_norm": 0.5017121434211731, + "learning_rate": 0.00021096774193548386, + "loss": 0.4834, + "step": 92 + }, + { + "epoch": 2.976, + "grad_norm": 0.4741989076137543, + "learning_rate": 0.00020999999999999998, + "loss": 0.468, + "step": 93 + }, + { + "epoch": 3.008, + "grad_norm": 1.003368854522705, + "learning_rate": 0.0002090322580645161, + "loss": 0.8614, + "step": 94 + }, + { + "epoch": 3.04, + "grad_norm": 0.4782228469848633, + "learning_rate": 0.00020806451612903225, + "loss": 0.4111, + "step": 95 + }, + { + "epoch": 3.072, + "grad_norm": 0.4558674395084381, + "learning_rate": 0.00020709677419354836, + "loss": 0.3463, + "step": 96 + }, + { + "epoch": 3.104, + "grad_norm": 0.4409371316432953, + "learning_rate": 0.0002061290322580645, + "loss": 0.2571, + "step": 97 + }, + { + "epoch": 3.136, + "grad_norm": 0.5415034890174866, + "learning_rate": 0.00020516129032258063, + "loss": 0.5707, + "step": 98 + }, + { + 
"epoch": 3.168, + "grad_norm": 0.6157724857330322, + "learning_rate": 0.00020419354838709677, + "loss": 0.5692, + "step": 99 + }, + { + "epoch": 3.2, + "grad_norm": 0.4855688810348511, + "learning_rate": 0.00020322580645161287, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 3.232, + "grad_norm": 0.569878101348877, + "learning_rate": 0.000202258064516129, + "loss": 0.4707, + "step": 101 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 0.645232081413269, + "learning_rate": 0.00020129032258064516, + "loss": 0.5504, + "step": 102 + }, + { + "epoch": 3.296, + "grad_norm": 0.5775763392448425, + "learning_rate": 0.00020032258064516128, + "loss": 0.3651, + "step": 103 + }, + { + "epoch": 3.328, + "grad_norm": 0.5808250904083252, + "learning_rate": 0.00019935483870967742, + "loss": 0.5068, + "step": 104 + }, + { + "epoch": 3.36, + "grad_norm": 0.689313530921936, + "learning_rate": 0.0001983870967741935, + "loss": 0.4936, + "step": 105 + }, + { + "epoch": 3.392, + "grad_norm": 0.6571519374847412, + "learning_rate": 0.00019741935483870966, + "loss": 0.3671, + "step": 106 + }, + { + "epoch": 3.424, + "grad_norm": 0.6340517401695251, + "learning_rate": 0.00019645161290322578, + "loss": 0.4783, + "step": 107 + }, + { + "epoch": 3.456, + "grad_norm": 0.7031407952308655, + "learning_rate": 0.00019548387096774192, + "loss": 0.427, + "step": 108 + }, + { + "epoch": 3.488, + "grad_norm": 0.728496789932251, + "learning_rate": 0.00019451612903225807, + "loss": 0.5497, + "step": 109 + }, + { + "epoch": 3.52, + "grad_norm": 0.6106727719306946, + "learning_rate": 0.00019354838709677416, + "loss": 0.392, + "step": 110 + }, + { + "epoch": 3.552, + "grad_norm": 0.5296047329902649, + "learning_rate": 0.0001925806451612903, + "loss": 0.3412, + "step": 111 + }, + { + "epoch": 3.584, + "grad_norm": 0.6282025575637817, + "learning_rate": 0.00019161290322580643, + "loss": 0.4081, + "step": 112 + }, + { + "epoch": 3.616, + "grad_norm": 0.6166461110115051, + "learning_rate": 0.00019064516129032257, + "loss": 0.4771, + "step": 113 + }, + { + "epoch": 3.648, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0001896774193548387, + "loss": 0.404, + "step": 114 + }, + { + "epoch": 3.68, + "grad_norm": 0.6598389148712158, + "learning_rate": 0.0001887096774193548, + "loss": 0.3915, + "step": 115 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 0.5567564368247986, + "learning_rate": 0.00018774193548387095, + "loss": 0.3862, + "step": 116 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 0.6524521708488464, + "learning_rate": 0.00018677419354838707, + "loss": 0.5315, + "step": 117 + }, + { + "epoch": 3.776, + "grad_norm": 0.7040128707885742, + "learning_rate": 0.00018580645161290322, + "loss": 0.5387, + "step": 118 + }, + { + "epoch": 3.808, + "grad_norm": 0.690262496471405, + "learning_rate": 0.00018483870967741934, + "loss": 0.4877, + "step": 119 + }, + { + "epoch": 3.84, + "grad_norm": 0.6928034424781799, + "learning_rate": 0.00018387096774193548, + "loss": 0.4895, + "step": 120 + }, + { + "epoch": 3.872, + "grad_norm": 0.7148469686508179, + "learning_rate": 0.00018290322580645157, + "loss": 0.4814, + "step": 121 + }, + { + "epoch": 3.904, + "grad_norm": 0.6096572875976562, + "learning_rate": 0.00018193548387096772, + "loss": 0.3403, + "step": 122 + }, + { + "epoch": 3.936, + "grad_norm": 0.7132399678230286, + "learning_rate": 0.00018096774193548387, + "loss": 0.4258, + "step": 123 + }, + { + "epoch": 3.968, + "grad_norm": 0.7302684187889099, + "learning_rate": 0.00017999999999999998, + "loss": 0.7215, 
+ "step": 124 + }, + { + "epoch": 4.0, + "grad_norm": 1.5244004726409912, + "learning_rate": 0.00017903225806451613, + "loss": 0.8544, + "step": 125 + }, + { + "epoch": 4.032, + "grad_norm": 0.6032777428627014, + "learning_rate": 0.00017806451612903222, + "loss": 0.4183, + "step": 126 + }, + { + "epoch": 4.064, + "grad_norm": 0.6349691152572632, + "learning_rate": 0.00017709677419354837, + "loss": 0.5871, + "step": 127 + }, + { + "epoch": 4.096, + "grad_norm": 0.5730060935020447, + "learning_rate": 0.00017612903225806449, + "loss": 0.3786, + "step": 128 + }, + { + "epoch": 4.128, + "grad_norm": 0.6988044381141663, + "learning_rate": 0.00017516129032258063, + "loss": 0.3216, + "step": 129 + }, + { + "epoch": 4.16, + "grad_norm": 0.7379153370857239, + "learning_rate": 0.00017419354838709678, + "loss": 0.4026, + "step": 130 + }, + { + "epoch": 4.192, + "grad_norm": 0.7058238983154297, + "learning_rate": 0.00017322580645161287, + "loss": 0.4328, + "step": 131 + }, + { + "epoch": 4.224, + "grad_norm": 0.80663001537323, + "learning_rate": 0.00017225806451612901, + "loss": 0.3849, + "step": 132 + }, + { + "epoch": 4.256, + "grad_norm": 0.899818480014801, + "learning_rate": 0.00017129032258064513, + "loss": 0.4191, + "step": 133 + }, + { + "epoch": 4.288, + "grad_norm": 0.8538224697113037, + "learning_rate": 0.00017032258064516128, + "loss": 0.3587, + "step": 134 + }, + { + "epoch": 4.32, + "grad_norm": 0.8948169350624084, + "learning_rate": 0.00016935483870967742, + "loss": 0.3957, + "step": 135 + }, + { + "epoch": 4.352, + "grad_norm": 0.7195591926574707, + "learning_rate": 0.00016838709677419354, + "loss": 0.3361, + "step": 136 + }, + { + "epoch": 4.384, + "grad_norm": 0.7769681215286255, + "learning_rate": 0.00016741935483870966, + "loss": 0.3519, + "step": 137 + }, + { + "epoch": 4.416, + "grad_norm": 0.9509867429733276, + "learning_rate": 0.00016645161290322578, + "loss": 0.4216, + "step": 138 + }, + { + "epoch": 4.448, + "grad_norm": 0.7923309206962585, + "learning_rate": 0.00016548387096774193, + "loss": 0.3999, + "step": 139 + }, + { + "epoch": 4.48, + "grad_norm": 0.8961685299873352, + "learning_rate": 0.00016451612903225804, + "loss": 0.5385, + "step": 140 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.7496562004089355, + "learning_rate": 0.0001635483870967742, + "loss": 0.341, + "step": 141 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 0.8512839674949646, + "learning_rate": 0.00016258064516129034, + "loss": 0.3847, + "step": 142 + }, + { + "epoch": 4.576, + "grad_norm": 0.7487362027168274, + "learning_rate": 0.00016161290322580643, + "loss": 0.3694, + "step": 143 + }, + { + "epoch": 4.608, + "grad_norm": 0.7957774996757507, + "learning_rate": 0.00016064516129032257, + "loss": 0.3379, + "step": 144 + }, + { + "epoch": 4.64, + "grad_norm": 0.7299221754074097, + "learning_rate": 0.0001596774193548387, + "loss": 0.2989, + "step": 145 + }, + { + "epoch": 4.672, + "grad_norm": 0.7909884452819824, + "learning_rate": 0.00015870967741935484, + "loss": 0.3675, + "step": 146 + }, + { + "epoch": 4.704, + "grad_norm": 0.7321597933769226, + "learning_rate": 0.00015774193548387093, + "loss": 0.3243, + "step": 147 + }, + { + "epoch": 4.736, + "grad_norm": 0.7196181416511536, + "learning_rate": 0.00015677419354838708, + "loss": 0.2709, + "step": 148 + }, + { + "epoch": 4.768, + "grad_norm": 0.7918142676353455, + "learning_rate": 0.00015580645161290322, + "loss": 0.3934, + "step": 149 + }, + { + "epoch": 4.8, + "grad_norm": 0.8657622337341309, + "learning_rate": 
0.00015483870967741934, + "loss": 0.3583, + "step": 150 + }, + { + "epoch": 4.832, + "grad_norm": 0.8207722306251526, + "learning_rate": 0.00015387096774193549, + "loss": 0.412, + "step": 151 + }, + { + "epoch": 4.864, + "grad_norm": 0.7206109166145325, + "learning_rate": 0.00015290322580645158, + "loss": 0.3594, + "step": 152 + }, + { + "epoch": 4.896, + "grad_norm": 0.8529183864593506, + "learning_rate": 0.00015193548387096772, + "loss": 0.512, + "step": 153 + }, + { + "epoch": 4.928, + "grad_norm": 0.6895930171012878, + "learning_rate": 0.00015096774193548384, + "loss": 0.333, + "step": 154 + }, + { + "epoch": 4.96, + "grad_norm": 0.7422910332679749, + "learning_rate": 0.00015, + "loss": 0.2872, + "step": 155 + }, + { + "epoch": 4.992, + "grad_norm": 0.7366386651992798, + "learning_rate": 0.0001490322580645161, + "loss": 0.3415, + "step": 156 + }, + { + "epoch": 5.024, + "grad_norm": 2.1416280269622803, + "learning_rate": 0.00014806451612903225, + "loss": 0.9961, + "step": 157 + }, + { + "epoch": 5.056, + "grad_norm": 0.7944900393486023, + "learning_rate": 0.00014709677419354837, + "loss": 0.3372, + "step": 158 + }, + { + "epoch": 5.088, + "grad_norm": 0.7071006298065186, + "learning_rate": 0.00014612903225806452, + "loss": 0.2732, + "step": 159 + }, + { + "epoch": 5.12, + "grad_norm": 0.7874396443367004, + "learning_rate": 0.00014516129032258063, + "loss": 0.2861, + "step": 160 + }, + { + "epoch": 5.152, + "grad_norm": 0.8244249224662781, + "learning_rate": 0.00014419354838709675, + "loss": 0.3428, + "step": 161 + }, + { + "epoch": 5.184, + "grad_norm": 0.81637042760849, + "learning_rate": 0.0001432258064516129, + "loss": 0.3037, + "step": 162 + }, + { + "epoch": 5.216, + "grad_norm": 0.9916559457778931, + "learning_rate": 0.00014225806451612902, + "loss": 0.3337, + "step": 163 + }, + { + "epoch": 5.248, + "grad_norm": 0.9077599048614502, + "learning_rate": 0.00014129032258064514, + "loss": 0.287, + "step": 164 + }, + { + "epoch": 5.28, + "grad_norm": 0.9824132919311523, + "learning_rate": 0.00014032258064516128, + "loss": 0.3852, + "step": 165 + }, + { + "epoch": 5.312, + "grad_norm": 1.0016467571258545, + "learning_rate": 0.0001393548387096774, + "loss": 0.3234, + "step": 166 + }, + { + "epoch": 5.344, + "grad_norm": 0.8697543144226074, + "learning_rate": 0.00013838709677419355, + "loss": 0.2848, + "step": 167 + }, + { + "epoch": 5.376, + "grad_norm": 0.8214029669761658, + "learning_rate": 0.00013741935483870966, + "loss": 0.3377, + "step": 168 + }, + { + "epoch": 5.408, + "grad_norm": 0.9105691313743591, + "learning_rate": 0.00013645161290322578, + "loss": 0.2944, + "step": 169 + }, + { + "epoch": 5.44, + "grad_norm": 0.9642040133476257, + "learning_rate": 0.00013548387096774193, + "loss": 0.3624, + "step": 170 + }, + { + "epoch": 5.4719999999999995, + "grad_norm": 0.9218887686729431, + "learning_rate": 0.00013451612903225805, + "loss": 0.3938, + "step": 171 + }, + { + "epoch": 5.504, + "grad_norm": 0.8704710006713867, + "learning_rate": 0.0001335483870967742, + "loss": 0.3629, + "step": 172 + }, + { + "epoch": 5.536, + "grad_norm": 0.8207693099975586, + "learning_rate": 0.0001325806451612903, + "loss": 0.3169, + "step": 173 + }, + { + "epoch": 5.568, + "grad_norm": 0.9315701127052307, + "learning_rate": 0.00013161290322580643, + "loss": 0.429, + "step": 174 + }, + { + "epoch": 5.6, + "grad_norm": 0.860234260559082, + "learning_rate": 0.00013064516129032258, + "loss": 0.3842, + "step": 175 + }, + { + "epoch": 5.632, + "grad_norm": 0.8927604556083679, + "learning_rate": 
0.0001296774193548387, + "loss": 0.3405, + "step": 176 + }, + { + "epoch": 5.664, + "grad_norm": 0.8084587454795837, + "learning_rate": 0.0001287096774193548, + "loss": 0.306, + "step": 177 + }, + { + "epoch": 5.696, + "grad_norm": 0.9102941155433655, + "learning_rate": 0.00012774193548387096, + "loss": 0.3285, + "step": 178 + }, + { + "epoch": 5.728, + "grad_norm": 0.763113796710968, + "learning_rate": 0.0001267741935483871, + "loss": 0.2729, + "step": 179 + }, + { + "epoch": 5.76, + "grad_norm": 0.8704251646995544, + "learning_rate": 0.00012580645161290322, + "loss": 0.3164, + "step": 180 + }, + { + "epoch": 5.792, + "grad_norm": 0.9634932279586792, + "learning_rate": 0.00012483870967741934, + "loss": 0.2939, + "step": 181 + }, + { + "epoch": 5.824, + "grad_norm": 1.1567790508270264, + "learning_rate": 0.00012387096774193546, + "loss": 0.3076, + "step": 182 + }, + { + "epoch": 5.856, + "grad_norm": 0.9096764922142029, + "learning_rate": 0.0001229032258064516, + "loss": 0.3289, + "step": 183 + }, + { + "epoch": 5.888, + "grad_norm": 0.9840425848960876, + "learning_rate": 0.00012193548387096773, + "loss": 0.2772, + "step": 184 + }, + { + "epoch": 5.92, + "grad_norm": 0.725844144821167, + "learning_rate": 0.00012096774193548386, + "loss": 0.2151, + "step": 185 + }, + { + "epoch": 5.952, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.00011999999999999999, + "loss": 0.3825, + "step": 186 + }, + { + "epoch": 5.984, + "grad_norm": 0.8040199279785156, + "learning_rate": 0.00011903225806451612, + "loss": 0.2571, + "step": 187 + }, + { + "epoch": 6.016, + "grad_norm": 1.6932090520858765, + "learning_rate": 0.00011806451612903225, + "loss": 0.5538, + "step": 188 + }, + { + "epoch": 6.048, + "grad_norm": 0.744048535823822, + "learning_rate": 0.00011709677419354837, + "loss": 0.2335, + "step": 189 + }, + { + "epoch": 6.08, + "grad_norm": 0.6974924206733704, + "learning_rate": 0.0001161290322580645, + "loss": 0.2891, + "step": 190 + }, + { + "epoch": 6.112, + "grad_norm": 0.7202953696250916, + "learning_rate": 0.00011516129032258062, + "loss": 0.2017, + "step": 191 + }, + { + "epoch": 6.144, + "grad_norm": 0.8437547087669373, + "learning_rate": 0.00011419354838709676, + "loss": 0.2175, + "step": 192 + }, + { + "epoch": 6.176, + "grad_norm": 1.0741796493530273, + "learning_rate": 0.0001132258064516129, + "loss": 0.3913, + "step": 193 + }, + { + "epoch": 6.208, + "grad_norm": 1.031754493713379, + "learning_rate": 0.00011225806451612902, + "loss": 0.298, + "step": 194 + }, + { + "epoch": 6.24, + "grad_norm": 0.9575178027153015, + "learning_rate": 0.00011129032258064515, + "loss": 0.3201, + "step": 195 + }, + { + "epoch": 6.272, + "grad_norm": 0.9503082633018494, + "learning_rate": 0.00011032258064516128, + "loss": 0.2005, + "step": 196 + }, + { + "epoch": 6.304, + "grad_norm": 1.2572892904281616, + "learning_rate": 0.0001093548387096774, + "loss": 0.3045, + "step": 197 + }, + { + "epoch": 6.336, + "grad_norm": 1.5667368173599243, + "learning_rate": 0.00010838709677419353, + "loss": 0.4053, + "step": 198 + }, + { + "epoch": 6.368, + "grad_norm": 0.9439151883125305, + "learning_rate": 0.00010741935483870968, + "loss": 0.2721, + "step": 199 + }, + { + "epoch": 6.4, + "grad_norm": 1.0985567569732666, + "learning_rate": 0.0001064516129032258, + "loss": 0.2543, + "step": 200 + } + ], + "logging_steps": 1, + "max_steps": 310, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + 
"should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.7401861644288e+16, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..01723982396407692a903d785c60e57fcabfa0c4 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457c697b05fd5daa3c83df8920300c4940c26fb78ace5b5428b7c95d133a0ef4 +size 5560 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17729c873634bbfb1699f787a721c2f034265b15 --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ubuntu/Apps/DataInf/models/model +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b6984d46d6583eed5953a32eacf9f9ce36613d --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ubuntu/Apps/DataInf/models/model", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..de97f000d364be40f5e3bed6ed02c0568a7ecac9 --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3a386647e475c5ceec94d64bca14946af914e62ba8c1b0ccabaf6e67ee9cc86 +size 26235704 diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..50090940b46536e9b521d8cfbc701fef64ff0b46 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e6e1eb36f19824df0e3da6744f15a8782d02453f8d4fea63614aea6541336c8 +size 52563258 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..a856c2c2faa51d99c550c01069d0ab82afe65aa7 --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:459e5e31f82079d82419676e4070ee07546d1393dfcf3e2693f2cd031c775968 +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f39a59054b67b4ff6ad33f46a9b5fe89959580dd --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a2a684bbb13b2471d0a344d410a300b5697fcfde43c8620a62c450f9f491758 +size 1064 diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json new file mode 
100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-300/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-300/tokenizer.model b/checkpoint-300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca9f0b39df7b30b561a2070b66bf0059e2aa9c8 --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ae963cab66822a60d833cce9e668fc909b0f1186 --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,2133 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.6, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.032, + "grad_norm": 0.3297976851463318, + "learning_rate": 0.0002990322580645161, + "loss": 
1.0389, + "step": 1 + }, + { + "epoch": 0.064, + "grad_norm": 0.4069916307926178, + "learning_rate": 0.0002980645161290322, + "loss": 1.3377, + "step": 2 + }, + { + "epoch": 0.096, + "grad_norm": 0.42084500193595886, + "learning_rate": 0.00029709677419354836, + "loss": 0.9366, + "step": 3 + }, + { + "epoch": 0.128, + "grad_norm": 0.4641948938369751, + "learning_rate": 0.0002961290322580645, + "loss": 1.0086, + "step": 4 + }, + { + "epoch": 0.16, + "grad_norm": 0.3840750455856323, + "learning_rate": 0.00029516129032258065, + "loss": 0.8333, + "step": 5 + }, + { + "epoch": 0.192, + "grad_norm": 0.4263865053653717, + "learning_rate": 0.00029419354838709674, + "loss": 0.854, + "step": 6 + }, + { + "epoch": 0.224, + "grad_norm": 0.48615148663520813, + "learning_rate": 0.0002932258064516129, + "loss": 0.9548, + "step": 7 + }, + { + "epoch": 0.256, + "grad_norm": 0.44419369101524353, + "learning_rate": 0.00029225806451612903, + "loss": 0.8482, + "step": 8 + }, + { + "epoch": 0.288, + "grad_norm": 0.5317733883857727, + "learning_rate": 0.0002912903225806451, + "loss": 0.9426, + "step": 9 + }, + { + "epoch": 0.32, + "grad_norm": 0.47260937094688416, + "learning_rate": 0.00029032258064516127, + "loss": 0.9816, + "step": 10 + }, + { + "epoch": 0.352, + "grad_norm": 0.39063283801078796, + "learning_rate": 0.00028935483870967736, + "loss": 0.84, + "step": 11 + }, + { + "epoch": 0.384, + "grad_norm": 0.39234670996665955, + "learning_rate": 0.0002883870967741935, + "loss": 0.7476, + "step": 12 + }, + { + "epoch": 0.416, + "grad_norm": 0.40661805868148804, + "learning_rate": 0.00028741935483870965, + "loss": 0.9282, + "step": 13 + }, + { + "epoch": 0.448, + "grad_norm": 0.42970865964889526, + "learning_rate": 0.0002864516129032258, + "loss": 0.7858, + "step": 14 + }, + { + "epoch": 0.48, + "grad_norm": 0.3780193626880646, + "learning_rate": 0.00028548387096774194, + "loss": 0.7968, + "step": 15 + }, + { + "epoch": 0.512, + "grad_norm": 0.37006014585494995, + "learning_rate": 0.00028451612903225803, + "loss": 0.6801, + "step": 16 + }, + { + "epoch": 0.544, + "grad_norm": 0.3660840392112732, + "learning_rate": 0.0002835483870967742, + "loss": 0.5914, + "step": 17 + }, + { + "epoch": 0.576, + "grad_norm": 0.3270975351333618, + "learning_rate": 0.00028258064516129027, + "loss": 0.6449, + "step": 18 + }, + { + "epoch": 0.608, + "grad_norm": 0.3859024941921234, + "learning_rate": 0.0002816129032258064, + "loss": 0.8144, + "step": 19 + }, + { + "epoch": 0.64, + "grad_norm": 0.37092071771621704, + "learning_rate": 0.00028064516129032256, + "loss": 0.7667, + "step": 20 + }, + { + "epoch": 0.672, + "grad_norm": 0.37667015194892883, + "learning_rate": 0.0002796774193548387, + "loss": 0.7751, + "step": 21 + }, + { + "epoch": 0.704, + "grad_norm": 0.3832458555698395, + "learning_rate": 0.0002787096774193548, + "loss": 0.755, + "step": 22 + }, + { + "epoch": 0.736, + "grad_norm": 0.327288419008255, + "learning_rate": 0.00027774193548387095, + "loss": 0.7178, + "step": 23 + }, + { + "epoch": 0.768, + "grad_norm": 0.34552687406539917, + "learning_rate": 0.0002767741935483871, + "loss": 0.7057, + "step": 24 + }, + { + "epoch": 0.8, + "grad_norm": 0.3611259460449219, + "learning_rate": 0.0002758064516129032, + "loss": 0.8159, + "step": 25 + }, + { + "epoch": 0.832, + "grad_norm": 0.3345054090023041, + "learning_rate": 0.00027483870967741933, + "loss": 0.7208, + "step": 26 + }, + { + "epoch": 0.864, + "grad_norm": 0.3697254955768585, + "learning_rate": 0.0002738709677419355, + "loss": 0.8964, + "step": 27 + }, + { + 
"epoch": 0.896, + "grad_norm": 0.3905017375946045, + "learning_rate": 0.00027290322580645157, + "loss": 0.7794, + "step": 28 + }, + { + "epoch": 0.928, + "grad_norm": 0.3715725243091583, + "learning_rate": 0.0002719354838709677, + "loss": 0.6966, + "step": 29 + }, + { + "epoch": 0.96, + "grad_norm": 0.3650343120098114, + "learning_rate": 0.00027096774193548386, + "loss": 0.5761, + "step": 30 + }, + { + "epoch": 0.992, + "grad_norm": 0.33932459354400635, + "learning_rate": 0.00027, + "loss": 0.556, + "step": 31 + }, + { + "epoch": 1.024, + "grad_norm": 0.6371742486953735, + "learning_rate": 0.0002690322580645161, + "loss": 0.847, + "step": 32 + }, + { + "epoch": 1.056, + "grad_norm": 0.37499895691871643, + "learning_rate": 0.00026806451612903224, + "loss": 0.8419, + "step": 33 + }, + { + "epoch": 1.088, + "grad_norm": 0.33221954107284546, + "learning_rate": 0.0002670967741935484, + "loss": 0.6011, + "step": 34 + }, + { + "epoch": 1.12, + "grad_norm": 0.344096839427948, + "learning_rate": 0.0002661290322580645, + "loss": 0.6501, + "step": 35 + }, + { + "epoch": 1.152, + "grad_norm": 0.38429391384124756, + "learning_rate": 0.0002651612903225806, + "loss": 0.8091, + "step": 36 + }, + { + "epoch": 1.184, + "grad_norm": 0.38014867901802063, + "learning_rate": 0.00026419354838709677, + "loss": 0.7668, + "step": 37 + }, + { + "epoch": 1.216, + "grad_norm": 0.3352573812007904, + "learning_rate": 0.00026322580645161286, + "loss": 0.5444, + "step": 38 + }, + { + "epoch": 1.248, + "grad_norm": 0.33811062574386597, + "learning_rate": 0.000262258064516129, + "loss": 0.512, + "step": 39 + }, + { + "epoch": 1.28, + "grad_norm": 0.3998416066169739, + "learning_rate": 0.00026129032258064515, + "loss": 0.6315, + "step": 40 + }, + { + "epoch": 1.312, + "grad_norm": 0.3983341157436371, + "learning_rate": 0.0002603225806451613, + "loss": 0.5882, + "step": 41 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4585898816585541, + "learning_rate": 0.0002593548387096774, + "loss": 0.761, + "step": 42 + }, + { + "epoch": 1.376, + "grad_norm": 0.4080730080604553, + "learning_rate": 0.00025838709677419354, + "loss": 0.6716, + "step": 43 + }, + { + "epoch": 1.408, + "grad_norm": 0.4068273901939392, + "learning_rate": 0.0002574193548387096, + "loss": 0.6376, + "step": 44 + }, + { + "epoch": 1.44, + "grad_norm": 0.4406949579715729, + "learning_rate": 0.00025645161290322577, + "loss": 0.4594, + "step": 45 + }, + { + "epoch": 1.472, + "grad_norm": 0.34500986337661743, + "learning_rate": 0.0002554838709677419, + "loss": 0.3672, + "step": 46 + }, + { + "epoch": 1.504, + "grad_norm": 0.4760681390762329, + "learning_rate": 0.00025451612903225806, + "loss": 0.6331, + "step": 47 + }, + { + "epoch": 1.536, + "grad_norm": 0.39281558990478516, + "learning_rate": 0.0002535483870967742, + "loss": 0.5845, + "step": 48 + }, + { + "epoch": 1.568, + "grad_norm": 0.4265002906322479, + "learning_rate": 0.0002525806451612903, + "loss": 0.4461, + "step": 49 + }, + { + "epoch": 1.6, + "grad_norm": 0.40967294573783875, + "learning_rate": 0.00025161290322580645, + "loss": 0.7011, + "step": 50 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.4288088381290436, + "learning_rate": 0.00025064516129032254, + "loss": 0.6928, + "step": 51 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.4356289803981781, + "learning_rate": 0.0002496774193548387, + "loss": 0.7972, + "step": 52 + }, + { + "epoch": 1.696, + "grad_norm": 0.3827487826347351, + "learning_rate": 0.0002487096774193548, + "loss": 0.2991, + "step": 53 + }, + { + "epoch": 
1.728, + "grad_norm": 0.40093398094177246, + "learning_rate": 0.0002477419354838709, + "loss": 0.416, + "step": 54 + }, + { + "epoch": 1.76, + "grad_norm": 0.41548973321914673, + "learning_rate": 0.00024677419354838707, + "loss": 0.5501, + "step": 55 + }, + { + "epoch": 1.792, + "grad_norm": 0.4093388617038727, + "learning_rate": 0.0002458064516129032, + "loss": 0.5557, + "step": 56 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.3934040665626526, + "learning_rate": 0.00024483870967741936, + "loss": 0.602, + "step": 57 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.42221033573150635, + "learning_rate": 0.00024387096774193545, + "loss": 0.6421, + "step": 58 + }, + { + "epoch": 1.888, + "grad_norm": 0.4351339340209961, + "learning_rate": 0.0002429032258064516, + "loss": 0.5615, + "step": 59 + }, + { + "epoch": 1.92, + "grad_norm": 0.4319838881492615, + "learning_rate": 0.00024193548387096771, + "loss": 0.6804, + "step": 60 + }, + { + "epoch": 1.952, + "grad_norm": 0.40016525983810425, + "learning_rate": 0.00024096774193548386, + "loss": 0.5432, + "step": 61 + }, + { + "epoch": 1.984, + "grad_norm": 0.3905942440032959, + "learning_rate": 0.00023999999999999998, + "loss": 0.4187, + "step": 62 + }, + { + "epoch": 2.016, + "grad_norm": 0.8056382536888123, + "learning_rate": 0.0002390322580645161, + "loss": 1.0174, + "step": 63 + }, + { + "epoch": 2.048, + "grad_norm": 0.3835236430168152, + "learning_rate": 0.00023806451612903224, + "loss": 0.5992, + "step": 64 + }, + { + "epoch": 2.08, + "grad_norm": 0.41092216968536377, + "learning_rate": 0.00023709677419354836, + "loss": 0.4746, + "step": 65 + }, + { + "epoch": 2.112, + "grad_norm": 0.39536622166633606, + "learning_rate": 0.0002361290322580645, + "loss": 0.3946, + "step": 66 + }, + { + "epoch": 2.144, + "grad_norm": 0.3927665948867798, + "learning_rate": 0.0002351612903225806, + "loss": 0.5187, + "step": 67 + }, + { + "epoch": 2.176, + "grad_norm": 0.39792704582214355, + "learning_rate": 0.00023419354838709674, + "loss": 0.4568, + "step": 68 + }, + { + "epoch": 2.208, + "grad_norm": 0.5023652911186218, + "learning_rate": 0.0002332258064516129, + "loss": 0.6166, + "step": 69 + }, + { + "epoch": 2.24, + "grad_norm": 0.425017774105072, + "learning_rate": 0.000232258064516129, + "loss": 0.42, + "step": 70 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.46458110213279724, + "learning_rate": 0.00023129032258064516, + "loss": 0.4613, + "step": 71 + }, + { + "epoch": 2.304, + "grad_norm": 0.49037960171699524, + "learning_rate": 0.00023032258064516125, + "loss": 0.5509, + "step": 72 + }, + { + "epoch": 2.336, + "grad_norm": 0.5233697891235352, + "learning_rate": 0.0002293548387096774, + "loss": 0.6396, + "step": 73 + }, + { + "epoch": 2.368, + "grad_norm": 0.4720582962036133, + "learning_rate": 0.0002283870967741935, + "loss": 0.5076, + "step": 74 + }, + { + "epoch": 2.4, + "grad_norm": 0.4900650382041931, + "learning_rate": 0.00022741935483870966, + "loss": 0.4794, + "step": 75 + }, + { + "epoch": 2.432, + "grad_norm": 0.6321704983711243, + "learning_rate": 0.0002264516129032258, + "loss": 0.6677, + "step": 76 + }, + { + "epoch": 2.464, + "grad_norm": 0.5305324792861938, + "learning_rate": 0.00022548387096774192, + "loss": 0.5102, + "step": 77 + }, + { + "epoch": 2.496, + "grad_norm": 0.5799248218536377, + "learning_rate": 0.00022451612903225804, + "loss": 0.5274, + "step": 78 + }, + { + "epoch": 2.528, + "grad_norm": 0.4990101456642151, + "learning_rate": 0.00022354838709677416, + "loss": 0.5407, + "step": 79 + }, + { + 
"epoch": 2.56, + "grad_norm": 0.4779827296733856, + "learning_rate": 0.0002225806451612903, + "loss": 0.5166, + "step": 80 + }, + { + "epoch": 2.592, + "grad_norm": 0.5140111446380615, + "learning_rate": 0.00022161290322580645, + "loss": 0.3288, + "step": 81 + }, + { + "epoch": 2.624, + "grad_norm": 0.5674853920936584, + "learning_rate": 0.00022064516129032257, + "loss": 0.666, + "step": 82 + }, + { + "epoch": 2.656, + "grad_norm": 0.5277597308158875, + "learning_rate": 0.00021967741935483871, + "loss": 0.5335, + "step": 83 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.6029439568519592, + "learning_rate": 0.0002187096774193548, + "loss": 0.693, + "step": 84 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.5039327144622803, + "learning_rate": 0.00021774193548387095, + "loss": 0.5728, + "step": 85 + }, + { + "epoch": 2.752, + "grad_norm": 0.5564692616462708, + "learning_rate": 0.00021677419354838707, + "loss": 0.4734, + "step": 86 + }, + { + "epoch": 2.784, + "grad_norm": 0.5278319120407104, + "learning_rate": 0.00021580645161290322, + "loss": 0.5834, + "step": 87 + }, + { + "epoch": 2.816, + "grad_norm": 0.5445135831832886, + "learning_rate": 0.00021483870967741936, + "loss": 0.4642, + "step": 88 + }, + { + "epoch": 2.848, + "grad_norm": 0.5394749045372009, + "learning_rate": 0.00021387096774193545, + "loss": 0.4779, + "step": 89 + }, + { + "epoch": 2.88, + "grad_norm": 0.5756134390830994, + "learning_rate": 0.0002129032258064516, + "loss": 0.5607, + "step": 90 + }, + { + "epoch": 2.912, + "grad_norm": 0.48361241817474365, + "learning_rate": 0.00021193548387096772, + "loss": 0.4278, + "step": 91 + }, + { + "epoch": 2.944, + "grad_norm": 0.5017121434211731, + "learning_rate": 0.00021096774193548386, + "loss": 0.4834, + "step": 92 + }, + { + "epoch": 2.976, + "grad_norm": 0.4741989076137543, + "learning_rate": 0.00020999999999999998, + "loss": 0.468, + "step": 93 + }, + { + "epoch": 3.008, + "grad_norm": 1.003368854522705, + "learning_rate": 0.0002090322580645161, + "loss": 0.8614, + "step": 94 + }, + { + "epoch": 3.04, + "grad_norm": 0.4782228469848633, + "learning_rate": 0.00020806451612903225, + "loss": 0.4111, + "step": 95 + }, + { + "epoch": 3.072, + "grad_norm": 0.4558674395084381, + "learning_rate": 0.00020709677419354836, + "loss": 0.3463, + "step": 96 + }, + { + "epoch": 3.104, + "grad_norm": 0.4409371316432953, + "learning_rate": 0.0002061290322580645, + "loss": 0.2571, + "step": 97 + }, + { + "epoch": 3.136, + "grad_norm": 0.5415034890174866, + "learning_rate": 0.00020516129032258063, + "loss": 0.5707, + "step": 98 + }, + { + "epoch": 3.168, + "grad_norm": 0.6157724857330322, + "learning_rate": 0.00020419354838709677, + "loss": 0.5692, + "step": 99 + }, + { + "epoch": 3.2, + "grad_norm": 0.4855688810348511, + "learning_rate": 0.00020322580645161287, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 3.232, + "grad_norm": 0.569878101348877, + "learning_rate": 0.000202258064516129, + "loss": 0.4707, + "step": 101 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 0.645232081413269, + "learning_rate": 0.00020129032258064516, + "loss": 0.5504, + "step": 102 + }, + { + "epoch": 3.296, + "grad_norm": 0.5775763392448425, + "learning_rate": 0.00020032258064516128, + "loss": 0.3651, + "step": 103 + }, + { + "epoch": 3.328, + "grad_norm": 0.5808250904083252, + "learning_rate": 0.00019935483870967742, + "loss": 0.5068, + "step": 104 + }, + { + "epoch": 3.36, + "grad_norm": 0.689313530921936, + "learning_rate": 0.0001983870967741935, + "loss": 0.4936, + "step": 105 + }, 
+ { + "epoch": 3.392, + "grad_norm": 0.6571519374847412, + "learning_rate": 0.00019741935483870966, + "loss": 0.3671, + "step": 106 + }, + { + "epoch": 3.424, + "grad_norm": 0.6340517401695251, + "learning_rate": 0.00019645161290322578, + "loss": 0.4783, + "step": 107 + }, + { + "epoch": 3.456, + "grad_norm": 0.7031407952308655, + "learning_rate": 0.00019548387096774192, + "loss": 0.427, + "step": 108 + }, + { + "epoch": 3.488, + "grad_norm": 0.728496789932251, + "learning_rate": 0.00019451612903225807, + "loss": 0.5497, + "step": 109 + }, + { + "epoch": 3.52, + "grad_norm": 0.6106727719306946, + "learning_rate": 0.00019354838709677416, + "loss": 0.392, + "step": 110 + }, + { + "epoch": 3.552, + "grad_norm": 0.5296047329902649, + "learning_rate": 0.0001925806451612903, + "loss": 0.3412, + "step": 111 + }, + { + "epoch": 3.584, + "grad_norm": 0.6282025575637817, + "learning_rate": 0.00019161290322580643, + "loss": 0.4081, + "step": 112 + }, + { + "epoch": 3.616, + "grad_norm": 0.6166461110115051, + "learning_rate": 0.00019064516129032257, + "loss": 0.4771, + "step": 113 + }, + { + "epoch": 3.648, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0001896774193548387, + "loss": 0.404, + "step": 114 + }, + { + "epoch": 3.68, + "grad_norm": 0.6598389148712158, + "learning_rate": 0.0001887096774193548, + "loss": 0.3915, + "step": 115 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 0.5567564368247986, + "learning_rate": 0.00018774193548387095, + "loss": 0.3862, + "step": 116 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 0.6524521708488464, + "learning_rate": 0.00018677419354838707, + "loss": 0.5315, + "step": 117 + }, + { + "epoch": 3.776, + "grad_norm": 0.7040128707885742, + "learning_rate": 0.00018580645161290322, + "loss": 0.5387, + "step": 118 + }, + { + "epoch": 3.808, + "grad_norm": 0.690262496471405, + "learning_rate": 0.00018483870967741934, + "loss": 0.4877, + "step": 119 + }, + { + "epoch": 3.84, + "grad_norm": 0.6928034424781799, + "learning_rate": 0.00018387096774193548, + "loss": 0.4895, + "step": 120 + }, + { + "epoch": 3.872, + "grad_norm": 0.7148469686508179, + "learning_rate": 0.00018290322580645157, + "loss": 0.4814, + "step": 121 + }, + { + "epoch": 3.904, + "grad_norm": 0.6096572875976562, + "learning_rate": 0.00018193548387096772, + "loss": 0.3403, + "step": 122 + }, + { + "epoch": 3.936, + "grad_norm": 0.7132399678230286, + "learning_rate": 0.00018096774193548387, + "loss": 0.4258, + "step": 123 + }, + { + "epoch": 3.968, + "grad_norm": 0.7302684187889099, + "learning_rate": 0.00017999999999999998, + "loss": 0.7215, + "step": 124 + }, + { + "epoch": 4.0, + "grad_norm": 1.5244004726409912, + "learning_rate": 0.00017903225806451613, + "loss": 0.8544, + "step": 125 + }, + { + "epoch": 4.032, + "grad_norm": 0.6032777428627014, + "learning_rate": 0.00017806451612903222, + "loss": 0.4183, + "step": 126 + }, + { + "epoch": 4.064, + "grad_norm": 0.6349691152572632, + "learning_rate": 0.00017709677419354837, + "loss": 0.5871, + "step": 127 + }, + { + "epoch": 4.096, + "grad_norm": 0.5730060935020447, + "learning_rate": 0.00017612903225806449, + "loss": 0.3786, + "step": 128 + }, + { + "epoch": 4.128, + "grad_norm": 0.6988044381141663, + "learning_rate": 0.00017516129032258063, + "loss": 0.3216, + "step": 129 + }, + { + "epoch": 4.16, + "grad_norm": 0.7379153370857239, + "learning_rate": 0.00017419354838709678, + "loss": 0.4026, + "step": 130 + }, + { + "epoch": 4.192, + "grad_norm": 0.7058238983154297, + "learning_rate": 0.00017322580645161287, + "loss": 0.4328, 
+ "step": 131 + }, + { + "epoch": 4.224, + "grad_norm": 0.80663001537323, + "learning_rate": 0.00017225806451612901, + "loss": 0.3849, + "step": 132 + }, + { + "epoch": 4.256, + "grad_norm": 0.899818480014801, + "learning_rate": 0.00017129032258064513, + "loss": 0.4191, + "step": 133 + }, + { + "epoch": 4.288, + "grad_norm": 0.8538224697113037, + "learning_rate": 0.00017032258064516128, + "loss": 0.3587, + "step": 134 + }, + { + "epoch": 4.32, + "grad_norm": 0.8948169350624084, + "learning_rate": 0.00016935483870967742, + "loss": 0.3957, + "step": 135 + }, + { + "epoch": 4.352, + "grad_norm": 0.7195591926574707, + "learning_rate": 0.00016838709677419354, + "loss": 0.3361, + "step": 136 + }, + { + "epoch": 4.384, + "grad_norm": 0.7769681215286255, + "learning_rate": 0.00016741935483870966, + "loss": 0.3519, + "step": 137 + }, + { + "epoch": 4.416, + "grad_norm": 0.9509867429733276, + "learning_rate": 0.00016645161290322578, + "loss": 0.4216, + "step": 138 + }, + { + "epoch": 4.448, + "grad_norm": 0.7923309206962585, + "learning_rate": 0.00016548387096774193, + "loss": 0.3999, + "step": 139 + }, + { + "epoch": 4.48, + "grad_norm": 0.8961685299873352, + "learning_rate": 0.00016451612903225804, + "loss": 0.5385, + "step": 140 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.7496562004089355, + "learning_rate": 0.0001635483870967742, + "loss": 0.341, + "step": 141 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 0.8512839674949646, + "learning_rate": 0.00016258064516129034, + "loss": 0.3847, + "step": 142 + }, + { + "epoch": 4.576, + "grad_norm": 0.7487362027168274, + "learning_rate": 0.00016161290322580643, + "loss": 0.3694, + "step": 143 + }, + { + "epoch": 4.608, + "grad_norm": 0.7957774996757507, + "learning_rate": 0.00016064516129032257, + "loss": 0.3379, + "step": 144 + }, + { + "epoch": 4.64, + "grad_norm": 0.7299221754074097, + "learning_rate": 0.0001596774193548387, + "loss": 0.2989, + "step": 145 + }, + { + "epoch": 4.672, + "grad_norm": 0.7909884452819824, + "learning_rate": 0.00015870967741935484, + "loss": 0.3675, + "step": 146 + }, + { + "epoch": 4.704, + "grad_norm": 0.7321597933769226, + "learning_rate": 0.00015774193548387093, + "loss": 0.3243, + "step": 147 + }, + { + "epoch": 4.736, + "grad_norm": 0.7196181416511536, + "learning_rate": 0.00015677419354838708, + "loss": 0.2709, + "step": 148 + }, + { + "epoch": 4.768, + "grad_norm": 0.7918142676353455, + "learning_rate": 0.00015580645161290322, + "loss": 0.3934, + "step": 149 + }, + { + "epoch": 4.8, + "grad_norm": 0.8657622337341309, + "learning_rate": 0.00015483870967741934, + "loss": 0.3583, + "step": 150 + }, + { + "epoch": 4.832, + "grad_norm": 0.8207722306251526, + "learning_rate": 0.00015387096774193549, + "loss": 0.412, + "step": 151 + }, + { + "epoch": 4.864, + "grad_norm": 0.7206109166145325, + "learning_rate": 0.00015290322580645158, + "loss": 0.3594, + "step": 152 + }, + { + "epoch": 4.896, + "grad_norm": 0.8529183864593506, + "learning_rate": 0.00015193548387096772, + "loss": 0.512, + "step": 153 + }, + { + "epoch": 4.928, + "grad_norm": 0.6895930171012878, + "learning_rate": 0.00015096774193548384, + "loss": 0.333, + "step": 154 + }, + { + "epoch": 4.96, + "grad_norm": 0.7422910332679749, + "learning_rate": 0.00015, + "loss": 0.2872, + "step": 155 + }, + { + "epoch": 4.992, + "grad_norm": 0.7366386651992798, + "learning_rate": 0.0001490322580645161, + "loss": 0.3415, + "step": 156 + }, + { + "epoch": 5.024, + "grad_norm": 2.1416280269622803, + "learning_rate": 0.00014806451612903225, + "loss": 
0.9961, + "step": 157 + }, + { + "epoch": 5.056, + "grad_norm": 0.7944900393486023, + "learning_rate": 0.00014709677419354837, + "loss": 0.3372, + "step": 158 + }, + { + "epoch": 5.088, + "grad_norm": 0.7071006298065186, + "learning_rate": 0.00014612903225806452, + "loss": 0.2732, + "step": 159 + }, + { + "epoch": 5.12, + "grad_norm": 0.7874396443367004, + "learning_rate": 0.00014516129032258063, + "loss": 0.2861, + "step": 160 + }, + { + "epoch": 5.152, + "grad_norm": 0.8244249224662781, + "learning_rate": 0.00014419354838709675, + "loss": 0.3428, + "step": 161 + }, + { + "epoch": 5.184, + "grad_norm": 0.81637042760849, + "learning_rate": 0.0001432258064516129, + "loss": 0.3037, + "step": 162 + }, + { + "epoch": 5.216, + "grad_norm": 0.9916559457778931, + "learning_rate": 0.00014225806451612902, + "loss": 0.3337, + "step": 163 + }, + { + "epoch": 5.248, + "grad_norm": 0.9077599048614502, + "learning_rate": 0.00014129032258064514, + "loss": 0.287, + "step": 164 + }, + { + "epoch": 5.28, + "grad_norm": 0.9824132919311523, + "learning_rate": 0.00014032258064516128, + "loss": 0.3852, + "step": 165 + }, + { + "epoch": 5.312, + "grad_norm": 1.0016467571258545, + "learning_rate": 0.0001393548387096774, + "loss": 0.3234, + "step": 166 + }, + { + "epoch": 5.344, + "grad_norm": 0.8697543144226074, + "learning_rate": 0.00013838709677419355, + "loss": 0.2848, + "step": 167 + }, + { + "epoch": 5.376, + "grad_norm": 0.8214029669761658, + "learning_rate": 0.00013741935483870966, + "loss": 0.3377, + "step": 168 + }, + { + "epoch": 5.408, + "grad_norm": 0.9105691313743591, + "learning_rate": 0.00013645161290322578, + "loss": 0.2944, + "step": 169 + }, + { + "epoch": 5.44, + "grad_norm": 0.9642040133476257, + "learning_rate": 0.00013548387096774193, + "loss": 0.3624, + "step": 170 + }, + { + "epoch": 5.4719999999999995, + "grad_norm": 0.9218887686729431, + "learning_rate": 0.00013451612903225805, + "loss": 0.3938, + "step": 171 + }, + { + "epoch": 5.504, + "grad_norm": 0.8704710006713867, + "learning_rate": 0.0001335483870967742, + "loss": 0.3629, + "step": 172 + }, + { + "epoch": 5.536, + "grad_norm": 0.8207693099975586, + "learning_rate": 0.0001325806451612903, + "loss": 0.3169, + "step": 173 + }, + { + "epoch": 5.568, + "grad_norm": 0.9315701127052307, + "learning_rate": 0.00013161290322580643, + "loss": 0.429, + "step": 174 + }, + { + "epoch": 5.6, + "grad_norm": 0.860234260559082, + "learning_rate": 0.00013064516129032258, + "loss": 0.3842, + "step": 175 + }, + { + "epoch": 5.632, + "grad_norm": 0.8927604556083679, + "learning_rate": 0.0001296774193548387, + "loss": 0.3405, + "step": 176 + }, + { + "epoch": 5.664, + "grad_norm": 0.8084587454795837, + "learning_rate": 0.0001287096774193548, + "loss": 0.306, + "step": 177 + }, + { + "epoch": 5.696, + "grad_norm": 0.9102941155433655, + "learning_rate": 0.00012774193548387096, + "loss": 0.3285, + "step": 178 + }, + { + "epoch": 5.728, + "grad_norm": 0.763113796710968, + "learning_rate": 0.0001267741935483871, + "loss": 0.2729, + "step": 179 + }, + { + "epoch": 5.76, + "grad_norm": 0.8704251646995544, + "learning_rate": 0.00012580645161290322, + "loss": 0.3164, + "step": 180 + }, + { + "epoch": 5.792, + "grad_norm": 0.9634932279586792, + "learning_rate": 0.00012483870967741934, + "loss": 0.2939, + "step": 181 + }, + { + "epoch": 5.824, + "grad_norm": 1.1567790508270264, + "learning_rate": 0.00012387096774193546, + "loss": 0.3076, + "step": 182 + }, + { + "epoch": 5.856, + "grad_norm": 0.9096764922142029, + "learning_rate": 0.0001229032258064516, + "loss": 
0.3289, + "step": 183 + }, + { + "epoch": 5.888, + "grad_norm": 0.9840425848960876, + "learning_rate": 0.00012193548387096773, + "loss": 0.2772, + "step": 184 + }, + { + "epoch": 5.92, + "grad_norm": 0.725844144821167, + "learning_rate": 0.00012096774193548386, + "loss": 0.2151, + "step": 185 + }, + { + "epoch": 5.952, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.00011999999999999999, + "loss": 0.3825, + "step": 186 + }, + { + "epoch": 5.984, + "grad_norm": 0.8040199279785156, + "learning_rate": 0.00011903225806451612, + "loss": 0.2571, + "step": 187 + }, + { + "epoch": 6.016, + "grad_norm": 1.6932090520858765, + "learning_rate": 0.00011806451612903225, + "loss": 0.5538, + "step": 188 + }, + { + "epoch": 6.048, + "grad_norm": 0.744048535823822, + "learning_rate": 0.00011709677419354837, + "loss": 0.2335, + "step": 189 + }, + { + "epoch": 6.08, + "grad_norm": 0.6974924206733704, + "learning_rate": 0.0001161290322580645, + "loss": 0.2891, + "step": 190 + }, + { + "epoch": 6.112, + "grad_norm": 0.7202953696250916, + "learning_rate": 0.00011516129032258062, + "loss": 0.2017, + "step": 191 + }, + { + "epoch": 6.144, + "grad_norm": 0.8437547087669373, + "learning_rate": 0.00011419354838709676, + "loss": 0.2175, + "step": 192 + }, + { + "epoch": 6.176, + "grad_norm": 1.0741796493530273, + "learning_rate": 0.0001132258064516129, + "loss": 0.3913, + "step": 193 + }, + { + "epoch": 6.208, + "grad_norm": 1.031754493713379, + "learning_rate": 0.00011225806451612902, + "loss": 0.298, + "step": 194 + }, + { + "epoch": 6.24, + "grad_norm": 0.9575178027153015, + "learning_rate": 0.00011129032258064515, + "loss": 0.3201, + "step": 195 + }, + { + "epoch": 6.272, + "grad_norm": 0.9503082633018494, + "learning_rate": 0.00011032258064516128, + "loss": 0.2005, + "step": 196 + }, + { + "epoch": 6.304, + "grad_norm": 1.2572892904281616, + "learning_rate": 0.0001093548387096774, + "loss": 0.3045, + "step": 197 + }, + { + "epoch": 6.336, + "grad_norm": 1.5667368173599243, + "learning_rate": 0.00010838709677419353, + "loss": 0.4053, + "step": 198 + }, + { + "epoch": 6.368, + "grad_norm": 0.9439151883125305, + "learning_rate": 0.00010741935483870968, + "loss": 0.2721, + "step": 199 + }, + { + "epoch": 6.4, + "grad_norm": 1.0985567569732666, + "learning_rate": 0.0001064516129032258, + "loss": 0.2543, + "step": 200 + }, + { + "epoch": 6.432, + "grad_norm": 0.789880633354187, + "learning_rate": 0.00010548387096774193, + "loss": 0.2148, + "step": 201 + }, + { + "epoch": 6.464, + "grad_norm": 0.9937541484832764, + "learning_rate": 0.00010451612903225805, + "loss": 0.2343, + "step": 202 + }, + { + "epoch": 6.496, + "grad_norm": 0.9496509432792664, + "learning_rate": 0.00010354838709677418, + "loss": 0.2576, + "step": 203 + }, + { + "epoch": 6.5280000000000005, + "grad_norm": 0.9214590191841125, + "learning_rate": 0.00010258064516129031, + "loss": 0.3067, + "step": 204 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 0.8984239101409912, + "learning_rate": 0.00010161290322580643, + "loss": 0.2471, + "step": 205 + }, + { + "epoch": 6.592, + "grad_norm": 0.8055192232131958, + "learning_rate": 0.00010064516129032258, + "loss": 0.2234, + "step": 206 + }, + { + "epoch": 6.624, + "grad_norm": 0.769008219242096, + "learning_rate": 9.967741935483871e-05, + "loss": 0.1963, + "step": 207 + }, + { + "epoch": 6.656, + "grad_norm": 0.7947174310684204, + "learning_rate": 9.870967741935483e-05, + "loss": 0.2165, + "step": 208 + }, + { + "epoch": 6.688, + "grad_norm": 1.0192420482635498, + "learning_rate": 
9.774193548387096e-05, + "loss": 0.2581, + "step": 209 + }, + { + "epoch": 6.72, + "grad_norm": 1.0067439079284668, + "learning_rate": 9.677419354838708e-05, + "loss": 0.2394, + "step": 210 + }, + { + "epoch": 6.752, + "grad_norm": 1.0539058446884155, + "learning_rate": 9.580645161290321e-05, + "loss": 0.2526, + "step": 211 + }, + { + "epoch": 6.784, + "grad_norm": 1.130011796951294, + "learning_rate": 9.483870967741934e-05, + "loss": 0.3339, + "step": 212 + }, + { + "epoch": 6.816, + "grad_norm": 0.9603860378265381, + "learning_rate": 9.387096774193548e-05, + "loss": 0.2808, + "step": 213 + }, + { + "epoch": 6.848, + "grad_norm": 1.0667173862457275, + "learning_rate": 9.290322580645161e-05, + "loss": 0.3025, + "step": 214 + }, + { + "epoch": 6.88, + "grad_norm": 0.9093402624130249, + "learning_rate": 9.193548387096774e-05, + "loss": 0.2698, + "step": 215 + }, + { + "epoch": 6.912, + "grad_norm": 0.8621392846107483, + "learning_rate": 9.096774193548386e-05, + "loss": 0.2259, + "step": 216 + }, + { + "epoch": 6.944, + "grad_norm": 1.035175085067749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.3156, + "step": 217 + }, + { + "epoch": 6.976, + "grad_norm": 1.0241689682006836, + "learning_rate": 8.903225806451611e-05, + "loss": 0.2723, + "step": 218 + }, + { + "epoch": 7.008, + "grad_norm": 1.735946536064148, + "learning_rate": 8.806451612903224e-05, + "loss": 0.411, + "step": 219 + }, + { + "epoch": 7.04, + "grad_norm": 0.8678178191184998, + "learning_rate": 8.709677419354839e-05, + "loss": 0.2415, + "step": 220 + }, + { + "epoch": 7.072, + "grad_norm": 0.7134645581245422, + "learning_rate": 8.612903225806451e-05, + "loss": 0.1509, + "step": 221 + }, + { + "epoch": 7.104, + "grad_norm": 0.8543497920036316, + "learning_rate": 8.516129032258064e-05, + "loss": 0.2459, + "step": 222 + }, + { + "epoch": 7.136, + "grad_norm": 0.9644029140472412, + "learning_rate": 8.419354838709677e-05, + "loss": 0.2828, + "step": 223 + }, + { + "epoch": 7.168, + "grad_norm": 0.8568740487098694, + "learning_rate": 8.322580645161289e-05, + "loss": 0.1936, + "step": 224 + }, + { + "epoch": 7.2, + "grad_norm": 1.005867600440979, + "learning_rate": 8.225806451612902e-05, + "loss": 0.2678, + "step": 225 + }, + { + "epoch": 7.232, + "grad_norm": 0.9942033290863037, + "learning_rate": 8.129032258064517e-05, + "loss": 0.2111, + "step": 226 + }, + { + "epoch": 7.264, + "grad_norm": 0.9886007905006409, + "learning_rate": 8.032258064516129e-05, + "loss": 0.2375, + "step": 227 + }, + { + "epoch": 7.296, + "grad_norm": 1.0586844682693481, + "learning_rate": 7.935483870967742e-05, + "loss": 0.2385, + "step": 228 + }, + { + "epoch": 7.328, + "grad_norm": 1.026432991027832, + "learning_rate": 7.838709677419354e-05, + "loss": 0.2139, + "step": 229 + }, + { + "epoch": 7.36, + "grad_norm": 1.0039665699005127, + "learning_rate": 7.741935483870967e-05, + "loss": 0.2211, + "step": 230 + }, + { + "epoch": 7.392, + "grad_norm": 1.1125057935714722, + "learning_rate": 7.645161290322579e-05, + "loss": 0.2725, + "step": 231 + }, + { + "epoch": 7.424, + "grad_norm": 0.9078079462051392, + "learning_rate": 7.548387096774192e-05, + "loss": 0.1965, + "step": 232 + }, + { + "epoch": 7.456, + "grad_norm": 0.8247030377388, + "learning_rate": 7.451612903225805e-05, + "loss": 0.1502, + "step": 233 + }, + { + "epoch": 7.4879999999999995, + "grad_norm": 1.1396474838256836, + "learning_rate": 7.354838709677418e-05, + "loss": 0.37, + "step": 234 + }, + { + "epoch": 7.52, + "grad_norm": 0.753663182258606, + "learning_rate": 7.258064516129032e-05, + 
"loss": 0.1627, + "step": 235 + }, + { + "epoch": 7.552, + "grad_norm": 0.7927701473236084, + "learning_rate": 7.161290322580645e-05, + "loss": 0.1684, + "step": 236 + }, + { + "epoch": 7.584, + "grad_norm": 0.9258756637573242, + "learning_rate": 7.064516129032257e-05, + "loss": 0.213, + "step": 237 + }, + { + "epoch": 7.616, + "grad_norm": 0.8111560940742493, + "learning_rate": 6.96774193548387e-05, + "loss": 0.1998, + "step": 238 + }, + { + "epoch": 7.648, + "grad_norm": 0.8484370708465576, + "learning_rate": 6.870967741935483e-05, + "loss": 0.1307, + "step": 239 + }, + { + "epoch": 7.68, + "grad_norm": 0.9123087525367737, + "learning_rate": 6.774193548387096e-05, + "loss": 0.2529, + "step": 240 + }, + { + "epoch": 7.712, + "grad_norm": 1.0526336431503296, + "learning_rate": 6.67741935483871e-05, + "loss": 0.2468, + "step": 241 + }, + { + "epoch": 7.744, + "grad_norm": 1.0104210376739502, + "learning_rate": 6.580645161290322e-05, + "loss": 0.23, + "step": 242 + }, + { + "epoch": 7.776, + "grad_norm": 0.8749745488166809, + "learning_rate": 6.483870967741935e-05, + "loss": 0.1973, + "step": 243 + }, + { + "epoch": 7.808, + "grad_norm": 0.9921355247497559, + "learning_rate": 6.387096774193548e-05, + "loss": 0.2144, + "step": 244 + }, + { + "epoch": 7.84, + "grad_norm": 0.8243810534477234, + "learning_rate": 6.290322580645161e-05, + "loss": 0.1531, + "step": 245 + }, + { + "epoch": 7.872, + "grad_norm": 1.0764353275299072, + "learning_rate": 6.193548387096773e-05, + "loss": 0.2763, + "step": 246 + }, + { + "epoch": 7.904, + "grad_norm": 1.1754212379455566, + "learning_rate": 6.096774193548386e-05, + "loss": 0.2249, + "step": 247 + }, + { + "epoch": 7.936, + "grad_norm": 0.8588422536849976, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.1782, + "step": 248 + }, + { + "epoch": 7.968, + "grad_norm": 1.045143961906433, + "learning_rate": 5.903225806451613e-05, + "loss": 0.2789, + "step": 249 + }, + { + "epoch": 8.0, + "grad_norm": 1.9824038743972778, + "learning_rate": 5.806451612903225e-05, + "loss": 0.3057, + "step": 250 + }, + { + "epoch": 8.032, + "grad_norm": 0.9252362847328186, + "learning_rate": 5.709677419354838e-05, + "loss": 0.2221, + "step": 251 + }, + { + "epoch": 8.064, + "grad_norm": 0.8381021022796631, + "learning_rate": 5.612903225806451e-05, + "loss": 0.2639, + "step": 252 + }, + { + "epoch": 8.096, + "grad_norm": 0.9777012467384338, + "learning_rate": 5.516129032258064e-05, + "loss": 0.1533, + "step": 253 + }, + { + "epoch": 8.128, + "grad_norm": 0.8053516745567322, + "learning_rate": 5.419354838709677e-05, + "loss": 0.1883, + "step": 254 + }, + { + "epoch": 8.16, + "grad_norm": 0.8703336119651794, + "learning_rate": 5.32258064516129e-05, + "loss": 0.2079, + "step": 255 + }, + { + "epoch": 8.192, + "grad_norm": 0.8113718032836914, + "learning_rate": 5.2258064516129025e-05, + "loss": 0.1609, + "step": 256 + }, + { + "epoch": 8.224, + "grad_norm": 1.0667418241500854, + "learning_rate": 5.129032258064516e-05, + "loss": 0.2544, + "step": 257 + }, + { + "epoch": 8.256, + "grad_norm": 0.7853135466575623, + "learning_rate": 5.032258064516129e-05, + "loss": 0.1391, + "step": 258 + }, + { + "epoch": 8.288, + "grad_norm": 0.9970865845680237, + "learning_rate": 4.9354838709677415e-05, + "loss": 0.2305, + "step": 259 + }, + { + "epoch": 8.32, + "grad_norm": 12.063047409057617, + "learning_rate": 4.838709677419354e-05, + "loss": 0.189, + "step": 260 + }, + { + "epoch": 8.352, + "grad_norm": 1.2325772047042847, + "learning_rate": 4.741935483870967e-05, + "loss": 0.2308, + "step": 261 
+ }, + { + "epoch": 8.384, + "grad_norm": 1.1118851900100708, + "learning_rate": 4.6451612903225805e-05, + "loss": 0.2009, + "step": 262 + }, + { + "epoch": 8.416, + "grad_norm": 1.0783390998840332, + "learning_rate": 4.548387096774193e-05, + "loss": 0.2276, + "step": 263 + }, + { + "epoch": 8.448, + "grad_norm": 1.2127933502197266, + "learning_rate": 4.4516129032258055e-05, + "loss": 0.2046, + "step": 264 + }, + { + "epoch": 8.48, + "grad_norm": 1.1135843992233276, + "learning_rate": 4.3548387096774194e-05, + "loss": 0.1791, + "step": 265 + }, + { + "epoch": 8.512, + "grad_norm": 0.8666661381721497, + "learning_rate": 4.258064516129032e-05, + "loss": 0.1287, + "step": 266 + }, + { + "epoch": 8.544, + "grad_norm": 0.8430101275444031, + "learning_rate": 4.1612903225806445e-05, + "loss": 0.1475, + "step": 267 + }, + { + "epoch": 8.576, + "grad_norm": 0.7744110822677612, + "learning_rate": 4.0645161290322584e-05, + "loss": 0.1458, + "step": 268 + }, + { + "epoch": 8.608, + "grad_norm": 1.4067776203155518, + "learning_rate": 3.967741935483871e-05, + "loss": 0.2189, + "step": 269 + }, + { + "epoch": 8.64, + "grad_norm": 0.8347670435905457, + "learning_rate": 3.8709677419354835e-05, + "loss": 0.1602, + "step": 270 + }, + { + "epoch": 8.672, + "grad_norm": 0.7643276453018188, + "learning_rate": 3.774193548387096e-05, + "loss": 0.1363, + "step": 271 + }, + { + "epoch": 8.704, + "grad_norm": 0.898059606552124, + "learning_rate": 3.677419354838709e-05, + "loss": 0.156, + "step": 272 + }, + { + "epoch": 8.736, + "grad_norm": 0.8416333198547363, + "learning_rate": 3.5806451612903225e-05, + "loss": 0.1754, + "step": 273 + }, + { + "epoch": 8.768, + "grad_norm": 0.8691906929016113, + "learning_rate": 3.483870967741935e-05, + "loss": 0.1808, + "step": 274 + }, + { + "epoch": 8.8, + "grad_norm": 1.062111496925354, + "learning_rate": 3.387096774193548e-05, + "loss": 0.2559, + "step": 275 + }, + { + "epoch": 8.832, + "grad_norm": 0.881698727607727, + "learning_rate": 3.290322580645161e-05, + "loss": 0.1732, + "step": 276 + }, + { + "epoch": 8.864, + "grad_norm": 0.8446074724197388, + "learning_rate": 3.193548387096774e-05, + "loss": 0.1833, + "step": 277 + }, + { + "epoch": 8.896, + "grad_norm": 0.9393475651741028, + "learning_rate": 3.0967741935483865e-05, + "loss": 0.2165, + "step": 278 + }, + { + "epoch": 8.928, + "grad_norm": 0.8838346004486084, + "learning_rate": 2.9999999999999997e-05, + "loss": 0.146, + "step": 279 + }, + { + "epoch": 8.96, + "grad_norm": 0.8380343914031982, + "learning_rate": 2.9032258064516126e-05, + "loss": 0.1721, + "step": 280 + }, + { + "epoch": 8.992, + "grad_norm": 0.8561931252479553, + "learning_rate": 2.8064516129032255e-05, + "loss": 0.1519, + "step": 281 + }, + { + "epoch": 9.024, + "grad_norm": 1.6088253259658813, + "learning_rate": 2.7096774193548384e-05, + "loss": 0.2658, + "step": 282 + }, + { + "epoch": 9.056, + "grad_norm": 0.8154093027114868, + "learning_rate": 2.6129032258064513e-05, + "loss": 0.1693, + "step": 283 + }, + { + "epoch": 9.088, + "grad_norm": 0.7722072005271912, + "learning_rate": 2.5161290322580645e-05, + "loss": 0.1853, + "step": 284 + }, + { + "epoch": 9.12, + "grad_norm": 0.8294870257377625, + "learning_rate": 2.419354838709677e-05, + "loss": 0.1736, + "step": 285 + }, + { + "epoch": 9.152, + "grad_norm": 0.7481442093849182, + "learning_rate": 2.3225806451612902e-05, + "loss": 0.1544, + "step": 286 + }, + { + "epoch": 9.184, + "grad_norm": 0.923413872718811, + "learning_rate": 2.2258064516129028e-05, + "loss": 0.2162, + "step": 287 + }, + { + 
"epoch": 9.216, + "grad_norm": 0.8326953053474426, + "learning_rate": 2.129032258064516e-05, + "loss": 0.1926, + "step": 288 + }, + { + "epoch": 9.248, + "grad_norm": 0.7642485499382019, + "learning_rate": 2.0322580645161292e-05, + "loss": 0.1555, + "step": 289 + }, + { + "epoch": 9.28, + "grad_norm": 0.7902241945266724, + "learning_rate": 1.9354838709677417e-05, + "loss": 0.1459, + "step": 290 + }, + { + "epoch": 9.312, + "grad_norm": 0.7414844036102295, + "learning_rate": 1.8387096774193546e-05, + "loss": 0.1425, + "step": 291 + }, + { + "epoch": 9.344, + "grad_norm": 0.7870174646377563, + "learning_rate": 1.7419354838709675e-05, + "loss": 0.1853, + "step": 292 + }, + { + "epoch": 9.376, + "grad_norm": 0.9091981649398804, + "learning_rate": 1.6451612903225804e-05, + "loss": 0.1666, + "step": 293 + }, + { + "epoch": 9.408, + "grad_norm": 0.8651584386825562, + "learning_rate": 1.5483870967741933e-05, + "loss": 0.174, + "step": 294 + }, + { + "epoch": 9.44, + "grad_norm": 0.7866891622543335, + "learning_rate": 1.4516129032258063e-05, + "loss": 0.1478, + "step": 295 + }, + { + "epoch": 9.472, + "grad_norm": 0.717932403087616, + "learning_rate": 1.3548387096774192e-05, + "loss": 0.1425, + "step": 296 + }, + { + "epoch": 9.504, + "grad_norm": 1.0217758417129517, + "learning_rate": 1.2580645161290322e-05, + "loss": 0.1574, + "step": 297 + }, + { + "epoch": 9.536, + "grad_norm": 0.8149961829185486, + "learning_rate": 1.1612903225806451e-05, + "loss": 0.1422, + "step": 298 + }, + { + "epoch": 9.568, + "grad_norm": 0.9206218719482422, + "learning_rate": 1.064516129032258e-05, + "loss": 0.1809, + "step": 299 + }, + { + "epoch": 9.6, + "grad_norm": 0.6865082383155823, + "learning_rate": 9.677419354838709e-06, + "loss": 0.133, + "step": 300 + } + ], + "logging_steps": 1, + "max_steps": 310, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.1102792466432e+16, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..01723982396407692a903d785c60e57fcabfa0c4 --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457c697b05fd5daa3c83df8920300c4940c26fb78ace5b5428b7c95d133a0ef4 +size 5560 diff --git a/checkpoint-310/README.md b/checkpoint-310/README.md new file mode 100644 index 0000000000000000000000000000000000000000..17729c873634bbfb1699f787a721c2f034265b15 --- /dev/null +++ b/checkpoint-310/README.md @@ -0,0 +1,202 @@ +--- +base_model: /home/ubuntu/Apps/DataInf/models/model +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** 
[More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.13.2 \ No newline at end of file diff --git a/checkpoint-310/adapter_config.json b/checkpoint-310/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e8b6984d46d6583eed5953a32eacf9f9ce36613d --- /dev/null +++ b/checkpoint-310/adapter_config.json @@ -0,0 +1,29 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "/home/ubuntu/Apps/DataInf/models/model", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.05, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-310/adapter_model.safetensors b/checkpoint-310/adapter_model.safetensors new file mode 100644 index 
0000000000000000000000000000000000000000..b8569b1b01e4815c728777d4e1f28ddf15949f2f --- /dev/null +++ b/checkpoint-310/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c618c18055bfce72d306d6b635cd0ecbd60120c3067688e0d526ab340b6b02 +size 26235704 diff --git a/checkpoint-310/optimizer.pt b/checkpoint-310/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..e8abf6dffefad9cbb453a0501d9271352de28aa3 --- /dev/null +++ b/checkpoint-310/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef2087b1fbe093489148f4eb3a1c5ef6c381a445db728bd5dffc0f7e1d1f8b9 +size 52563258 diff --git a/checkpoint-310/rng_state.pth b/checkpoint-310/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..11ab6b4bc5da4fd238e034723349bbb05cf48707 --- /dev/null +++ b/checkpoint-310/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ad5db2e1c1e26a024ec88b44e8e3dd0cd3608c099e37825d10b614778afd4e +size 14244 diff --git a/checkpoint-310/scheduler.pt b/checkpoint-310/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8326dc138070f9427ab11e8409c6ba6cf4bb4f6a --- /dev/null +++ b/checkpoint-310/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e89b1f0ce5285b1ae483386e0af782d4064a7bcec888d6919f207b0b5e5fa62 +size 1064 diff --git a/checkpoint-310/special_tokens_map.json b/checkpoint-310/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/checkpoint-310/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-310/tokenizer.model b/checkpoint-310/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/checkpoint-310/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/checkpoint-310/tokenizer_config.json b/checkpoint-310/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca9f0b39df7b30b561a2070b66bf0059e2aa9c8 --- /dev/null +++ b/checkpoint-310/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in 
loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/checkpoint-310/trainer_state.json b/checkpoint-310/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bd2872139d09ce0b9c0e39ea9437851594c23f99 --- /dev/null +++ b/checkpoint-310/trainer_state.json @@ -0,0 +1,2203 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 9.92, + "eval_steps": 500, + "global_step": 310, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.032, + "grad_norm": 0.3297976851463318, + "learning_rate": 0.0002990322580645161, + "loss": 1.0389, + "step": 1 + }, + { + "epoch": 0.064, + "grad_norm": 0.4069916307926178, + "learning_rate": 0.0002980645161290322, + "loss": 1.3377, + "step": 2 + }, + { + "epoch": 0.096, + "grad_norm": 0.42084500193595886, + "learning_rate": 0.00029709677419354836, + "loss": 0.9366, + "step": 3 + }, + { + "epoch": 0.128, + "grad_norm": 0.4641948938369751, + "learning_rate": 0.0002961290322580645, + "loss": 1.0086, + "step": 4 + }, + { + "epoch": 0.16, + "grad_norm": 0.3840750455856323, + "learning_rate": 0.00029516129032258065, + "loss": 0.8333, + "step": 5 + }, + { + "epoch": 0.192, + "grad_norm": 0.4263865053653717, + "learning_rate": 0.00029419354838709674, + "loss": 0.854, + "step": 6 + }, + { + "epoch": 0.224, + "grad_norm": 0.48615148663520813, + "learning_rate": 0.0002932258064516129, + "loss": 0.9548, + "step": 7 + }, + { + "epoch": 0.256, + "grad_norm": 0.44419369101524353, + "learning_rate": 0.00029225806451612903, + "loss": 0.8482, + "step": 8 + }, + { + "epoch": 0.288, + "grad_norm": 0.5317733883857727, + "learning_rate": 0.0002912903225806451, + "loss": 0.9426, + "step": 9 + }, + { + "epoch": 0.32, + "grad_norm": 0.47260937094688416, + "learning_rate": 0.00029032258064516127, + "loss": 0.9816, + "step": 10 + }, + { + "epoch": 0.352, + "grad_norm": 0.39063283801078796, + "learning_rate": 0.00028935483870967736, + "loss": 0.84, + "step": 11 + }, + { + "epoch": 0.384, + "grad_norm": 0.39234670996665955, + "learning_rate": 0.0002883870967741935, + "loss": 0.7476, + "step": 12 + }, + { + "epoch": 0.416, + "grad_norm": 0.40661805868148804, + "learning_rate": 0.00028741935483870965, + "loss": 0.9282, + "step": 13 + }, + { + "epoch": 0.448, + "grad_norm": 0.42970865964889526, + "learning_rate": 0.0002864516129032258, + "loss": 0.7858, + "step": 14 + }, + { + "epoch": 0.48, + "grad_norm": 0.3780193626880646, + "learning_rate": 0.00028548387096774194, + "loss": 0.7968, + "step": 15 + }, + { + "epoch": 0.512, + "grad_norm": 0.37006014585494995, + "learning_rate": 0.00028451612903225803, 
+ "loss": 0.6801, + "step": 16 + }, + { + "epoch": 0.544, + "grad_norm": 0.3660840392112732, + "learning_rate": 0.0002835483870967742, + "loss": 0.5914, + "step": 17 + }, + { + "epoch": 0.576, + "grad_norm": 0.3270975351333618, + "learning_rate": 0.00028258064516129027, + "loss": 0.6449, + "step": 18 + }, + { + "epoch": 0.608, + "grad_norm": 0.3859024941921234, + "learning_rate": 0.0002816129032258064, + "loss": 0.8144, + "step": 19 + }, + { + "epoch": 0.64, + "grad_norm": 0.37092071771621704, + "learning_rate": 0.00028064516129032256, + "loss": 0.7667, + "step": 20 + }, + { + "epoch": 0.672, + "grad_norm": 0.37667015194892883, + "learning_rate": 0.0002796774193548387, + "loss": 0.7751, + "step": 21 + }, + { + "epoch": 0.704, + "grad_norm": 0.3832458555698395, + "learning_rate": 0.0002787096774193548, + "loss": 0.755, + "step": 22 + }, + { + "epoch": 0.736, + "grad_norm": 0.327288419008255, + "learning_rate": 0.00027774193548387095, + "loss": 0.7178, + "step": 23 + }, + { + "epoch": 0.768, + "grad_norm": 0.34552687406539917, + "learning_rate": 0.0002767741935483871, + "loss": 0.7057, + "step": 24 + }, + { + "epoch": 0.8, + "grad_norm": 0.3611259460449219, + "learning_rate": 0.0002758064516129032, + "loss": 0.8159, + "step": 25 + }, + { + "epoch": 0.832, + "grad_norm": 0.3345054090023041, + "learning_rate": 0.00027483870967741933, + "loss": 0.7208, + "step": 26 + }, + { + "epoch": 0.864, + "grad_norm": 0.3697254955768585, + "learning_rate": 0.0002738709677419355, + "loss": 0.8964, + "step": 27 + }, + { + "epoch": 0.896, + "grad_norm": 0.3905017375946045, + "learning_rate": 0.00027290322580645157, + "loss": 0.7794, + "step": 28 + }, + { + "epoch": 0.928, + "grad_norm": 0.3715725243091583, + "learning_rate": 0.0002719354838709677, + "loss": 0.6966, + "step": 29 + }, + { + "epoch": 0.96, + "grad_norm": 0.3650343120098114, + "learning_rate": 0.00027096774193548386, + "loss": 0.5761, + "step": 30 + }, + { + "epoch": 0.992, + "grad_norm": 0.33932459354400635, + "learning_rate": 0.00027, + "loss": 0.556, + "step": 31 + }, + { + "epoch": 1.024, + "grad_norm": 0.6371742486953735, + "learning_rate": 0.0002690322580645161, + "loss": 0.847, + "step": 32 + }, + { + "epoch": 1.056, + "grad_norm": 0.37499895691871643, + "learning_rate": 0.00026806451612903224, + "loss": 0.8419, + "step": 33 + }, + { + "epoch": 1.088, + "grad_norm": 0.33221954107284546, + "learning_rate": 0.0002670967741935484, + "loss": 0.6011, + "step": 34 + }, + { + "epoch": 1.12, + "grad_norm": 0.344096839427948, + "learning_rate": 0.0002661290322580645, + "loss": 0.6501, + "step": 35 + }, + { + "epoch": 1.152, + "grad_norm": 0.38429391384124756, + "learning_rate": 0.0002651612903225806, + "loss": 0.8091, + "step": 36 + }, + { + "epoch": 1.184, + "grad_norm": 0.38014867901802063, + "learning_rate": 0.00026419354838709677, + "loss": 0.7668, + "step": 37 + }, + { + "epoch": 1.216, + "grad_norm": 0.3352573812007904, + "learning_rate": 0.00026322580645161286, + "loss": 0.5444, + "step": 38 + }, + { + "epoch": 1.248, + "grad_norm": 0.33811062574386597, + "learning_rate": 0.000262258064516129, + "loss": 0.512, + "step": 39 + }, + { + "epoch": 1.28, + "grad_norm": 0.3998416066169739, + "learning_rate": 0.00026129032258064515, + "loss": 0.6315, + "step": 40 + }, + { + "epoch": 1.312, + "grad_norm": 0.3983341157436371, + "learning_rate": 0.0002603225806451613, + "loss": 0.5882, + "step": 41 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.4585898816585541, + "learning_rate": 0.0002593548387096774, + "loss": 0.761, + "step": 42 + }, + { 
+ "epoch": 1.376, + "grad_norm": 0.4080730080604553, + "learning_rate": 0.00025838709677419354, + "loss": 0.6716, + "step": 43 + }, + { + "epoch": 1.408, + "grad_norm": 0.4068273901939392, + "learning_rate": 0.0002574193548387096, + "loss": 0.6376, + "step": 44 + }, + { + "epoch": 1.44, + "grad_norm": 0.4406949579715729, + "learning_rate": 0.00025645161290322577, + "loss": 0.4594, + "step": 45 + }, + { + "epoch": 1.472, + "grad_norm": 0.34500986337661743, + "learning_rate": 0.0002554838709677419, + "loss": 0.3672, + "step": 46 + }, + { + "epoch": 1.504, + "grad_norm": 0.4760681390762329, + "learning_rate": 0.00025451612903225806, + "loss": 0.6331, + "step": 47 + }, + { + "epoch": 1.536, + "grad_norm": 0.39281558990478516, + "learning_rate": 0.0002535483870967742, + "loss": 0.5845, + "step": 48 + }, + { + "epoch": 1.568, + "grad_norm": 0.4265002906322479, + "learning_rate": 0.0002525806451612903, + "loss": 0.4461, + "step": 49 + }, + { + "epoch": 1.6, + "grad_norm": 0.40967294573783875, + "learning_rate": 0.00025161290322580645, + "loss": 0.7011, + "step": 50 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.4288088381290436, + "learning_rate": 0.00025064516129032254, + "loss": 0.6928, + "step": 51 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.4356289803981781, + "learning_rate": 0.0002496774193548387, + "loss": 0.7972, + "step": 52 + }, + { + "epoch": 1.696, + "grad_norm": 0.3827487826347351, + "learning_rate": 0.0002487096774193548, + "loss": 0.2991, + "step": 53 + }, + { + "epoch": 1.728, + "grad_norm": 0.40093398094177246, + "learning_rate": 0.0002477419354838709, + "loss": 0.416, + "step": 54 + }, + { + "epoch": 1.76, + "grad_norm": 0.41548973321914673, + "learning_rate": 0.00024677419354838707, + "loss": 0.5501, + "step": 55 + }, + { + "epoch": 1.792, + "grad_norm": 0.4093388617038727, + "learning_rate": 0.0002458064516129032, + "loss": 0.5557, + "step": 56 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.3934040665626526, + "learning_rate": 0.00024483870967741936, + "loss": 0.602, + "step": 57 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.42221033573150635, + "learning_rate": 0.00024387096774193545, + "loss": 0.6421, + "step": 58 + }, + { + "epoch": 1.888, + "grad_norm": 0.4351339340209961, + "learning_rate": 0.0002429032258064516, + "loss": 0.5615, + "step": 59 + }, + { + "epoch": 1.92, + "grad_norm": 0.4319838881492615, + "learning_rate": 0.00024193548387096771, + "loss": 0.6804, + "step": 60 + }, + { + "epoch": 1.952, + "grad_norm": 0.40016525983810425, + "learning_rate": 0.00024096774193548386, + "loss": 0.5432, + "step": 61 + }, + { + "epoch": 1.984, + "grad_norm": 0.3905942440032959, + "learning_rate": 0.00023999999999999998, + "loss": 0.4187, + "step": 62 + }, + { + "epoch": 2.016, + "grad_norm": 0.8056382536888123, + "learning_rate": 0.0002390322580645161, + "loss": 1.0174, + "step": 63 + }, + { + "epoch": 2.048, + "grad_norm": 0.3835236430168152, + "learning_rate": 0.00023806451612903224, + "loss": 0.5992, + "step": 64 + }, + { + "epoch": 2.08, + "grad_norm": 0.41092216968536377, + "learning_rate": 0.00023709677419354836, + "loss": 0.4746, + "step": 65 + }, + { + "epoch": 2.112, + "grad_norm": 0.39536622166633606, + "learning_rate": 0.0002361290322580645, + "loss": 0.3946, + "step": 66 + }, + { + "epoch": 2.144, + "grad_norm": 0.3927665948867798, + "learning_rate": 0.0002351612903225806, + "loss": 0.5187, + "step": 67 + }, + { + "epoch": 2.176, + "grad_norm": 0.39792704582214355, + "learning_rate": 0.00023419354838709674, + "loss": 
0.4568, + "step": 68 + }, + { + "epoch": 2.208, + "grad_norm": 0.5023652911186218, + "learning_rate": 0.0002332258064516129, + "loss": 0.6166, + "step": 69 + }, + { + "epoch": 2.24, + "grad_norm": 0.425017774105072, + "learning_rate": 0.000232258064516129, + "loss": 0.42, + "step": 70 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 0.46458110213279724, + "learning_rate": 0.00023129032258064516, + "loss": 0.4613, + "step": 71 + }, + { + "epoch": 2.304, + "grad_norm": 0.49037960171699524, + "learning_rate": 0.00023032258064516125, + "loss": 0.5509, + "step": 72 + }, + { + "epoch": 2.336, + "grad_norm": 0.5233697891235352, + "learning_rate": 0.0002293548387096774, + "loss": 0.6396, + "step": 73 + }, + { + "epoch": 2.368, + "grad_norm": 0.4720582962036133, + "learning_rate": 0.0002283870967741935, + "loss": 0.5076, + "step": 74 + }, + { + "epoch": 2.4, + "grad_norm": 0.4900650382041931, + "learning_rate": 0.00022741935483870966, + "loss": 0.4794, + "step": 75 + }, + { + "epoch": 2.432, + "grad_norm": 0.6321704983711243, + "learning_rate": 0.0002264516129032258, + "loss": 0.6677, + "step": 76 + }, + { + "epoch": 2.464, + "grad_norm": 0.5305324792861938, + "learning_rate": 0.00022548387096774192, + "loss": 0.5102, + "step": 77 + }, + { + "epoch": 2.496, + "grad_norm": 0.5799248218536377, + "learning_rate": 0.00022451612903225804, + "loss": 0.5274, + "step": 78 + }, + { + "epoch": 2.528, + "grad_norm": 0.4990101456642151, + "learning_rate": 0.00022354838709677416, + "loss": 0.5407, + "step": 79 + }, + { + "epoch": 2.56, + "grad_norm": 0.4779827296733856, + "learning_rate": 0.0002225806451612903, + "loss": 0.5166, + "step": 80 + }, + { + "epoch": 2.592, + "grad_norm": 0.5140111446380615, + "learning_rate": 0.00022161290322580645, + "loss": 0.3288, + "step": 81 + }, + { + "epoch": 2.624, + "grad_norm": 0.5674853920936584, + "learning_rate": 0.00022064516129032257, + "loss": 0.666, + "step": 82 + }, + { + "epoch": 2.656, + "grad_norm": 0.5277597308158875, + "learning_rate": 0.00021967741935483871, + "loss": 0.5335, + "step": 83 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 0.6029439568519592, + "learning_rate": 0.0002187096774193548, + "loss": 0.693, + "step": 84 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.5039327144622803, + "learning_rate": 0.00021774193548387095, + "loss": 0.5728, + "step": 85 + }, + { + "epoch": 2.752, + "grad_norm": 0.5564692616462708, + "learning_rate": 0.00021677419354838707, + "loss": 0.4734, + "step": 86 + }, + { + "epoch": 2.784, + "grad_norm": 0.5278319120407104, + "learning_rate": 0.00021580645161290322, + "loss": 0.5834, + "step": 87 + }, + { + "epoch": 2.816, + "grad_norm": 0.5445135831832886, + "learning_rate": 0.00021483870967741936, + "loss": 0.4642, + "step": 88 + }, + { + "epoch": 2.848, + "grad_norm": 0.5394749045372009, + "learning_rate": 0.00021387096774193545, + "loss": 0.4779, + "step": 89 + }, + { + "epoch": 2.88, + "grad_norm": 0.5756134390830994, + "learning_rate": 0.0002129032258064516, + "loss": 0.5607, + "step": 90 + }, + { + "epoch": 2.912, + "grad_norm": 0.48361241817474365, + "learning_rate": 0.00021193548387096772, + "loss": 0.4278, + "step": 91 + }, + { + "epoch": 2.944, + "grad_norm": 0.5017121434211731, + "learning_rate": 0.00021096774193548386, + "loss": 0.4834, + "step": 92 + }, + { + "epoch": 2.976, + "grad_norm": 0.4741989076137543, + "learning_rate": 0.00020999999999999998, + "loss": 0.468, + "step": 93 + }, + { + "epoch": 3.008, + "grad_norm": 1.003368854522705, + "learning_rate": 0.0002090322580645161, + 
"loss": 0.8614, + "step": 94 + }, + { + "epoch": 3.04, + "grad_norm": 0.4782228469848633, + "learning_rate": 0.00020806451612903225, + "loss": 0.4111, + "step": 95 + }, + { + "epoch": 3.072, + "grad_norm": 0.4558674395084381, + "learning_rate": 0.00020709677419354836, + "loss": 0.3463, + "step": 96 + }, + { + "epoch": 3.104, + "grad_norm": 0.4409371316432953, + "learning_rate": 0.0002061290322580645, + "loss": 0.2571, + "step": 97 + }, + { + "epoch": 3.136, + "grad_norm": 0.5415034890174866, + "learning_rate": 0.00020516129032258063, + "loss": 0.5707, + "step": 98 + }, + { + "epoch": 3.168, + "grad_norm": 0.6157724857330322, + "learning_rate": 0.00020419354838709677, + "loss": 0.5692, + "step": 99 + }, + { + "epoch": 3.2, + "grad_norm": 0.4855688810348511, + "learning_rate": 0.00020322580645161287, + "loss": 0.3311, + "step": 100 + }, + { + "epoch": 3.232, + "grad_norm": 0.569878101348877, + "learning_rate": 0.000202258064516129, + "loss": 0.4707, + "step": 101 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 0.645232081413269, + "learning_rate": 0.00020129032258064516, + "loss": 0.5504, + "step": 102 + }, + { + "epoch": 3.296, + "grad_norm": 0.5775763392448425, + "learning_rate": 0.00020032258064516128, + "loss": 0.3651, + "step": 103 + }, + { + "epoch": 3.328, + "grad_norm": 0.5808250904083252, + "learning_rate": 0.00019935483870967742, + "loss": 0.5068, + "step": 104 + }, + { + "epoch": 3.36, + "grad_norm": 0.689313530921936, + "learning_rate": 0.0001983870967741935, + "loss": 0.4936, + "step": 105 + }, + { + "epoch": 3.392, + "grad_norm": 0.6571519374847412, + "learning_rate": 0.00019741935483870966, + "loss": 0.3671, + "step": 106 + }, + { + "epoch": 3.424, + "grad_norm": 0.6340517401695251, + "learning_rate": 0.00019645161290322578, + "loss": 0.4783, + "step": 107 + }, + { + "epoch": 3.456, + "grad_norm": 0.7031407952308655, + "learning_rate": 0.00019548387096774192, + "loss": 0.427, + "step": 108 + }, + { + "epoch": 3.488, + "grad_norm": 0.728496789932251, + "learning_rate": 0.00019451612903225807, + "loss": 0.5497, + "step": 109 + }, + { + "epoch": 3.52, + "grad_norm": 0.6106727719306946, + "learning_rate": 0.00019354838709677416, + "loss": 0.392, + "step": 110 + }, + { + "epoch": 3.552, + "grad_norm": 0.5296047329902649, + "learning_rate": 0.0001925806451612903, + "loss": 0.3412, + "step": 111 + }, + { + "epoch": 3.584, + "grad_norm": 0.6282025575637817, + "learning_rate": 0.00019161290322580643, + "loss": 0.4081, + "step": 112 + }, + { + "epoch": 3.616, + "grad_norm": 0.6166461110115051, + "learning_rate": 0.00019064516129032257, + "loss": 0.4771, + "step": 113 + }, + { + "epoch": 3.648, + "grad_norm": 0.5448863506317139, + "learning_rate": 0.0001896774193548387, + "loss": 0.404, + "step": 114 + }, + { + "epoch": 3.68, + "grad_norm": 0.6598389148712158, + "learning_rate": 0.0001887096774193548, + "loss": 0.3915, + "step": 115 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 0.5567564368247986, + "learning_rate": 0.00018774193548387095, + "loss": 0.3862, + "step": 116 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 0.6524521708488464, + "learning_rate": 0.00018677419354838707, + "loss": 0.5315, + "step": 117 + }, + { + "epoch": 3.776, + "grad_norm": 0.7040128707885742, + "learning_rate": 0.00018580645161290322, + "loss": 0.5387, + "step": 118 + }, + { + "epoch": 3.808, + "grad_norm": 0.690262496471405, + "learning_rate": 0.00018483870967741934, + "loss": 0.4877, + "step": 119 + }, + { + "epoch": 3.84, + "grad_norm": 0.6928034424781799, + "learning_rate": 
0.00018387096774193548, + "loss": 0.4895, + "step": 120 + }, + { + "epoch": 3.872, + "grad_norm": 0.7148469686508179, + "learning_rate": 0.00018290322580645157, + "loss": 0.4814, + "step": 121 + }, + { + "epoch": 3.904, + "grad_norm": 0.6096572875976562, + "learning_rate": 0.00018193548387096772, + "loss": 0.3403, + "step": 122 + }, + { + "epoch": 3.936, + "grad_norm": 0.7132399678230286, + "learning_rate": 0.00018096774193548387, + "loss": 0.4258, + "step": 123 + }, + { + "epoch": 3.968, + "grad_norm": 0.7302684187889099, + "learning_rate": 0.00017999999999999998, + "loss": 0.7215, + "step": 124 + }, + { + "epoch": 4.0, + "grad_norm": 1.5244004726409912, + "learning_rate": 0.00017903225806451613, + "loss": 0.8544, + "step": 125 + }, + { + "epoch": 4.032, + "grad_norm": 0.6032777428627014, + "learning_rate": 0.00017806451612903222, + "loss": 0.4183, + "step": 126 + }, + { + "epoch": 4.064, + "grad_norm": 0.6349691152572632, + "learning_rate": 0.00017709677419354837, + "loss": 0.5871, + "step": 127 + }, + { + "epoch": 4.096, + "grad_norm": 0.5730060935020447, + "learning_rate": 0.00017612903225806449, + "loss": 0.3786, + "step": 128 + }, + { + "epoch": 4.128, + "grad_norm": 0.6988044381141663, + "learning_rate": 0.00017516129032258063, + "loss": 0.3216, + "step": 129 + }, + { + "epoch": 4.16, + "grad_norm": 0.7379153370857239, + "learning_rate": 0.00017419354838709678, + "loss": 0.4026, + "step": 130 + }, + { + "epoch": 4.192, + "grad_norm": 0.7058238983154297, + "learning_rate": 0.00017322580645161287, + "loss": 0.4328, + "step": 131 + }, + { + "epoch": 4.224, + "grad_norm": 0.80663001537323, + "learning_rate": 0.00017225806451612901, + "loss": 0.3849, + "step": 132 + }, + { + "epoch": 4.256, + "grad_norm": 0.899818480014801, + "learning_rate": 0.00017129032258064513, + "loss": 0.4191, + "step": 133 + }, + { + "epoch": 4.288, + "grad_norm": 0.8538224697113037, + "learning_rate": 0.00017032258064516128, + "loss": 0.3587, + "step": 134 + }, + { + "epoch": 4.32, + "grad_norm": 0.8948169350624084, + "learning_rate": 0.00016935483870967742, + "loss": 0.3957, + "step": 135 + }, + { + "epoch": 4.352, + "grad_norm": 0.7195591926574707, + "learning_rate": 0.00016838709677419354, + "loss": 0.3361, + "step": 136 + }, + { + "epoch": 4.384, + "grad_norm": 0.7769681215286255, + "learning_rate": 0.00016741935483870966, + "loss": 0.3519, + "step": 137 + }, + { + "epoch": 4.416, + "grad_norm": 0.9509867429733276, + "learning_rate": 0.00016645161290322578, + "loss": 0.4216, + "step": 138 + }, + { + "epoch": 4.448, + "grad_norm": 0.7923309206962585, + "learning_rate": 0.00016548387096774193, + "loss": 0.3999, + "step": 139 + }, + { + "epoch": 4.48, + "grad_norm": 0.8961685299873352, + "learning_rate": 0.00016451612903225804, + "loss": 0.5385, + "step": 140 + }, + { + "epoch": 4.5120000000000005, + "grad_norm": 0.7496562004089355, + "learning_rate": 0.0001635483870967742, + "loss": 0.341, + "step": 141 + }, + { + "epoch": 4.5440000000000005, + "grad_norm": 0.8512839674949646, + "learning_rate": 0.00016258064516129034, + "loss": 0.3847, + "step": 142 + }, + { + "epoch": 4.576, + "grad_norm": 0.7487362027168274, + "learning_rate": 0.00016161290322580643, + "loss": 0.3694, + "step": 143 + }, + { + "epoch": 4.608, + "grad_norm": 0.7957774996757507, + "learning_rate": 0.00016064516129032257, + "loss": 0.3379, + "step": 144 + }, + { + "epoch": 4.64, + "grad_norm": 0.7299221754074097, + "learning_rate": 0.0001596774193548387, + "loss": 0.2989, + "step": 145 + }, + { + "epoch": 4.672, + "grad_norm": 
0.7909884452819824, + "learning_rate": 0.00015870967741935484, + "loss": 0.3675, + "step": 146 + }, + { + "epoch": 4.704, + "grad_norm": 0.7321597933769226, + "learning_rate": 0.00015774193548387093, + "loss": 0.3243, + "step": 147 + }, + { + "epoch": 4.736, + "grad_norm": 0.7196181416511536, + "learning_rate": 0.00015677419354838708, + "loss": 0.2709, + "step": 148 + }, + { + "epoch": 4.768, + "grad_norm": 0.7918142676353455, + "learning_rate": 0.00015580645161290322, + "loss": 0.3934, + "step": 149 + }, + { + "epoch": 4.8, + "grad_norm": 0.8657622337341309, + "learning_rate": 0.00015483870967741934, + "loss": 0.3583, + "step": 150 + }, + { + "epoch": 4.832, + "grad_norm": 0.8207722306251526, + "learning_rate": 0.00015387096774193549, + "loss": 0.412, + "step": 151 + }, + { + "epoch": 4.864, + "grad_norm": 0.7206109166145325, + "learning_rate": 0.00015290322580645158, + "loss": 0.3594, + "step": 152 + }, + { + "epoch": 4.896, + "grad_norm": 0.8529183864593506, + "learning_rate": 0.00015193548387096772, + "loss": 0.512, + "step": 153 + }, + { + "epoch": 4.928, + "grad_norm": 0.6895930171012878, + "learning_rate": 0.00015096774193548384, + "loss": 0.333, + "step": 154 + }, + { + "epoch": 4.96, + "grad_norm": 0.7422910332679749, + "learning_rate": 0.00015, + "loss": 0.2872, + "step": 155 + }, + { + "epoch": 4.992, + "grad_norm": 0.7366386651992798, + "learning_rate": 0.0001490322580645161, + "loss": 0.3415, + "step": 156 + }, + { + "epoch": 5.024, + "grad_norm": 2.1416280269622803, + "learning_rate": 0.00014806451612903225, + "loss": 0.9961, + "step": 157 + }, + { + "epoch": 5.056, + "grad_norm": 0.7944900393486023, + "learning_rate": 0.00014709677419354837, + "loss": 0.3372, + "step": 158 + }, + { + "epoch": 5.088, + "grad_norm": 0.7071006298065186, + "learning_rate": 0.00014612903225806452, + "loss": 0.2732, + "step": 159 + }, + { + "epoch": 5.12, + "grad_norm": 0.7874396443367004, + "learning_rate": 0.00014516129032258063, + "loss": 0.2861, + "step": 160 + }, + { + "epoch": 5.152, + "grad_norm": 0.8244249224662781, + "learning_rate": 0.00014419354838709675, + "loss": 0.3428, + "step": 161 + }, + { + "epoch": 5.184, + "grad_norm": 0.81637042760849, + "learning_rate": 0.0001432258064516129, + "loss": 0.3037, + "step": 162 + }, + { + "epoch": 5.216, + "grad_norm": 0.9916559457778931, + "learning_rate": 0.00014225806451612902, + "loss": 0.3337, + "step": 163 + }, + { + "epoch": 5.248, + "grad_norm": 0.9077599048614502, + "learning_rate": 0.00014129032258064514, + "loss": 0.287, + "step": 164 + }, + { + "epoch": 5.28, + "grad_norm": 0.9824132919311523, + "learning_rate": 0.00014032258064516128, + "loss": 0.3852, + "step": 165 + }, + { + "epoch": 5.312, + "grad_norm": 1.0016467571258545, + "learning_rate": 0.0001393548387096774, + "loss": 0.3234, + "step": 166 + }, + { + "epoch": 5.344, + "grad_norm": 0.8697543144226074, + "learning_rate": 0.00013838709677419355, + "loss": 0.2848, + "step": 167 + }, + { + "epoch": 5.376, + "grad_norm": 0.8214029669761658, + "learning_rate": 0.00013741935483870966, + "loss": 0.3377, + "step": 168 + }, + { + "epoch": 5.408, + "grad_norm": 0.9105691313743591, + "learning_rate": 0.00013645161290322578, + "loss": 0.2944, + "step": 169 + }, + { + "epoch": 5.44, + "grad_norm": 0.9642040133476257, + "learning_rate": 0.00013548387096774193, + "loss": 0.3624, + "step": 170 + }, + { + "epoch": 5.4719999999999995, + "grad_norm": 0.9218887686729431, + "learning_rate": 0.00013451612903225805, + "loss": 0.3938, + "step": 171 + }, + { + "epoch": 5.504, + "grad_norm": 
0.8704710006713867, + "learning_rate": 0.0001335483870967742, + "loss": 0.3629, + "step": 172 + }, + { + "epoch": 5.536, + "grad_norm": 0.8207693099975586, + "learning_rate": 0.0001325806451612903, + "loss": 0.3169, + "step": 173 + }, + { + "epoch": 5.568, + "grad_norm": 0.9315701127052307, + "learning_rate": 0.00013161290322580643, + "loss": 0.429, + "step": 174 + }, + { + "epoch": 5.6, + "grad_norm": 0.860234260559082, + "learning_rate": 0.00013064516129032258, + "loss": 0.3842, + "step": 175 + }, + { + "epoch": 5.632, + "grad_norm": 0.8927604556083679, + "learning_rate": 0.0001296774193548387, + "loss": 0.3405, + "step": 176 + }, + { + "epoch": 5.664, + "grad_norm": 0.8084587454795837, + "learning_rate": 0.0001287096774193548, + "loss": 0.306, + "step": 177 + }, + { + "epoch": 5.696, + "grad_norm": 0.9102941155433655, + "learning_rate": 0.00012774193548387096, + "loss": 0.3285, + "step": 178 + }, + { + "epoch": 5.728, + "grad_norm": 0.763113796710968, + "learning_rate": 0.0001267741935483871, + "loss": 0.2729, + "step": 179 + }, + { + "epoch": 5.76, + "grad_norm": 0.8704251646995544, + "learning_rate": 0.00012580645161290322, + "loss": 0.3164, + "step": 180 + }, + { + "epoch": 5.792, + "grad_norm": 0.9634932279586792, + "learning_rate": 0.00012483870967741934, + "loss": 0.2939, + "step": 181 + }, + { + "epoch": 5.824, + "grad_norm": 1.1567790508270264, + "learning_rate": 0.00012387096774193546, + "loss": 0.3076, + "step": 182 + }, + { + "epoch": 5.856, + "grad_norm": 0.9096764922142029, + "learning_rate": 0.0001229032258064516, + "loss": 0.3289, + "step": 183 + }, + { + "epoch": 5.888, + "grad_norm": 0.9840425848960876, + "learning_rate": 0.00012193548387096773, + "loss": 0.2772, + "step": 184 + }, + { + "epoch": 5.92, + "grad_norm": 0.725844144821167, + "learning_rate": 0.00012096774193548386, + "loss": 0.2151, + "step": 185 + }, + { + "epoch": 5.952, + "grad_norm": 0.8343638181686401, + "learning_rate": 0.00011999999999999999, + "loss": 0.3825, + "step": 186 + }, + { + "epoch": 5.984, + "grad_norm": 0.8040199279785156, + "learning_rate": 0.00011903225806451612, + "loss": 0.2571, + "step": 187 + }, + { + "epoch": 6.016, + "grad_norm": 1.6932090520858765, + "learning_rate": 0.00011806451612903225, + "loss": 0.5538, + "step": 188 + }, + { + "epoch": 6.048, + "grad_norm": 0.744048535823822, + "learning_rate": 0.00011709677419354837, + "loss": 0.2335, + "step": 189 + }, + { + "epoch": 6.08, + "grad_norm": 0.6974924206733704, + "learning_rate": 0.0001161290322580645, + "loss": 0.2891, + "step": 190 + }, + { + "epoch": 6.112, + "grad_norm": 0.7202953696250916, + "learning_rate": 0.00011516129032258062, + "loss": 0.2017, + "step": 191 + }, + { + "epoch": 6.144, + "grad_norm": 0.8437547087669373, + "learning_rate": 0.00011419354838709676, + "loss": 0.2175, + "step": 192 + }, + { + "epoch": 6.176, + "grad_norm": 1.0741796493530273, + "learning_rate": 0.0001132258064516129, + "loss": 0.3913, + "step": 193 + }, + { + "epoch": 6.208, + "grad_norm": 1.031754493713379, + "learning_rate": 0.00011225806451612902, + "loss": 0.298, + "step": 194 + }, + { + "epoch": 6.24, + "grad_norm": 0.9575178027153015, + "learning_rate": 0.00011129032258064515, + "loss": 0.3201, + "step": 195 + }, + { + "epoch": 6.272, + "grad_norm": 0.9503082633018494, + "learning_rate": 0.00011032258064516128, + "loss": 0.2005, + "step": 196 + }, + { + "epoch": 6.304, + "grad_norm": 1.2572892904281616, + "learning_rate": 0.0001093548387096774, + "loss": 0.3045, + "step": 197 + }, + { + "epoch": 6.336, + "grad_norm": 
1.5667368173599243, + "learning_rate": 0.00010838709677419353, + "loss": 0.4053, + "step": 198 + }, + { + "epoch": 6.368, + "grad_norm": 0.9439151883125305, + "learning_rate": 0.00010741935483870968, + "loss": 0.2721, + "step": 199 + }, + { + "epoch": 6.4, + "grad_norm": 1.0985567569732666, + "learning_rate": 0.0001064516129032258, + "loss": 0.2543, + "step": 200 + }, + { + "epoch": 6.432, + "grad_norm": 0.789880633354187, + "learning_rate": 0.00010548387096774193, + "loss": 0.2148, + "step": 201 + }, + { + "epoch": 6.464, + "grad_norm": 0.9937541484832764, + "learning_rate": 0.00010451612903225805, + "loss": 0.2343, + "step": 202 + }, + { + "epoch": 6.496, + "grad_norm": 0.9496509432792664, + "learning_rate": 0.00010354838709677418, + "loss": 0.2576, + "step": 203 + }, + { + "epoch": 6.5280000000000005, + "grad_norm": 0.9214590191841125, + "learning_rate": 0.00010258064516129031, + "loss": 0.3067, + "step": 204 + }, + { + "epoch": 6.5600000000000005, + "grad_norm": 0.8984239101409912, + "learning_rate": 0.00010161290322580643, + "loss": 0.2471, + "step": 205 + }, + { + "epoch": 6.592, + "grad_norm": 0.8055192232131958, + "learning_rate": 0.00010064516129032258, + "loss": 0.2234, + "step": 206 + }, + { + "epoch": 6.624, + "grad_norm": 0.769008219242096, + "learning_rate": 9.967741935483871e-05, + "loss": 0.1963, + "step": 207 + }, + { + "epoch": 6.656, + "grad_norm": 0.7947174310684204, + "learning_rate": 9.870967741935483e-05, + "loss": 0.2165, + "step": 208 + }, + { + "epoch": 6.688, + "grad_norm": 1.0192420482635498, + "learning_rate": 9.774193548387096e-05, + "loss": 0.2581, + "step": 209 + }, + { + "epoch": 6.72, + "grad_norm": 1.0067439079284668, + "learning_rate": 9.677419354838708e-05, + "loss": 0.2394, + "step": 210 + }, + { + "epoch": 6.752, + "grad_norm": 1.0539058446884155, + "learning_rate": 9.580645161290321e-05, + "loss": 0.2526, + "step": 211 + }, + { + "epoch": 6.784, + "grad_norm": 1.130011796951294, + "learning_rate": 9.483870967741934e-05, + "loss": 0.3339, + "step": 212 + }, + { + "epoch": 6.816, + "grad_norm": 0.9603860378265381, + "learning_rate": 9.387096774193548e-05, + "loss": 0.2808, + "step": 213 + }, + { + "epoch": 6.848, + "grad_norm": 1.0667173862457275, + "learning_rate": 9.290322580645161e-05, + "loss": 0.3025, + "step": 214 + }, + { + "epoch": 6.88, + "grad_norm": 0.9093402624130249, + "learning_rate": 9.193548387096774e-05, + "loss": 0.2698, + "step": 215 + }, + { + "epoch": 6.912, + "grad_norm": 0.8621392846107483, + "learning_rate": 9.096774193548386e-05, + "loss": 0.2259, + "step": 216 + }, + { + "epoch": 6.944, + "grad_norm": 1.035175085067749, + "learning_rate": 8.999999999999999e-05, + "loss": 0.3156, + "step": 217 + }, + { + "epoch": 6.976, + "grad_norm": 1.0241689682006836, + "learning_rate": 8.903225806451611e-05, + "loss": 0.2723, + "step": 218 + }, + { + "epoch": 7.008, + "grad_norm": 1.735946536064148, + "learning_rate": 8.806451612903224e-05, + "loss": 0.411, + "step": 219 + }, + { + "epoch": 7.04, + "grad_norm": 0.8678178191184998, + "learning_rate": 8.709677419354839e-05, + "loss": 0.2415, + "step": 220 + }, + { + "epoch": 7.072, + "grad_norm": 0.7134645581245422, + "learning_rate": 8.612903225806451e-05, + "loss": 0.1509, + "step": 221 + }, + { + "epoch": 7.104, + "grad_norm": 0.8543497920036316, + "learning_rate": 8.516129032258064e-05, + "loss": 0.2459, + "step": 222 + }, + { + "epoch": 7.136, + "grad_norm": 0.9644029140472412, + "learning_rate": 8.419354838709677e-05, + "loss": 0.2828, + "step": 223 + }, + { + "epoch": 7.168, + 
"grad_norm": 0.8568740487098694, + "learning_rate": 8.322580645161289e-05, + "loss": 0.1936, + "step": 224 + }, + { + "epoch": 7.2, + "grad_norm": 1.005867600440979, + "learning_rate": 8.225806451612902e-05, + "loss": 0.2678, + "step": 225 + }, + { + "epoch": 7.232, + "grad_norm": 0.9942033290863037, + "learning_rate": 8.129032258064517e-05, + "loss": 0.2111, + "step": 226 + }, + { + "epoch": 7.264, + "grad_norm": 0.9886007905006409, + "learning_rate": 8.032258064516129e-05, + "loss": 0.2375, + "step": 227 + }, + { + "epoch": 7.296, + "grad_norm": 1.0586844682693481, + "learning_rate": 7.935483870967742e-05, + "loss": 0.2385, + "step": 228 + }, + { + "epoch": 7.328, + "grad_norm": 1.026432991027832, + "learning_rate": 7.838709677419354e-05, + "loss": 0.2139, + "step": 229 + }, + { + "epoch": 7.36, + "grad_norm": 1.0039665699005127, + "learning_rate": 7.741935483870967e-05, + "loss": 0.2211, + "step": 230 + }, + { + "epoch": 7.392, + "grad_norm": 1.1125057935714722, + "learning_rate": 7.645161290322579e-05, + "loss": 0.2725, + "step": 231 + }, + { + "epoch": 7.424, + "grad_norm": 0.9078079462051392, + "learning_rate": 7.548387096774192e-05, + "loss": 0.1965, + "step": 232 + }, + { + "epoch": 7.456, + "grad_norm": 0.8247030377388, + "learning_rate": 7.451612903225805e-05, + "loss": 0.1502, + "step": 233 + }, + { + "epoch": 7.4879999999999995, + "grad_norm": 1.1396474838256836, + "learning_rate": 7.354838709677418e-05, + "loss": 0.37, + "step": 234 + }, + { + "epoch": 7.52, + "grad_norm": 0.753663182258606, + "learning_rate": 7.258064516129032e-05, + "loss": 0.1627, + "step": 235 + }, + { + "epoch": 7.552, + "grad_norm": 0.7927701473236084, + "learning_rate": 7.161290322580645e-05, + "loss": 0.1684, + "step": 236 + }, + { + "epoch": 7.584, + "grad_norm": 0.9258756637573242, + "learning_rate": 7.064516129032257e-05, + "loss": 0.213, + "step": 237 + }, + { + "epoch": 7.616, + "grad_norm": 0.8111560940742493, + "learning_rate": 6.96774193548387e-05, + "loss": 0.1998, + "step": 238 + }, + { + "epoch": 7.648, + "grad_norm": 0.8484370708465576, + "learning_rate": 6.870967741935483e-05, + "loss": 0.1307, + "step": 239 + }, + { + "epoch": 7.68, + "grad_norm": 0.9123087525367737, + "learning_rate": 6.774193548387096e-05, + "loss": 0.2529, + "step": 240 + }, + { + "epoch": 7.712, + "grad_norm": 1.0526336431503296, + "learning_rate": 6.67741935483871e-05, + "loss": 0.2468, + "step": 241 + }, + { + "epoch": 7.744, + "grad_norm": 1.0104210376739502, + "learning_rate": 6.580645161290322e-05, + "loss": 0.23, + "step": 242 + }, + { + "epoch": 7.776, + "grad_norm": 0.8749745488166809, + "learning_rate": 6.483870967741935e-05, + "loss": 0.1973, + "step": 243 + }, + { + "epoch": 7.808, + "grad_norm": 0.9921355247497559, + "learning_rate": 6.387096774193548e-05, + "loss": 0.2144, + "step": 244 + }, + { + "epoch": 7.84, + "grad_norm": 0.8243810534477234, + "learning_rate": 6.290322580645161e-05, + "loss": 0.1531, + "step": 245 + }, + { + "epoch": 7.872, + "grad_norm": 1.0764353275299072, + "learning_rate": 6.193548387096773e-05, + "loss": 0.2763, + "step": 246 + }, + { + "epoch": 7.904, + "grad_norm": 1.1754212379455566, + "learning_rate": 6.096774193548386e-05, + "loss": 0.2249, + "step": 247 + }, + { + "epoch": 7.936, + "grad_norm": 0.8588422536849976, + "learning_rate": 5.9999999999999995e-05, + "loss": 0.1782, + "step": 248 + }, + { + "epoch": 7.968, + "grad_norm": 1.045143961906433, + "learning_rate": 5.903225806451613e-05, + "loss": 0.2789, + "step": 249 + }, + { + "epoch": 8.0, + "grad_norm": 
1.9824038743972778, + "learning_rate": 5.806451612903225e-05, + "loss": 0.3057, + "step": 250 + }, + { + "epoch": 8.032, + "grad_norm": 0.9252362847328186, + "learning_rate": 5.709677419354838e-05, + "loss": 0.2221, + "step": 251 + }, + { + "epoch": 8.064, + "grad_norm": 0.8381021022796631, + "learning_rate": 5.612903225806451e-05, + "loss": 0.2639, + "step": 252 + }, + { + "epoch": 8.096, + "grad_norm": 0.9777012467384338, + "learning_rate": 5.516129032258064e-05, + "loss": 0.1533, + "step": 253 + }, + { + "epoch": 8.128, + "grad_norm": 0.8053516745567322, + "learning_rate": 5.419354838709677e-05, + "loss": 0.1883, + "step": 254 + }, + { + "epoch": 8.16, + "grad_norm": 0.8703336119651794, + "learning_rate": 5.32258064516129e-05, + "loss": 0.2079, + "step": 255 + }, + { + "epoch": 8.192, + "grad_norm": 0.8113718032836914, + "learning_rate": 5.2258064516129025e-05, + "loss": 0.1609, + "step": 256 + }, + { + "epoch": 8.224, + "grad_norm": 1.0667418241500854, + "learning_rate": 5.129032258064516e-05, + "loss": 0.2544, + "step": 257 + }, + { + "epoch": 8.256, + "grad_norm": 0.7853135466575623, + "learning_rate": 5.032258064516129e-05, + "loss": 0.1391, + "step": 258 + }, + { + "epoch": 8.288, + "grad_norm": 0.9970865845680237, + "learning_rate": 4.9354838709677415e-05, + "loss": 0.2305, + "step": 259 + }, + { + "epoch": 8.32, + "grad_norm": 12.063047409057617, + "learning_rate": 4.838709677419354e-05, + "loss": 0.189, + "step": 260 + }, + { + "epoch": 8.352, + "grad_norm": 1.2325772047042847, + "learning_rate": 4.741935483870967e-05, + "loss": 0.2308, + "step": 261 + }, + { + "epoch": 8.384, + "grad_norm": 1.1118851900100708, + "learning_rate": 4.6451612903225805e-05, + "loss": 0.2009, + "step": 262 + }, + { + "epoch": 8.416, + "grad_norm": 1.0783390998840332, + "learning_rate": 4.548387096774193e-05, + "loss": 0.2276, + "step": 263 + }, + { + "epoch": 8.448, + "grad_norm": 1.2127933502197266, + "learning_rate": 4.4516129032258055e-05, + "loss": 0.2046, + "step": 264 + }, + { + "epoch": 8.48, + "grad_norm": 1.1135843992233276, + "learning_rate": 4.3548387096774194e-05, + "loss": 0.1791, + "step": 265 + }, + { + "epoch": 8.512, + "grad_norm": 0.8666661381721497, + "learning_rate": 4.258064516129032e-05, + "loss": 0.1287, + "step": 266 + }, + { + "epoch": 8.544, + "grad_norm": 0.8430101275444031, + "learning_rate": 4.1612903225806445e-05, + "loss": 0.1475, + "step": 267 + }, + { + "epoch": 8.576, + "grad_norm": 0.7744110822677612, + "learning_rate": 4.0645161290322584e-05, + "loss": 0.1458, + "step": 268 + }, + { + "epoch": 8.608, + "grad_norm": 1.4067776203155518, + "learning_rate": 3.967741935483871e-05, + "loss": 0.2189, + "step": 269 + }, + { + "epoch": 8.64, + "grad_norm": 0.8347670435905457, + "learning_rate": 3.8709677419354835e-05, + "loss": 0.1602, + "step": 270 + }, + { + "epoch": 8.672, + "grad_norm": 0.7643276453018188, + "learning_rate": 3.774193548387096e-05, + "loss": 0.1363, + "step": 271 + }, + { + "epoch": 8.704, + "grad_norm": 0.898059606552124, + "learning_rate": 3.677419354838709e-05, + "loss": 0.156, + "step": 272 + }, + { + "epoch": 8.736, + "grad_norm": 0.8416333198547363, + "learning_rate": 3.5806451612903225e-05, + "loss": 0.1754, + "step": 273 + }, + { + "epoch": 8.768, + "grad_norm": 0.8691906929016113, + "learning_rate": 3.483870967741935e-05, + "loss": 0.1808, + "step": 274 + }, + { + "epoch": 8.8, + "grad_norm": 1.062111496925354, + "learning_rate": 3.387096774193548e-05, + "loss": 0.2559, + "step": 275 + }, + { + "epoch": 8.832, + "grad_norm": 0.881698727607727, + 
"learning_rate": 3.290322580645161e-05, + "loss": 0.1732, + "step": 276 + }, + { + "epoch": 8.864, + "grad_norm": 0.8446074724197388, + "learning_rate": 3.193548387096774e-05, + "loss": 0.1833, + "step": 277 + }, + { + "epoch": 8.896, + "grad_norm": 0.9393475651741028, + "learning_rate": 3.0967741935483865e-05, + "loss": 0.2165, + "step": 278 + }, + { + "epoch": 8.928, + "grad_norm": 0.8838346004486084, + "learning_rate": 2.9999999999999997e-05, + "loss": 0.146, + "step": 279 + }, + { + "epoch": 8.96, + "grad_norm": 0.8380343914031982, + "learning_rate": 2.9032258064516126e-05, + "loss": 0.1721, + "step": 280 + }, + { + "epoch": 8.992, + "grad_norm": 0.8561931252479553, + "learning_rate": 2.8064516129032255e-05, + "loss": 0.1519, + "step": 281 + }, + { + "epoch": 9.024, + "grad_norm": 1.6088253259658813, + "learning_rate": 2.7096774193548384e-05, + "loss": 0.2658, + "step": 282 + }, + { + "epoch": 9.056, + "grad_norm": 0.8154093027114868, + "learning_rate": 2.6129032258064513e-05, + "loss": 0.1693, + "step": 283 + }, + { + "epoch": 9.088, + "grad_norm": 0.7722072005271912, + "learning_rate": 2.5161290322580645e-05, + "loss": 0.1853, + "step": 284 + }, + { + "epoch": 9.12, + "grad_norm": 0.8294870257377625, + "learning_rate": 2.419354838709677e-05, + "loss": 0.1736, + "step": 285 + }, + { + "epoch": 9.152, + "grad_norm": 0.7481442093849182, + "learning_rate": 2.3225806451612902e-05, + "loss": 0.1544, + "step": 286 + }, + { + "epoch": 9.184, + "grad_norm": 0.923413872718811, + "learning_rate": 2.2258064516129028e-05, + "loss": 0.2162, + "step": 287 + }, + { + "epoch": 9.216, + "grad_norm": 0.8326953053474426, + "learning_rate": 2.129032258064516e-05, + "loss": 0.1926, + "step": 288 + }, + { + "epoch": 9.248, + "grad_norm": 0.7642485499382019, + "learning_rate": 2.0322580645161292e-05, + "loss": 0.1555, + "step": 289 + }, + { + "epoch": 9.28, + "grad_norm": 0.7902241945266724, + "learning_rate": 1.9354838709677417e-05, + "loss": 0.1459, + "step": 290 + }, + { + "epoch": 9.312, + "grad_norm": 0.7414844036102295, + "learning_rate": 1.8387096774193546e-05, + "loss": 0.1425, + "step": 291 + }, + { + "epoch": 9.344, + "grad_norm": 0.7870174646377563, + "learning_rate": 1.7419354838709675e-05, + "loss": 0.1853, + "step": 292 + }, + { + "epoch": 9.376, + "grad_norm": 0.9091981649398804, + "learning_rate": 1.6451612903225804e-05, + "loss": 0.1666, + "step": 293 + }, + { + "epoch": 9.408, + "grad_norm": 0.8651584386825562, + "learning_rate": 1.5483870967741933e-05, + "loss": 0.174, + "step": 294 + }, + { + "epoch": 9.44, + "grad_norm": 0.7866891622543335, + "learning_rate": 1.4516129032258063e-05, + "loss": 0.1478, + "step": 295 + }, + { + "epoch": 9.472, + "grad_norm": 0.717932403087616, + "learning_rate": 1.3548387096774192e-05, + "loss": 0.1425, + "step": 296 + }, + { + "epoch": 9.504, + "grad_norm": 1.0217758417129517, + "learning_rate": 1.2580645161290322e-05, + "loss": 0.1574, + "step": 297 + }, + { + "epoch": 9.536, + "grad_norm": 0.8149961829185486, + "learning_rate": 1.1612903225806451e-05, + "loss": 0.1422, + "step": 298 + }, + { + "epoch": 9.568, + "grad_norm": 0.9206218719482422, + "learning_rate": 1.064516129032258e-05, + "loss": 0.1809, + "step": 299 + }, + { + "epoch": 9.6, + "grad_norm": 0.6865082383155823, + "learning_rate": 9.677419354838709e-06, + "loss": 0.133, + "step": 300 + }, + { + "epoch": 9.632, + "grad_norm": 0.7960584759712219, + "learning_rate": 8.709677419354838e-06, + "loss": 0.1289, + "step": 301 + }, + { + "epoch": 9.664, + "grad_norm": 1.4710181951522827, + 
"learning_rate": 7.741935483870966e-06, + "loss": 0.1844, + "step": 302 + }, + { + "epoch": 9.696, + "grad_norm": 0.7321292757987976, + "learning_rate": 6.774193548387096e-06, + "loss": 0.1356, + "step": 303 + }, + { + "epoch": 9.728, + "grad_norm": 0.9279872179031372, + "learning_rate": 5.8064516129032256e-06, + "loss": 0.1842, + "step": 304 + }, + { + "epoch": 9.76, + "grad_norm": 0.790213942527771, + "learning_rate": 4.838709677419354e-06, + "loss": 0.1341, + "step": 305 + }, + { + "epoch": 9.792, + "grad_norm": 0.7292400598526001, + "learning_rate": 3.870967741935483e-06, + "loss": 0.1287, + "step": 306 + }, + { + "epoch": 9.824, + "grad_norm": 0.8236159682273865, + "learning_rate": 2.9032258064516128e-06, + "loss": 0.1721, + "step": 307 + }, + { + "epoch": 9.856, + "grad_norm": 1.0054924488067627, + "learning_rate": 1.9354838709677416e-06, + "loss": 0.19, + "step": 308 + }, + { + "epoch": 9.888, + "grad_norm": 0.8466821312904358, + "learning_rate": 9.677419354838708e-07, + "loss": 0.1742, + "step": 309 + }, + { + "epoch": 9.92, + "grad_norm": 0.7754448652267456, + "learning_rate": 0.0, + "loss": 0.1368, + "step": 310 + } + ], + "logging_steps": 1, + "max_steps": 310, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 7.34728855486464e+16, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-310/training_args.bin b/checkpoint-310/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..01723982396407692a903d785c60e57fcabfa0c4 --- /dev/null +++ b/checkpoint-310/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457c697b05fd5daa3c83df8920300c4940c26fb78ace5b5428b7c95d133a0ef4 +size 5560 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..72ecfeeb7e14d244c936169d2ed139eeae235ef1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6ca9f0b39df7b30b561a2070b66bf0059e2aa9c8 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,44 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": false, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..01723982396407692a903d785c60e57fcabfa0c4 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457c697b05fd5daa3c83df8920300c4940c26fb78ace5b5428b7c95d133a0ef4 +size 5560