diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..205ffc00b0cd5da21716a8d717cdab106074cd3e --- /dev/null +++ b/README.md @@ -0,0 +1,59 @@ +--- +license: other +library_name: peft +tags: +- llama-factory +- lora +- generated_from_trainer +base_model: hfl/chinese-alpaca-2-1.3b +model-index: +- name: train_2024-03-14-05-56-29 + results: [] +--- + + + +# train_2024-03-14-05-56-29 + +This model is a fine-tuned version of [hfl/chinese-alpaca-2-1.3b](https://huggingface.co/hfl/chinese-alpaca-2-1.3b) on the alpaca_zh dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 5e-05 +- train_batch_size: 2 +- eval_batch_size: 8 +- seed: 42 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 16 +- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08 +- lr_scheduler_type: cosine +- num_epochs: 1.0 +- mixed_precision_training: Native AMP + +### Training results + + + +### Framework versions + +- PEFT 0.9.0 +- Transformers 4.38.2 +- Pytorch 2.2.1+cu121 +- Datasets 2.18.0 +- Tokenizers 0.15.2 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f627743fc8052e394d07d8452fb86a248e220795 --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:462bcac6c9586774f4979f614b6ec3f18bbc2f0febcdd1c766687b7f2056c66a +size 2099272 diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b5f4085a932af8484c59597eb4f5cc1bb81a42f8 --- /dev/null +++ b/all_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 0.48, + "train_loss": 2.0602192145127516, + "train_runtime": 801.4891, + "train_samples_per_second": 64.198, + "train_steps_per_second": 4.011 +} \ No newline at end of file diff --git a/checkpoint-100/README.md b/checkpoint-100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-100/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- 
**Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-100/adapter_config.json b/checkpoint-100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-100/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-100/adapter_model.safetensors b/checkpoint-100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e93177b961498c33143fb1d3c1fa99de4f19e01d --- /dev/null +++ b/checkpoint-100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d36ee8ac772f766edf9dbbfd9624d8e73a0cfd65f5df48211d324cc57a4e5c6 +size 2099272 diff --git a/checkpoint-100/optimizer.pt b/checkpoint-100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..023f9eeb4c52e156b65246e54d1a04234318f2ab --- /dev/null +++ b/checkpoint-100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f41177184e0c283aa638b8ba6cde4ccf932b0a8e13fad8e15ae916a4cb18c62 +size 4208302 diff --git a/checkpoint-100/rng_state.pth b/checkpoint-100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..162c42af6b1715633121d91ddaa9ad6d6b894acc --- /dev/null +++ b/checkpoint-100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5b99b025074fe1142a3334c2107e39b98b9d36c2981cb810283beb4adc7ac94 +size 14244 diff --git a/checkpoint-100/scheduler.pt b/checkpoint-100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6c8d3ac24feb7d7a857c7f9590ce22e8b517a51 --- /dev/null +++ b/checkpoint-100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2197384a954a4fcf1ccbd8df7831a0d78a90198a2460b1cc6c71d1497ca1586 +size 1064 diff --git a/checkpoint-100/special_tokens_map.json b/checkpoint-100/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-100/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-100/tokenizer.model b/checkpoint-100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-100/tokenizer_config.json b/checkpoint-100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-100/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-100/trainer_state.json b/checkpoint-100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..86ccc7f1deda8a29046e7d72e06e1642e3c0408a --- /dev/null +++ b/checkpoint-100/trainer_state.json @@ -0,0 +1,161 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.031095735997201383, + "eval_steps": 500, + "global_step": 100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1342876749004800.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100/training_args.bin b/checkpoint-100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. 
(2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e08ac9f1131a9eca827b7983ab68ecdc0e0e746a --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48a2d9599d49f21f6da2043f811c7fc36667538dcd0694bf5ab45264e604eb6a +size 2099272 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..713871392f596e05c0afccf6387210df924ac1f4 --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f006162ce9a8ae562e525479e6cbcc5e65fcf10721c1c878a1eba1f3248032d +size 4208302 diff --git a/checkpoint-1000/rng_state.pth b/checkpoint-1000/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..72558266052bf947b48a003cb72b89f6e3de0769 --- /dev/null +++ b/checkpoint-1000/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bab00d452a5c7da709ec7f6117cea515f8bedf68f40e39d014d87811d017f294 +size 14244 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9af51ea8ab150a610a34adbcb06e117504b159da --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0997e59d7add886c3367f611ab22a16eaa2f60d42a1ebdb93b4ad46ac309297 +size 1064 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json 
new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1000/tokenizer.model b/checkpoint-1000/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-1000/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..af8a9908632620735408ddc8508673478ec4dab4 --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,1421 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.31095735997201385, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + 
{ + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 
1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 
4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + 
"step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + 
"grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 
4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 
2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + } + ], + "logging_steps": 5, + 
"max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.343065816498176e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-1100/README.md b/checkpoint-1100/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-1100/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1100/adapter_config.json b/checkpoint-1100/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-1100/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1100/adapter_model.safetensors b/checkpoint-1100/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0fab7c359f49f806f7a707f85bafc1d3bbaa0194 --- /dev/null +++ b/checkpoint-1100/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a683217eb65ff427c775be7b4f2b17fa3e4d2a1f4ec2c6f932ead2131164d58 +size 2099272 diff --git a/checkpoint-1100/optimizer.pt b/checkpoint-1100/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9331e6086ee91332160742707cb1a78f346b726 --- /dev/null +++ b/checkpoint-1100/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f09f84a51333688d2c1ffa008ee924f88b13e5b05cf3e96c960de16b6a3ec732 +size 4208302 diff --git a/checkpoint-1100/rng_state.pth b/checkpoint-1100/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..634fb2b7b3aace9c27427af47144ca2c2f16720b --- /dev/null +++ b/checkpoint-1100/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea2be7046a8cfae98823a1a5937a6b641a96662a439d14f80e764a9be0f430b4 +size 14244 diff --git a/checkpoint-1100/scheduler.pt b/checkpoint-1100/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..5424712ae48d8b747d8e1acccad0a2e32c6ef196 --- /dev/null +++ b/checkpoint-1100/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ebce50ac9027c187ef9430639e84c374e26350a5f18c89c2fee60ddec9bbbf +size 1064 diff --git a/checkpoint-1100/special_tokens_map.json b/checkpoint-1100/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-1100/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1100/tokenizer.model b/checkpoint-1100/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-1100/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-1100/tokenizer_config.json b/checkpoint-1100/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-1100/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-1100/trainer_state.json b/checkpoint-1100/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c721a7555f4c3a72641e8f61392875b3d43c6890 --- /dev/null +++ b/checkpoint-1100/trainer_state.json @@ -0,0 +1,1561 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.34205309596921524, + "eval_steps": 500, + "global_step": 1100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + 
{ + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 
1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 
4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + 
"step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + 
"grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 
4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 
2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + }, + { + "epoch": 0.31, + 
"grad_norm": 0.9209259748458862, + "learning_rate": 3.8882907640047896e-05, + "loss": 2.0448, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 1.0150959491729736, + "learning_rate": 3.878116101729897e-05, + "loss": 2.0791, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.5959141254425049, + "learning_rate": 3.867908541966594e-05, + "loss": 1.9997, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3945012092590332, + "learning_rate": 3.857668328383088e-05, + "loss": 2.0481, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 1.2361671924591064, + "learning_rate": 3.847395705427075e-05, + "loss": 2.2664, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 1.9661719799041748, + "learning_rate": 3.837090918319909e-05, + "loss": 1.9752, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 1.6995949745178223, + "learning_rate": 3.8267542130507436e-05, + "loss": 2.1332, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 1.1248412132263184, + "learning_rate": 3.816385836370663e-05, + "loss": 2.0432, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 0.8734235763549805, + "learning_rate": 3.805986035786789e-05, + "loss": 1.9618, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 1.322766661643982, + "learning_rate": 3.795555059556378e-05, + "loss": 2.0267, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 1.0396028757095337, + "learning_rate": 3.7850931566808866e-05, + "loss": 2.1075, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 0.9574625492095947, + "learning_rate": 3.7746005769000363e-05, + "loss": 2.156, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.764077570685844e-05, + "loss": 1.9615, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5908560752868652, + "learning_rate": 3.753524389236648e-05, + "loss": 2.0928, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 1.2628813982009888, + "learning_rate": 3.742941284471111e-05, + "loss": 2.1074, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 1.2687503099441528, + "learning_rate": 3.7323285090222054e-05, + "loss": 1.9666, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 1.2571731805801392, + "learning_rate": 3.721686316231181e-05, + "loss": 2.0468, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 1.007453441619873, + "learning_rate": 3.7110149601415215e-05, + "loss": 2.0624, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 1.2390377521514893, + "learning_rate": 3.700314695492876e-05, + "loss": 1.9888, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 1.0878371000289917, + "learning_rate": 3.6895857777149825e-05, + "loss": 2.1013, + "step": 1100 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.478467994517504e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1100/training_args.bin b/checkpoint-1100/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-1100/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-1200/README.md b/checkpoint-1200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-1200/README.md 
@@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1200/adapter_config.json b/checkpoint-1200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-1200/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1200/adapter_model.safetensors b/checkpoint-1200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d3871fd26d9cc993a3a9b582859988bc33199474 --- /dev/null +++ b/checkpoint-1200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7ac7f86b3d698764177b292c0945fc14ade25e4053b2ea32433e2ec468c1c68 +size 2099272 diff --git a/checkpoint-1200/optimizer.pt b/checkpoint-1200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..92ee7636340dd2b5d37d195ef4b533c20a5e0169 --- /dev/null +++ b/checkpoint-1200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f46418bdb2847edff424887e74f54e939ccb878883a90f7033fb72d289847b08 +size 4208302 diff --git a/checkpoint-1200/rng_state.pth b/checkpoint-1200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..b221020e695dc394acd40bb79272217a2f504bec --- /dev/null +++ b/checkpoint-1200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76802f226aa39edc0b86081075bc5ce21c5a32a4f1656a577b0f88858dbbf174 +size 14244 diff --git a/checkpoint-1200/scheduler.pt b/checkpoint-1200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..cf3f988a1e02202f3ce9c66ae845472cd5d86cfc --- /dev/null +++ b/checkpoint-1200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca63f72cf59858dda6b2859e21cee9d57c26194ed3023a7e6e3eb27a883baab6 +size 1064 diff --git a/checkpoint-1200/special_tokens_map.json b/checkpoint-1200/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-1200/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1200/tokenizer.model b/checkpoint-1200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-1200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-1200/tokenizer_config.json b/checkpoint-1200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-1200/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-1200/trainer_state.json b/checkpoint-1200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9b1f058c7a57a56ca7312a3cf5c021abfc3c9cde --- /dev/null +++ b/checkpoint-1200/trainer_state.json @@ -0,0 +1,1701 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3731488319664166, + "eval_steps": 500, + "global_step": 1200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + 
{ + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 
1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 
4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + 
"step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + 
"grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 
4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 
2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + }, + { + "epoch": 0.31, + 
"grad_norm": 0.9209259748458862, + "learning_rate": 3.8882907640047896e-05, + "loss": 2.0448, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 1.0150959491729736, + "learning_rate": 3.878116101729897e-05, + "loss": 2.0791, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.5959141254425049, + "learning_rate": 3.867908541966594e-05, + "loss": 1.9997, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3945012092590332, + "learning_rate": 3.857668328383088e-05, + "loss": 2.0481, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 1.2361671924591064, + "learning_rate": 3.847395705427075e-05, + "loss": 2.2664, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 1.9661719799041748, + "learning_rate": 3.837090918319909e-05, + "loss": 1.9752, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 1.6995949745178223, + "learning_rate": 3.8267542130507436e-05, + "loss": 2.1332, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 1.1248412132263184, + "learning_rate": 3.816385836370663e-05, + "loss": 2.0432, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 0.8734235763549805, + "learning_rate": 3.805986035786789e-05, + "loss": 1.9618, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 1.322766661643982, + "learning_rate": 3.795555059556378e-05, + "loss": 2.0267, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 1.0396028757095337, + "learning_rate": 3.7850931566808866e-05, + "loss": 2.1075, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 0.9574625492095947, + "learning_rate": 3.7746005769000363e-05, + "loss": 2.156, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.764077570685844e-05, + "loss": 1.9615, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5908560752868652, + "learning_rate": 3.753524389236648e-05, + "loss": 2.0928, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 1.2628813982009888, + "learning_rate": 3.742941284471111e-05, + "loss": 2.1074, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 1.2687503099441528, + "learning_rate": 3.7323285090222054e-05, + "loss": 1.9666, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 1.2571731805801392, + "learning_rate": 3.721686316231181e-05, + "loss": 2.0468, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 1.007453441619873, + "learning_rate": 3.7110149601415215e-05, + "loss": 2.0624, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 1.2390377521514893, + "learning_rate": 3.700314695492876e-05, + "loss": 1.9888, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 1.0878371000289917, + "learning_rate": 3.6895857777149825e-05, + "loss": 2.1013, + "step": 1100 + }, + { + "epoch": 0.34, + "grad_norm": 0.8759217262268066, + "learning_rate": 3.6788284629215624e-05, + "loss": 1.875, + "step": 1105 + }, + { + "epoch": 0.35, + "grad_norm": 1.1345970630645752, + "learning_rate": 3.668043007904219e-05, + "loss": 1.9096, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 1.253629446029663, + "learning_rate": 3.6572296701262966e-05, + "loss": 2.1859, + "step": 1115 + }, + { + "epoch": 0.35, + "grad_norm": 0.9796190857887268, + "learning_rate": 3.646388707716738e-05, + "loss": 2.2092, + "step": 1120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3893767595291138, + "learning_rate": 3.635520379463926e-05, + "loss": 2.0026, + "step": 1125 + }, + { + "epoch": 0.35, + "grad_norm": 0.8778309226036072, + "learning_rate": 3.6246249448095004e-05, + "loss": 2.2112, + "step": 1130 + }, + { + "epoch": 0.35, + "grad_norm": 
1.2479698657989502, + "learning_rate": 3.6137026638421696e-05, + "loss": 2.0221, + "step": 1135 + }, + { + "epoch": 0.35, + "grad_norm": 1.3813824653625488, + "learning_rate": 3.6027537972914974e-05, + "loss": 1.9106, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 1.2043218612670898, + "learning_rate": 3.5917786065216826e-05, + "loss": 2.0673, + "step": 1145 + }, + { + "epoch": 0.36, + "grad_norm": 1.5337340831756592, + "learning_rate": 3.580777353525318e-05, + "loss": 2.1463, + "step": 1150 + }, + { + "epoch": 0.36, + "grad_norm": 1.155813455581665, + "learning_rate": 3.5697503009171385e-05, + "loss": 2.0255, + "step": 1155 + }, + { + "epoch": 0.36, + "grad_norm": 1.034644365310669, + "learning_rate": 3.558697711927748e-05, + "loss": 2.1348, + "step": 1160 + }, + { + "epoch": 0.36, + "grad_norm": 1.0959795713424683, + "learning_rate": 3.54761985039734e-05, + "loss": 2.1457, + "step": 1165 + }, + { + "epoch": 0.36, + "grad_norm": 1.1938838958740234, + "learning_rate": 3.5365169807693966e-05, + "loss": 2.1256, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.8162047863006592, + "learning_rate": 3.525389368084379e-05, + "loss": 1.9587, + "step": 1175 + }, + { + "epoch": 0.37, + "grad_norm": 0.9358930587768555, + "learning_rate": 3.514237277973393e-05, + "loss": 1.8965, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.9210988879203796, + "learning_rate": 3.503060976651862e-05, + "loss": 1.9669, + "step": 1185 + }, + { + "epoch": 0.37, + "grad_norm": 1.4641343355178833, + "learning_rate": 3.491860730913156e-05, + "loss": 2.003, + "step": 1190 + }, + { + "epoch": 0.37, + "grad_norm": 1.2458257675170898, + "learning_rate": 3.480636808122235e-05, + "loss": 2.1487, + "step": 1195 + }, + { + "epoch": 0.37, + "grad_norm": 1.6770122051239014, + "learning_rate": 3.469389476209259e-05, + "loss": 2.0686, + "step": 1200 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.613880123457536e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1200/training_args.bin b/checkpoint-1200/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-1200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-1300/README.md b/checkpoint-1300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-1300/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + 
+[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1300/adapter_config.json b/checkpoint-1300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-1300/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1300/adapter_model.safetensors b/checkpoint-1300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e0382739e62785a838f827ecd8282f87b0c726aa --- /dev/null +++ b/checkpoint-1300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e7f5a3debe200cebce199a766f211c3a049b8d6f10373d8f3f83faa87c3e960b +size 2099272 diff --git a/checkpoint-1300/optimizer.pt b/checkpoint-1300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3129b4c94ab5bfbc61d1d7e9ea30089521b170f2 --- /dev/null +++ b/checkpoint-1300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87eabd303a694623686d7f654d52392eddab6262cd4355830697686f67c0a855 +size 4208302 diff --git a/checkpoint-1300/rng_state.pth b/checkpoint-1300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..35939a6ba3357b9f0e8c27610c512092142b7951 --- /dev/null +++ b/checkpoint-1300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4791e25833f91f30ef2f35dd4e766076d71c0375d8c0095c91afb487cabff9a0 +size 14244 diff --git a/checkpoint-1300/scheduler.pt b/checkpoint-1300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..620239d9d62203c933d741ea700759304d84d8d7 --- /dev/null +++ b/checkpoint-1300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dc06e39f1e7d640266c26ad77111cfa1a6963193bf9dba5df25b2337808bbe7 +size 1064 diff --git a/checkpoint-1300/special_tokens_map.json b/checkpoint-1300/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-1300/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1300/tokenizer.model b/checkpoint-1300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-1300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-1300/tokenizer_config.json b/checkpoint-1300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-1300/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-1300/trainer_state.json b/checkpoint-1300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..7533c209de558de20f04010af25f164baf7b197b --- /dev/null +++ b/checkpoint-1300/trainer_state.json @@ -0,0 +1,1841 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.40424456796361796, + "eval_steps": 500, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + 
{ + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 
1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 
4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + 
"step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + 
"grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 
4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 
2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + }, + { + "epoch": 0.31, + 
"grad_norm": 0.9209259748458862, + "learning_rate": 3.8882907640047896e-05, + "loss": 2.0448, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 1.0150959491729736, + "learning_rate": 3.878116101729897e-05, + "loss": 2.0791, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.5959141254425049, + "learning_rate": 3.867908541966594e-05, + "loss": 1.9997, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3945012092590332, + "learning_rate": 3.857668328383088e-05, + "loss": 2.0481, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 1.2361671924591064, + "learning_rate": 3.847395705427075e-05, + "loss": 2.2664, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 1.9661719799041748, + "learning_rate": 3.837090918319909e-05, + "loss": 1.9752, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 1.6995949745178223, + "learning_rate": 3.8267542130507436e-05, + "loss": 2.1332, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 1.1248412132263184, + "learning_rate": 3.816385836370663e-05, + "loss": 2.0432, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 0.8734235763549805, + "learning_rate": 3.805986035786789e-05, + "loss": 1.9618, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 1.322766661643982, + "learning_rate": 3.795555059556378e-05, + "loss": 2.0267, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 1.0396028757095337, + "learning_rate": 3.7850931566808866e-05, + "loss": 2.1075, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 0.9574625492095947, + "learning_rate": 3.7746005769000363e-05, + "loss": 2.156, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.764077570685844e-05, + "loss": 1.9615, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5908560752868652, + "learning_rate": 3.753524389236648e-05, + "loss": 2.0928, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 1.2628813982009888, + "learning_rate": 3.742941284471111e-05, + "loss": 2.1074, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 1.2687503099441528, + "learning_rate": 3.7323285090222054e-05, + "loss": 1.9666, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 1.2571731805801392, + "learning_rate": 3.721686316231181e-05, + "loss": 2.0468, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 1.007453441619873, + "learning_rate": 3.7110149601415215e-05, + "loss": 2.0624, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 1.2390377521514893, + "learning_rate": 3.700314695492876e-05, + "loss": 1.9888, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 1.0878371000289917, + "learning_rate": 3.6895857777149825e-05, + "loss": 2.1013, + "step": 1100 + }, + { + "epoch": 0.34, + "grad_norm": 0.8759217262268066, + "learning_rate": 3.6788284629215624e-05, + "loss": 1.875, + "step": 1105 + }, + { + "epoch": 0.35, + "grad_norm": 1.1345970630645752, + "learning_rate": 3.668043007904219e-05, + "loss": 1.9096, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 1.253629446029663, + "learning_rate": 3.6572296701262966e-05, + "loss": 2.1859, + "step": 1115 + }, + { + "epoch": 0.35, + "grad_norm": 0.9796190857887268, + "learning_rate": 3.646388707716738e-05, + "loss": 2.2092, + "step": 1120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3893767595291138, + "learning_rate": 3.635520379463926e-05, + "loss": 2.0026, + "step": 1125 + }, + { + "epoch": 0.35, + "grad_norm": 0.8778309226036072, + "learning_rate": 3.6246249448095004e-05, + "loss": 2.2112, + "step": 1130 + }, + { + "epoch": 0.35, + "grad_norm": 
1.2479698657989502, + "learning_rate": 3.6137026638421696e-05, + "loss": 2.0221, + "step": 1135 + }, + { + "epoch": 0.35, + "grad_norm": 1.3813824653625488, + "learning_rate": 3.6027537972914974e-05, + "loss": 1.9106, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 1.2043218612670898, + "learning_rate": 3.5917786065216826e-05, + "loss": 2.0673, + "step": 1145 + }, + { + "epoch": 0.36, + "grad_norm": 1.5337340831756592, + "learning_rate": 3.580777353525318e-05, + "loss": 2.1463, + "step": 1150 + }, + { + "epoch": 0.36, + "grad_norm": 1.155813455581665, + "learning_rate": 3.5697503009171385e-05, + "loss": 2.0255, + "step": 1155 + }, + { + "epoch": 0.36, + "grad_norm": 1.034644365310669, + "learning_rate": 3.558697711927748e-05, + "loss": 2.1348, + "step": 1160 + }, + { + "epoch": 0.36, + "grad_norm": 1.0959795713424683, + "learning_rate": 3.54761985039734e-05, + "loss": 2.1457, + "step": 1165 + }, + { + "epoch": 0.36, + "grad_norm": 1.1938838958740234, + "learning_rate": 3.5365169807693966e-05, + "loss": 2.1256, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.8162047863006592, + "learning_rate": 3.525389368084379e-05, + "loss": 1.9587, + "step": 1175 + }, + { + "epoch": 0.37, + "grad_norm": 0.9358930587768555, + "learning_rate": 3.514237277973393e-05, + "loss": 1.8965, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.9210988879203796, + "learning_rate": 3.503060976651862e-05, + "loss": 1.9669, + "step": 1185 + }, + { + "epoch": 0.37, + "grad_norm": 1.4641343355178833, + "learning_rate": 3.491860730913156e-05, + "loss": 2.003, + "step": 1190 + }, + { + "epoch": 0.37, + "grad_norm": 1.2458257675170898, + "learning_rate": 3.480636808122235e-05, + "loss": 2.1487, + "step": 1195 + }, + { + "epoch": 0.37, + "grad_norm": 1.6770122051239014, + "learning_rate": 3.469389476209259e-05, + "loss": 2.0686, + "step": 1200 + }, + { + "epoch": 0.37, + "grad_norm": 0.9083845019340515, + "learning_rate": 3.458119003663199e-05, + "loss": 2.0284, + "step": 1205 + }, + { + "epoch": 0.38, + "grad_norm": 1.2679696083068848, + "learning_rate": 3.446825659525421e-05, + "loss": 2.0555, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 1.3823720216751099, + "learning_rate": 3.435509713383268e-05, + "loss": 1.9375, + "step": 1215 + }, + { + "epoch": 0.38, + "grad_norm": 1.5862077474594116, + "learning_rate": 3.424171435363623e-05, + "loss": 2.0271, + "step": 1220 + }, + { + "epoch": 0.38, + "grad_norm": 2.0107533931732178, + "learning_rate": 3.412811096126461e-05, + "loss": 2.1897, + "step": 1225 + }, + { + "epoch": 0.38, + "grad_norm": 1.4544458389282227, + "learning_rate": 3.401428966858387e-05, + "loss": 1.9978, + "step": 1230 + }, + { + "epoch": 0.38, + "grad_norm": 1.188170075416565, + "learning_rate": 3.390025319266167e-05, + "loss": 2.0688, + "step": 1235 + }, + { + "epoch": 0.39, + "grad_norm": 1.1016322374343872, + "learning_rate": 3.3786004255702336e-05, + "loss": 2.0396, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 1.6623334884643555, + "learning_rate": 3.3671545584981954e-05, + "loss": 1.9566, + "step": 1245 + }, + { + "epoch": 0.39, + "grad_norm": 0.9161584377288818, + "learning_rate": 3.355687991278324e-05, + "loss": 2.0474, + "step": 1250 + }, + { + "epoch": 0.39, + "grad_norm": 0.9911025166511536, + "learning_rate": 3.3442009976330305e-05, + "loss": 2.2163, + "step": 1255 + }, + { + "epoch": 0.39, + "grad_norm": 1.1504255533218384, + "learning_rate": 3.332693851772331e-05, + "loss": 2.1088, + "step": 1260 + }, + { + "epoch": 0.39, + "grad_norm": 
0.9544184803962708, + "learning_rate": 3.3211668283873035e-05, + "loss": 1.8947, + "step": 1265 + }, + { + "epoch": 0.39, + "grad_norm": 1.4625756740570068, + "learning_rate": 3.3096202026435304e-05, + "loss": 2.1748, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 1.3267475366592407, + "learning_rate": 3.298054250174527e-05, + "loss": 1.9218, + "step": 1275 + }, + { + "epoch": 0.4, + "grad_norm": 0.9869363903999329, + "learning_rate": 3.2864692470751654e-05, + "loss": 2.2723, + "step": 1280 + }, + { + "epoch": 0.4, + "grad_norm": 1.5177838802337646, + "learning_rate": 3.27486546989508e-05, + "loss": 2.1456, + "step": 1285 + }, + { + "epoch": 0.4, + "grad_norm": 1.1998714208602905, + "learning_rate": 3.263243195632068e-05, + "loss": 1.8877, + "step": 1290 + }, + { + "epoch": 0.4, + "grad_norm": 1.2112164497375488, + "learning_rate": 3.2516027017254785e-05, + "loss": 2.0615, + "step": 1295 + }, + { + "epoch": 0.4, + "grad_norm": 1.0616129636764526, + "learning_rate": 3.239944266049587e-05, + "loss": 2.0402, + "step": 1300 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.747451332067328e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1300/training_args.bin b/checkpoint-1300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-1300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-1400/README.md b/checkpoint-1400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-1400/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
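
The model card leaves this section as a placeholder, and the checkpoint folders in this diff contain only LoRA adapter weights (adapter_model.safetensors, about 2 MB) plus tokenizer files, not a full model. As a stopgap, here is a minimal sketch of how such an adapter could be applied on top of the hfl/chinese-alpaca-2-1.3b base model with transformers and peft (the versions listed in the framework sections). The local path `checkpoint-1400`, the example prompt, and the generation settings are illustrative assumptions, not part of the repository.

```python
# Minimal sketch (not from the original repository): load the base model,
# attach the LoRA adapter saved in one of these checkpoint folders, and run
# a single chat-style generation.
# Assumptions: torch, transformers (>=4.38) and peft (>=0.9) are installed;
# "checkpoint-1400" is a hypothetical local path to a checkpoint folder like
# the ones in this diff (the repository root adapter works the same way).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_id = "hfl/chinese-alpaca-2-1.3b"
adapter_path = "checkpoint-1400"  # illustrative path; any dir holding adapter_model.safetensors

device = "cuda" if torch.cuda.is_available() else "cpu"

# The checkpoint folder ships its own tokenizer files (tokenizer.model,
# tokenizer_config.json, special_tokens_map.json), so load the tokenizer from it.
tokenizer = AutoTokenizer.from_pretrained(adapter_path, use_fast=False)

# Load the base model, then wrap it with the LoRA adapter weights.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
)
model = PeftModel.from_pretrained(base_model, adapter_path).to(device)
model.eval()

# tokenizer_config.json defines a Llama-2-style chat template, so
# apply_chat_template can build the prompt string directly.
messages = [{"role": "user", "content": "请用一句话介绍一下你自己。"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

inputs = tokenizer(prompt, return_tensors="pt").to(device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)

# Strip the prompt tokens and decode only the newly generated continuation.
new_tokens = output_ids[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True))
```

If a single merged model is preferred, PEFT's `merge_and_unload()` can fold the adapter into the base weights before saving; with r=8 applied only to q_proj and v_proj, the adapter itself stays at roughly 2 MB, which matches the adapter_model.safetensors size recorded in this diff.
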
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1400/adapter_config.json b/checkpoint-1400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-1400/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1400/adapter_model.safetensors b/checkpoint-1400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..eaa8b2650253ebfcf941d1c16402912ba0b8cbf6 --- /dev/null +++ b/checkpoint-1400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30abf59f7e85e60d6fc510e94a01f896e2ef14b837a8b3f56ac7d4bb9a248e9c +size 2099272 diff --git a/checkpoint-1400/optimizer.pt b/checkpoint-1400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..3a7f029ddb416fca8d95340f936edf538016445d --- /dev/null +++ b/checkpoint-1400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:fafb6357aceb71a6ccddedd4839e0ef4f366976d7d2e2245e1422ecc10b7e69c +size 4208302 diff --git a/checkpoint-1400/rng_state.pth b/checkpoint-1400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f33244e2e624fdb74b459fefcf7646f046d95e23 --- /dev/null +++ b/checkpoint-1400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77b2546a9b2821ed4272c4daa26fcfa05f024372bf736fd11b02713ac1401a37 +size 14244 diff --git a/checkpoint-1400/scheduler.pt b/checkpoint-1400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e6511dff5368667fbf76bca367bf31a711991b2 --- /dev/null +++ b/checkpoint-1400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e15b497f0a675bff3c8aab7f24bf8b46dabf69d9eb519a60f1c8b5f7ccc2be1c +size 1064 diff --git a/checkpoint-1400/special_tokens_map.json b/checkpoint-1400/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-1400/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1400/tokenizer.model b/checkpoint-1400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-1400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-1400/tokenizer_config.json b/checkpoint-1400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-1400/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-1400/trainer_state.json b/checkpoint-1400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..14654d90543430d5e16193ecb647812ae3bd312d --- /dev/null +++ b/checkpoint-1400/trainer_state.json @@ -0,0 +1,1981 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.43534030396081935, + "eval_steps": 500, + "global_step": 1400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + 
{ + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 
1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 
4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + 
"step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + 
"grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 
4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 
2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + }, + { + "epoch": 0.31, + 
"grad_norm": 0.9209259748458862, + "learning_rate": 3.8882907640047896e-05, + "loss": 2.0448, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 1.0150959491729736, + "learning_rate": 3.878116101729897e-05, + "loss": 2.0791, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.5959141254425049, + "learning_rate": 3.867908541966594e-05, + "loss": 1.9997, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3945012092590332, + "learning_rate": 3.857668328383088e-05, + "loss": 2.0481, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 1.2361671924591064, + "learning_rate": 3.847395705427075e-05, + "loss": 2.2664, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 1.9661719799041748, + "learning_rate": 3.837090918319909e-05, + "loss": 1.9752, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 1.6995949745178223, + "learning_rate": 3.8267542130507436e-05, + "loss": 2.1332, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 1.1248412132263184, + "learning_rate": 3.816385836370663e-05, + "loss": 2.0432, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 0.8734235763549805, + "learning_rate": 3.805986035786789e-05, + "loss": 1.9618, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 1.322766661643982, + "learning_rate": 3.795555059556378e-05, + "loss": 2.0267, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 1.0396028757095337, + "learning_rate": 3.7850931566808866e-05, + "loss": 2.1075, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 0.9574625492095947, + "learning_rate": 3.7746005769000363e-05, + "loss": 2.156, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.764077570685844e-05, + "loss": 1.9615, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5908560752868652, + "learning_rate": 3.753524389236648e-05, + "loss": 2.0928, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 1.2628813982009888, + "learning_rate": 3.742941284471111e-05, + "loss": 2.1074, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 1.2687503099441528, + "learning_rate": 3.7323285090222054e-05, + "loss": 1.9666, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 1.2571731805801392, + "learning_rate": 3.721686316231181e-05, + "loss": 2.0468, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 1.007453441619873, + "learning_rate": 3.7110149601415215e-05, + "loss": 2.0624, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 1.2390377521514893, + "learning_rate": 3.700314695492876e-05, + "loss": 1.9888, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 1.0878371000289917, + "learning_rate": 3.6895857777149825e-05, + "loss": 2.1013, + "step": 1100 + }, + { + "epoch": 0.34, + "grad_norm": 0.8759217262268066, + "learning_rate": 3.6788284629215624e-05, + "loss": 1.875, + "step": 1105 + }, + { + "epoch": 0.35, + "grad_norm": 1.1345970630645752, + "learning_rate": 3.668043007904219e-05, + "loss": 1.9096, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 1.253629446029663, + "learning_rate": 3.6572296701262966e-05, + "loss": 2.1859, + "step": 1115 + }, + { + "epoch": 0.35, + "grad_norm": 0.9796190857887268, + "learning_rate": 3.646388707716738e-05, + "loss": 2.2092, + "step": 1120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3893767595291138, + "learning_rate": 3.635520379463926e-05, + "loss": 2.0026, + "step": 1125 + }, + { + "epoch": 0.35, + "grad_norm": 0.8778309226036072, + "learning_rate": 3.6246249448095004e-05, + "loss": 2.2112, + "step": 1130 + }, + { + "epoch": 0.35, + "grad_norm": 
1.2479698657989502, + "learning_rate": 3.6137026638421696e-05, + "loss": 2.0221, + "step": 1135 + }, + { + "epoch": 0.35, + "grad_norm": 1.3813824653625488, + "learning_rate": 3.6027537972914974e-05, + "loss": 1.9106, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 1.2043218612670898, + "learning_rate": 3.5917786065216826e-05, + "loss": 2.0673, + "step": 1145 + }, + { + "epoch": 0.36, + "grad_norm": 1.5337340831756592, + "learning_rate": 3.580777353525318e-05, + "loss": 2.1463, + "step": 1150 + }, + { + "epoch": 0.36, + "grad_norm": 1.155813455581665, + "learning_rate": 3.5697503009171385e-05, + "loss": 2.0255, + "step": 1155 + }, + { + "epoch": 0.36, + "grad_norm": 1.034644365310669, + "learning_rate": 3.558697711927748e-05, + "loss": 2.1348, + "step": 1160 + }, + { + "epoch": 0.36, + "grad_norm": 1.0959795713424683, + "learning_rate": 3.54761985039734e-05, + "loss": 2.1457, + "step": 1165 + }, + { + "epoch": 0.36, + "grad_norm": 1.1938838958740234, + "learning_rate": 3.5365169807693966e-05, + "loss": 2.1256, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.8162047863006592, + "learning_rate": 3.525389368084379e-05, + "loss": 1.9587, + "step": 1175 + }, + { + "epoch": 0.37, + "grad_norm": 0.9358930587768555, + "learning_rate": 3.514237277973393e-05, + "loss": 1.8965, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.9210988879203796, + "learning_rate": 3.503060976651862e-05, + "loss": 1.9669, + "step": 1185 + }, + { + "epoch": 0.37, + "grad_norm": 1.4641343355178833, + "learning_rate": 3.491860730913156e-05, + "loss": 2.003, + "step": 1190 + }, + { + "epoch": 0.37, + "grad_norm": 1.2458257675170898, + "learning_rate": 3.480636808122235e-05, + "loss": 2.1487, + "step": 1195 + }, + { + "epoch": 0.37, + "grad_norm": 1.6770122051239014, + "learning_rate": 3.469389476209259e-05, + "loss": 2.0686, + "step": 1200 + }, + { + "epoch": 0.37, + "grad_norm": 0.9083845019340515, + "learning_rate": 3.458119003663199e-05, + "loss": 2.0284, + "step": 1205 + }, + { + "epoch": 0.38, + "grad_norm": 1.2679696083068848, + "learning_rate": 3.446825659525421e-05, + "loss": 2.0555, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 1.3823720216751099, + "learning_rate": 3.435509713383268e-05, + "loss": 1.9375, + "step": 1215 + }, + { + "epoch": 0.38, + "grad_norm": 1.5862077474594116, + "learning_rate": 3.424171435363623e-05, + "loss": 2.0271, + "step": 1220 + }, + { + "epoch": 0.38, + "grad_norm": 2.0107533931732178, + "learning_rate": 3.412811096126461e-05, + "loss": 2.1897, + "step": 1225 + }, + { + "epoch": 0.38, + "grad_norm": 1.4544458389282227, + "learning_rate": 3.401428966858387e-05, + "loss": 1.9978, + "step": 1230 + }, + { + "epoch": 0.38, + "grad_norm": 1.188170075416565, + "learning_rate": 3.390025319266167e-05, + "loss": 2.0688, + "step": 1235 + }, + { + "epoch": 0.39, + "grad_norm": 1.1016322374343872, + "learning_rate": 3.3786004255702336e-05, + "loss": 2.0396, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 1.6623334884643555, + "learning_rate": 3.3671545584981954e-05, + "loss": 1.9566, + "step": 1245 + }, + { + "epoch": 0.39, + "grad_norm": 0.9161584377288818, + "learning_rate": 3.355687991278324e-05, + "loss": 2.0474, + "step": 1250 + }, + { + "epoch": 0.39, + "grad_norm": 0.9911025166511536, + "learning_rate": 3.3442009976330305e-05, + "loss": 2.2163, + "step": 1255 + }, + { + "epoch": 0.39, + "grad_norm": 1.1504255533218384, + "learning_rate": 3.332693851772331e-05, + "loss": 2.1088, + "step": 1260 + }, + { + "epoch": 0.39, + "grad_norm": 
0.9544184803962708, + "learning_rate": 3.3211668283873035e-05, + "loss": 1.8947, + "step": 1265 + }, + { + "epoch": 0.39, + "grad_norm": 1.4625756740570068, + "learning_rate": 3.3096202026435304e-05, + "loss": 2.1748, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 1.3267475366592407, + "learning_rate": 3.298054250174527e-05, + "loss": 1.9218, + "step": 1275 + }, + { + "epoch": 0.4, + "grad_norm": 0.9869363903999329, + "learning_rate": 3.2864692470751654e-05, + "loss": 2.2723, + "step": 1280 + }, + { + "epoch": 0.4, + "grad_norm": 1.5177838802337646, + "learning_rate": 3.27486546989508e-05, + "loss": 2.1456, + "step": 1285 + }, + { + "epoch": 0.4, + "grad_norm": 1.1998714208602905, + "learning_rate": 3.263243195632068e-05, + "loss": 1.8877, + "step": 1290 + }, + { + "epoch": 0.4, + "grad_norm": 1.2112164497375488, + "learning_rate": 3.2516027017254785e-05, + "loss": 2.0615, + "step": 1295 + }, + { + "epoch": 0.4, + "grad_norm": 1.0616129636764526, + "learning_rate": 3.239944266049587e-05, + "loss": 2.0402, + "step": 1300 + }, + { + "epoch": 0.41, + "grad_norm": 1.4537287950515747, + "learning_rate": 3.228268166906962e-05, + "loss": 2.0728, + "step": 1305 + }, + { + "epoch": 0.41, + "grad_norm": 1.3899391889572144, + "learning_rate": 3.2165746830218254e-05, + "loss": 2.1815, + "step": 1310 + }, + { + "epoch": 0.41, + "grad_norm": 1.332529067993164, + "learning_rate": 3.204864093533394e-05, + "loss": 1.8935, + "step": 1315 + }, + { + "epoch": 0.41, + "grad_norm": 1.4466496706008911, + "learning_rate": 3.193136677989221e-05, + "loss": 1.9567, + "step": 1320 + }, + { + "epoch": 0.41, + "grad_norm": 1.1781721115112305, + "learning_rate": 3.181392716338516e-05, + "loss": 2.055, + "step": 1325 + }, + { + "epoch": 0.41, + "grad_norm": 0.9411901831626892, + "learning_rate": 3.1696324889254716e-05, + "loss": 1.8794, + "step": 1330 + }, + { + "epoch": 0.42, + "grad_norm": 1.2628341913223267, + "learning_rate": 3.15785627648256e-05, + "loss": 2.0299, + "step": 1335 + }, + { + "epoch": 0.42, + "grad_norm": 1.4857370853424072, + "learning_rate": 3.146064360123846e-05, + "loss": 1.9342, + "step": 1340 + }, + { + "epoch": 0.42, + "grad_norm": 1.661470651626587, + "learning_rate": 3.1342570213382594e-05, + "loss": 2.0399, + "step": 1345 + }, + { + "epoch": 0.42, + "grad_norm": 1.522845983505249, + "learning_rate": 3.122434541982888e-05, + "loss": 2.1419, + "step": 1350 + }, + { + "epoch": 0.42, + "grad_norm": 1.5679118633270264, + "learning_rate": 3.110597204276247e-05, + "loss": 2.2932, + "step": 1355 + }, + { + "epoch": 0.42, + "grad_norm": 1.3367788791656494, + "learning_rate": 3.098745290791539e-05, + "loss": 1.8989, + "step": 1360 + }, + { + "epoch": 0.42, + "grad_norm": 1.3873472213745117, + "learning_rate": 3.086879084449907e-05, + "loss": 2.1214, + "step": 1365 + }, + { + "epoch": 0.43, + "grad_norm": 1.2957035303115845, + "learning_rate": 3.074998868513688e-05, + "loss": 2.2538, + "step": 1370 + }, + { + "epoch": 0.43, + "grad_norm": 1.122176170349121, + "learning_rate": 3.0631049265796465e-05, + "loss": 2.0974, + "step": 1375 + }, + { + "epoch": 0.43, + "grad_norm": 1.0422618389129639, + "learning_rate": 3.051197542572203e-05, + "loss": 2.054, + "step": 1380 + }, + { + "epoch": 0.43, + "grad_norm": 1.1926140785217285, + "learning_rate": 3.0392770007366584e-05, + "loss": 1.9798, + "step": 1385 + }, + { + "epoch": 0.43, + "grad_norm": 0.8764025568962097, + "learning_rate": 3.0273435856324112e-05, + "loss": 2.0796, + "step": 1390 + }, + { + "epoch": 0.43, + "grad_norm": 0.8200764656066895, + 
"learning_rate": 3.0153975821261605e-05, + "loss": 1.9116, + "step": 1395 + }, + { + "epoch": 0.44, + "grad_norm": 1.0340498685836792, + "learning_rate": 3.0034392753851066e-05, + "loss": 2.0235, + "step": 1400 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.882972921135104e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1400/training_args.bin b/checkpoint-1400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-1400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-1500/README.md b/checkpoint-1500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-1500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-1500/adapter_config.json b/checkpoint-1500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-1500/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1500/adapter_model.safetensors b/checkpoint-1500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..6c2d95973ab5dcc39d138d45ed82277df67fe94c --- /dev/null +++ b/checkpoint-1500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f60d025081c52abcefd12bd979226e64c15fd546a1ee3e1adf198151a00500 +size 2099272 diff --git a/checkpoint-1500/optimizer.pt b/checkpoint-1500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..86366385128e9283e92abda1cbfbc83f9402ca08 --- /dev/null +++ b/checkpoint-1500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a0a5e958488fbf75d0f567d49412c03fdbb981c6b650f3bdb392b7fa1d76a1 +size 4208302 diff --git a/checkpoint-1500/rng_state.pth b/checkpoint-1500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..d0432b613bb44fb8774aebf9e899af9d1f00491d --- /dev/null +++ b/checkpoint-1500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73915b1ab178881d6013bb6a6fe6df32c58dc47c81965c3d636ff61253042ecb +size 14244 diff --git a/checkpoint-1500/scheduler.pt b/checkpoint-1500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b0ebc24aa6ed81d67da211547b65c827d85b097 --- /dev/null +++ b/checkpoint-1500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e6dbea6b02ec35a2eb009d4eef91b1c0ffa2c9fe7ffea15f5ebda80bcb5333 +size 1064 diff --git a/checkpoint-1500/special_tokens_map.json b/checkpoint-1500/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-1500/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1500/tokenizer.model b/checkpoint-1500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-1500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-1500/tokenizer_config.json b/checkpoint-1500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-1500/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-1500/trainer_state.json b/checkpoint-1500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..acd2e02b84fe640d3cb9761fcd12113bbe806d38 --- /dev/null +++ b/checkpoint-1500/trainer_state.json @@ -0,0 +1,2121 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.46643603995802074, + "eval_steps": 500, + "global_step": 1500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + 
{ + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 
1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 
4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + 
"step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + 
"grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 
4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 
2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + }, + { + "epoch": 0.31, + 
"grad_norm": 0.9209259748458862, + "learning_rate": 3.8882907640047896e-05, + "loss": 2.0448, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 1.0150959491729736, + "learning_rate": 3.878116101729897e-05, + "loss": 2.0791, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.5959141254425049, + "learning_rate": 3.867908541966594e-05, + "loss": 1.9997, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3945012092590332, + "learning_rate": 3.857668328383088e-05, + "loss": 2.0481, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 1.2361671924591064, + "learning_rate": 3.847395705427075e-05, + "loss": 2.2664, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 1.9661719799041748, + "learning_rate": 3.837090918319909e-05, + "loss": 1.9752, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 1.6995949745178223, + "learning_rate": 3.8267542130507436e-05, + "loss": 2.1332, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 1.1248412132263184, + "learning_rate": 3.816385836370663e-05, + "loss": 2.0432, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 0.8734235763549805, + "learning_rate": 3.805986035786789e-05, + "loss": 1.9618, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 1.322766661643982, + "learning_rate": 3.795555059556378e-05, + "loss": 2.0267, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 1.0396028757095337, + "learning_rate": 3.7850931566808866e-05, + "loss": 2.1075, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 0.9574625492095947, + "learning_rate": 3.7746005769000363e-05, + "loss": 2.156, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.764077570685844e-05, + "loss": 1.9615, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5908560752868652, + "learning_rate": 3.753524389236648e-05, + "loss": 2.0928, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 1.2628813982009888, + "learning_rate": 3.742941284471111e-05, + "loss": 2.1074, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 1.2687503099441528, + "learning_rate": 3.7323285090222054e-05, + "loss": 1.9666, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 1.2571731805801392, + "learning_rate": 3.721686316231181e-05, + "loss": 2.0468, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 1.007453441619873, + "learning_rate": 3.7110149601415215e-05, + "loss": 2.0624, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 1.2390377521514893, + "learning_rate": 3.700314695492876e-05, + "loss": 1.9888, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 1.0878371000289917, + "learning_rate": 3.6895857777149825e-05, + "loss": 2.1013, + "step": 1100 + }, + { + "epoch": 0.34, + "grad_norm": 0.8759217262268066, + "learning_rate": 3.6788284629215624e-05, + "loss": 1.875, + "step": 1105 + }, + { + "epoch": 0.35, + "grad_norm": 1.1345970630645752, + "learning_rate": 3.668043007904219e-05, + "loss": 1.9096, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 1.253629446029663, + "learning_rate": 3.6572296701262966e-05, + "loss": 2.1859, + "step": 1115 + }, + { + "epoch": 0.35, + "grad_norm": 0.9796190857887268, + "learning_rate": 3.646388707716738e-05, + "loss": 2.2092, + "step": 1120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3893767595291138, + "learning_rate": 3.635520379463926e-05, + "loss": 2.0026, + "step": 1125 + }, + { + "epoch": 0.35, + "grad_norm": 0.8778309226036072, + "learning_rate": 3.6246249448095004e-05, + "loss": 2.2112, + "step": 1130 + }, + { + "epoch": 0.35, + "grad_norm": 
1.2479698657989502, + "learning_rate": 3.6137026638421696e-05, + "loss": 2.0221, + "step": 1135 + }, + { + "epoch": 0.35, + "grad_norm": 1.3813824653625488, + "learning_rate": 3.6027537972914974e-05, + "loss": 1.9106, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 1.2043218612670898, + "learning_rate": 3.5917786065216826e-05, + "loss": 2.0673, + "step": 1145 + }, + { + "epoch": 0.36, + "grad_norm": 1.5337340831756592, + "learning_rate": 3.580777353525318e-05, + "loss": 2.1463, + "step": 1150 + }, + { + "epoch": 0.36, + "grad_norm": 1.155813455581665, + "learning_rate": 3.5697503009171385e-05, + "loss": 2.0255, + "step": 1155 + }, + { + "epoch": 0.36, + "grad_norm": 1.034644365310669, + "learning_rate": 3.558697711927748e-05, + "loss": 2.1348, + "step": 1160 + }, + { + "epoch": 0.36, + "grad_norm": 1.0959795713424683, + "learning_rate": 3.54761985039734e-05, + "loss": 2.1457, + "step": 1165 + }, + { + "epoch": 0.36, + "grad_norm": 1.1938838958740234, + "learning_rate": 3.5365169807693966e-05, + "loss": 2.1256, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.8162047863006592, + "learning_rate": 3.525389368084379e-05, + "loss": 1.9587, + "step": 1175 + }, + { + "epoch": 0.37, + "grad_norm": 0.9358930587768555, + "learning_rate": 3.514237277973393e-05, + "loss": 1.8965, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.9210988879203796, + "learning_rate": 3.503060976651862e-05, + "loss": 1.9669, + "step": 1185 + }, + { + "epoch": 0.37, + "grad_norm": 1.4641343355178833, + "learning_rate": 3.491860730913156e-05, + "loss": 2.003, + "step": 1190 + }, + { + "epoch": 0.37, + "grad_norm": 1.2458257675170898, + "learning_rate": 3.480636808122235e-05, + "loss": 2.1487, + "step": 1195 + }, + { + "epoch": 0.37, + "grad_norm": 1.6770122051239014, + "learning_rate": 3.469389476209259e-05, + "loss": 2.0686, + "step": 1200 + }, + { + "epoch": 0.37, + "grad_norm": 0.9083845019340515, + "learning_rate": 3.458119003663199e-05, + "loss": 2.0284, + "step": 1205 + }, + { + "epoch": 0.38, + "grad_norm": 1.2679696083068848, + "learning_rate": 3.446825659525421e-05, + "loss": 2.0555, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 1.3823720216751099, + "learning_rate": 3.435509713383268e-05, + "loss": 1.9375, + "step": 1215 + }, + { + "epoch": 0.38, + "grad_norm": 1.5862077474594116, + "learning_rate": 3.424171435363623e-05, + "loss": 2.0271, + "step": 1220 + }, + { + "epoch": 0.38, + "grad_norm": 2.0107533931732178, + "learning_rate": 3.412811096126461e-05, + "loss": 2.1897, + "step": 1225 + }, + { + "epoch": 0.38, + "grad_norm": 1.4544458389282227, + "learning_rate": 3.401428966858387e-05, + "loss": 1.9978, + "step": 1230 + }, + { + "epoch": 0.38, + "grad_norm": 1.188170075416565, + "learning_rate": 3.390025319266167e-05, + "loss": 2.0688, + "step": 1235 + }, + { + "epoch": 0.39, + "grad_norm": 1.1016322374343872, + "learning_rate": 3.3786004255702336e-05, + "loss": 2.0396, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 1.6623334884643555, + "learning_rate": 3.3671545584981954e-05, + "loss": 1.9566, + "step": 1245 + }, + { + "epoch": 0.39, + "grad_norm": 0.9161584377288818, + "learning_rate": 3.355687991278324e-05, + "loss": 2.0474, + "step": 1250 + }, + { + "epoch": 0.39, + "grad_norm": 0.9911025166511536, + "learning_rate": 3.3442009976330305e-05, + "loss": 2.2163, + "step": 1255 + }, + { + "epoch": 0.39, + "grad_norm": 1.1504255533218384, + "learning_rate": 3.332693851772331e-05, + "loss": 2.1088, + "step": 1260 + }, + { + "epoch": 0.39, + "grad_norm": 
0.9544184803962708, + "learning_rate": 3.3211668283873035e-05, + "loss": 1.8947, + "step": 1265 + }, + { + "epoch": 0.39, + "grad_norm": 1.4625756740570068, + "learning_rate": 3.3096202026435304e-05, + "loss": 2.1748, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 1.3267475366592407, + "learning_rate": 3.298054250174527e-05, + "loss": 1.9218, + "step": 1275 + }, + { + "epoch": 0.4, + "grad_norm": 0.9869363903999329, + "learning_rate": 3.2864692470751654e-05, + "loss": 2.2723, + "step": 1280 + }, + { + "epoch": 0.4, + "grad_norm": 1.5177838802337646, + "learning_rate": 3.27486546989508e-05, + "loss": 2.1456, + "step": 1285 + }, + { + "epoch": 0.4, + "grad_norm": 1.1998714208602905, + "learning_rate": 3.263243195632068e-05, + "loss": 1.8877, + "step": 1290 + }, + { + "epoch": 0.4, + "grad_norm": 1.2112164497375488, + "learning_rate": 3.2516027017254785e-05, + "loss": 2.0615, + "step": 1295 + }, + { + "epoch": 0.4, + "grad_norm": 1.0616129636764526, + "learning_rate": 3.239944266049587e-05, + "loss": 2.0402, + "step": 1300 + }, + { + "epoch": 0.41, + "grad_norm": 1.4537287950515747, + "learning_rate": 3.228268166906962e-05, + "loss": 2.0728, + "step": 1305 + }, + { + "epoch": 0.41, + "grad_norm": 1.3899391889572144, + "learning_rate": 3.2165746830218254e-05, + "loss": 2.1815, + "step": 1310 + }, + { + "epoch": 0.41, + "grad_norm": 1.332529067993164, + "learning_rate": 3.204864093533394e-05, + "loss": 1.8935, + "step": 1315 + }, + { + "epoch": 0.41, + "grad_norm": 1.4466496706008911, + "learning_rate": 3.193136677989221e-05, + "loss": 1.9567, + "step": 1320 + }, + { + "epoch": 0.41, + "grad_norm": 1.1781721115112305, + "learning_rate": 3.181392716338516e-05, + "loss": 2.055, + "step": 1325 + }, + { + "epoch": 0.41, + "grad_norm": 0.9411901831626892, + "learning_rate": 3.1696324889254716e-05, + "loss": 1.8794, + "step": 1330 + }, + { + "epoch": 0.42, + "grad_norm": 1.2628341913223267, + "learning_rate": 3.15785627648256e-05, + "loss": 2.0299, + "step": 1335 + }, + { + "epoch": 0.42, + "grad_norm": 1.4857370853424072, + "learning_rate": 3.146064360123846e-05, + "loss": 1.9342, + "step": 1340 + }, + { + "epoch": 0.42, + "grad_norm": 1.661470651626587, + "learning_rate": 3.1342570213382594e-05, + "loss": 2.0399, + "step": 1345 + }, + { + "epoch": 0.42, + "grad_norm": 1.522845983505249, + "learning_rate": 3.122434541982888e-05, + "loss": 2.1419, + "step": 1350 + }, + { + "epoch": 0.42, + "grad_norm": 1.5679118633270264, + "learning_rate": 3.110597204276247e-05, + "loss": 2.2932, + "step": 1355 + }, + { + "epoch": 0.42, + "grad_norm": 1.3367788791656494, + "learning_rate": 3.098745290791539e-05, + "loss": 1.8989, + "step": 1360 + }, + { + "epoch": 0.42, + "grad_norm": 1.3873472213745117, + "learning_rate": 3.086879084449907e-05, + "loss": 2.1214, + "step": 1365 + }, + { + "epoch": 0.43, + "grad_norm": 1.2957035303115845, + "learning_rate": 3.074998868513688e-05, + "loss": 2.2538, + "step": 1370 + }, + { + "epoch": 0.43, + "grad_norm": 1.122176170349121, + "learning_rate": 3.0631049265796465e-05, + "loss": 2.0974, + "step": 1375 + }, + { + "epoch": 0.43, + "grad_norm": 1.0422618389129639, + "learning_rate": 3.051197542572203e-05, + "loss": 2.054, + "step": 1380 + }, + { + "epoch": 0.43, + "grad_norm": 1.1926140785217285, + "learning_rate": 3.0392770007366584e-05, + "loss": 1.9798, + "step": 1385 + }, + { + "epoch": 0.43, + "grad_norm": 0.8764025568962097, + "learning_rate": 3.0273435856324112e-05, + "loss": 2.0796, + "step": 1390 + }, + { + "epoch": 0.43, + "grad_norm": 0.8200764656066895, + 
"learning_rate": 3.0153975821261605e-05, + "loss": 1.9116, + "step": 1395 + }, + { + "epoch": 0.44, + "grad_norm": 1.0340498685836792, + "learning_rate": 3.0034392753851066e-05, + "loss": 2.0235, + "step": 1400 + }, + { + "epoch": 0.44, + "grad_norm": 1.0799012184143066, + "learning_rate": 2.9914689508701476e-05, + "loss": 2.1455, + "step": 1405 + }, + { + "epoch": 0.44, + "grad_norm": 1.301015853881836, + "learning_rate": 2.979486894329058e-05, + "loss": 2.0355, + "step": 1410 + }, + { + "epoch": 0.44, + "grad_norm": 1.2926914691925049, + "learning_rate": 2.9674933917896747e-05, + "loss": 2.0379, + "step": 1415 + }, + { + "epoch": 0.44, + "grad_norm": 1.4712942838668823, + "learning_rate": 2.9554887295530647e-05, + "loss": 2.0802, + "step": 1420 + }, + { + "epoch": 0.44, + "grad_norm": 1.1957335472106934, + "learning_rate": 2.943473194186693e-05, + "loss": 2.1044, + "step": 1425 + }, + { + "epoch": 0.44, + "grad_norm": 1.568293571472168, + "learning_rate": 2.9314470725175792e-05, + "loss": 2.0121, + "step": 1430 + }, + { + "epoch": 0.45, + "grad_norm": 1.4844893217086792, + "learning_rate": 2.919410651625455e-05, + "loss": 2.0717, + "step": 1435 + }, + { + "epoch": 0.45, + "grad_norm": 1.3942641019821167, + "learning_rate": 2.907364218835904e-05, + "loss": 1.9522, + "step": 1440 + }, + { + "epoch": 0.45, + "grad_norm": 0.7795314788818359, + "learning_rate": 2.8953080617135115e-05, + "loss": 1.9593, + "step": 1445 + }, + { + "epoch": 0.45, + "grad_norm": 1.751107931137085, + "learning_rate": 2.8832424680549937e-05, + "loss": 1.8073, + "step": 1450 + }, + { + "epoch": 0.45, + "grad_norm": 1.2202279567718506, + "learning_rate": 2.8711677258823306e-05, + "loss": 2.0042, + "step": 1455 + }, + { + "epoch": 0.45, + "grad_norm": 1.5163853168487549, + "learning_rate": 2.859084123435887e-05, + "loss": 1.9931, + "step": 1460 + }, + { + "epoch": 0.46, + "grad_norm": 0.94038987159729, + "learning_rate": 2.84699194916754e-05, + "loss": 2.1533, + "step": 1465 + }, + { + "epoch": 0.46, + "grad_norm": 1.4618102312088013, + "learning_rate": 2.834891491733781e-05, + "loss": 2.029, + "step": 1470 + }, + { + "epoch": 0.46, + "grad_norm": 0.9747155904769897, + "learning_rate": 2.822783039988836e-05, + "loss": 2.0241, + "step": 1475 + }, + { + "epoch": 0.46, + "grad_norm": 1.0887038707733154, + "learning_rate": 2.8106668829777645e-05, + "loss": 2.0959, + "step": 1480 + }, + { + "epoch": 0.46, + "grad_norm": 1.2170171737670898, + "learning_rate": 2.7985433099295618e-05, + "loss": 1.8718, + "step": 1485 + }, + { + "epoch": 0.46, + "grad_norm": 1.1366883516311646, + "learning_rate": 2.7864126102502524e-05, + "loss": 2.2397, + "step": 1490 + }, + { + "epoch": 0.46, + "grad_norm": 1.1206785440444946, + "learning_rate": 2.774275073515985e-05, + "loss": 2.1083, + "step": 1495 + }, + { + "epoch": 0.47, + "grad_norm": 1.126807451248169, + "learning_rate": 2.7621309894661167e-05, + "loss": 2.0764, + "step": 1500 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 2.01535997018112e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1500/training_args.bin b/checkpoint-1500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-1500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-200/README.md b/checkpoint-200/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-200/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
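The "How to Get Started with the Model" section of this checkpoint card is still a placeholder. As a stopgap, the snippet below is a minimal, unofficial sketch of how an adapter with the configuration stored in this repository (LoRA, r=8, lora_alpha=16, targeting q_proj and v_proj) could be attached to the base model with PEFT; the `adapter_path` value is an assumption and should be replaced with the Hub id of this repository or a local checkpoint folder.

```python
# Unofficial sketch: load hfl/chinese-alpaca-2-1.3b and attach the LoRA adapter
# produced by this training run. "path/to/this/adapter" is a placeholder.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "hfl/chinese-alpaca-2-1.3b"
adapter_path = "path/to/this/adapter"  # e.g. the repo root or a checkpoint-* directory

tokenizer = AutoTokenizer.from_pretrained(base_id)
base_model = AutoModelForCausalLM.from_pretrained(base_id)
model = PeftModel.from_pretrained(base_model, adapter_path)  # injects the q_proj/v_proj LoRA weights
model.eval()

prompt = "[INST] 请用一句话介绍一下大熊猫。 [/INST]"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

If the adapter is meant for further training rather than inference, `is_trainable=True` can be passed to `PeftModel.from_pretrained`; by default the weights load in inference mode, matching `"inference_mode": true` in adapter_config.json.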
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-200/adapter_config.json b/checkpoint-200/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-200/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-200/adapter_model.safetensors b/checkpoint-200/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..78fcb02a90be8e210dcc77542f0daf40421ad364 --- /dev/null +++ b/checkpoint-200/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f35acad31ae62bc0123d9805573e0a0ebe6a74796fe8a3e82694ed0c92738660 +size 2099272 diff --git a/checkpoint-200/optimizer.pt b/checkpoint-200/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..40c1d2782dd1f86624396ff5fd460ad915786524 --- /dev/null +++ b/checkpoint-200/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fbdb058d049f37cb8663b64755cdb0b9c9914449aabe57fd861b9a38be97135 +size 4208302 diff --git a/checkpoint-200/rng_state.pth b/checkpoint-200/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..6279c5458de52148be4a6e2e6bf48ca6f1d7ef44 --- /dev/null +++ b/checkpoint-200/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6effff382e27d9bddcb6e934a8054a91fd0264c14b69428d12d2650e52650c3 +size 14244 diff --git a/checkpoint-200/scheduler.pt b/checkpoint-200/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cc26892a158fe75d5e7422a78f4d02a13c821d9 --- /dev/null +++ b/checkpoint-200/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7f3c90f0256918706f894a068be28979d7cef535fe184ef7473a22beeb1d99d +size 1064 diff --git a/checkpoint-200/special_tokens_map.json b/checkpoint-200/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-200/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-200/tokenizer.model b/checkpoint-200/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-200/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-200/tokenizer_config.json b/checkpoint-200/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-200/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "<unk>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "<s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "</s>", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "<pad>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<s>", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-200/trainer_state.json b/checkpoint-200/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d2b8e69aab1de7672010cdbd6c4860aaba506d12 --- /dev/null +++ b/checkpoint-200/trainer_state.json @@ -0,0 +1,301 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.06219147199440277, + "eval_steps": 500, + "global_step": 200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 2692221596467200.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-200/training_args.bin b/checkpoint-200/training_args.bin new file mode 
100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-200/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-300/README.md b/checkpoint-300/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-300/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-300/adapter_config.json b/checkpoint-300/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-300/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-300/adapter_model.safetensors b/checkpoint-300/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0ac6b14b7edcbdbd6348c7aeacfa237c7b35caff --- /dev/null +++ b/checkpoint-300/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0ede12c9ba9490e88263af318d5cb3c5fd89d2bd8453b124eb6975f3d7f927b +size 2099272 diff --git a/checkpoint-300/optimizer.pt b/checkpoint-300/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fa40c84ddcb648f8ffa9723491306a5fa08b7f7 --- /dev/null +++ b/checkpoint-300/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0a5e78073a5a58dc7958a63b026af18792e3d727e023d1643f1ddccc83d202 +size 4208302 diff --git a/checkpoint-300/rng_state.pth b/checkpoint-300/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..f608004f7fbf3451c0be7be6f24ed27db5e66ded --- /dev/null +++ b/checkpoint-300/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17e78f09051534c4813981e8db41160ba79e32b82d6cc4338167319badcc1212 +size 14244 diff --git a/checkpoint-300/scheduler.pt b/checkpoint-300/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a7871bc7ebf8ae17ed09b08eeb31c7f5150680ea --- /dev/null +++ b/checkpoint-300/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409a78607579321f124c8a569986becbe346fd5c384fbc7ec0a57d4dde2570ac +size 1064 diff --git a/checkpoint-300/special_tokens_map.json b/checkpoint-300/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-300/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-300/tokenizer.model b/checkpoint-300/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-300/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-300/tokenizer_config.json b/checkpoint-300/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-300/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-300/trainer_state.json b/checkpoint-300/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8809b5c012a5c37c166d4649c4d1d897ad0724cf --- /dev/null +++ b/checkpoint-300/trainer_state.json @@ -0,0 +1,441 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.09328720799160416, + "eval_steps": 500, + "global_step": 300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 4027137608908800.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-300/training_args.bin b/checkpoint-300/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-300/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-400/README.md b/checkpoint-400/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-400/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] 
+- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-400/adapter_config.json b/checkpoint-400/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-400/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-400/adapter_model.safetensors b/checkpoint-400/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b6f5359a6e6859d44f55104fdfeb36a2ba9f73fb --- /dev/null +++ b/checkpoint-400/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20a6b0f10af55adab3ace13b190c0b88878638e41c43461278003f985d65f825 +size 2099272 diff --git a/checkpoint-400/optimizer.pt b/checkpoint-400/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..c3c9c9e793696be557d62f11ed6d3ed47e2fd604 --- /dev/null +++ b/checkpoint-400/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ba74d329c1511a275caa5a4131244a3e823ad790b076dd9d194b5664fb13a8 +size 4208302 diff --git a/checkpoint-400/rng_state.pth b/checkpoint-400/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..2b8c7d84529cdcd0d1d41e3149831cf4666accad --- /dev/null +++ b/checkpoint-400/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00c6fb89b33a31f54b4c9fb5d197ffb1411fd724d6aa4a14ea649ff9c09f77ed +size 14244 diff --git a/checkpoint-400/scheduler.pt b/checkpoint-400/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4048d3f837df380c70e3220dc2809d5583e6278 --- /dev/null +++ b/checkpoint-400/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bab6de0379e26e33db70e69aa7be18f07acf14aa9554478a757b50620067887 +size 1064 diff --git a/checkpoint-400/special_tokens_map.json b/checkpoint-400/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-400/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-400/tokenizer.model b/checkpoint-400/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-400/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-400/tokenizer_config.json b/checkpoint-400/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-400/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-400/trainer_state.json b/checkpoint-400/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1bb9fb138240fed4a4ea095e3d84780bd7e37ac0 --- /dev/null +++ b/checkpoint-400/trainer_state.json @@ -0,0 +1,581 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.12438294398880553, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 
1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 5372900124917760.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-400/training_args.bin b/checkpoint-400/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-400/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. 
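The checkpoint directories in this dump only carry the LoRA adapter weights (`adapter_model.safetensors`, about 2 MB) plus tokenizer files, and the "How to Get Started" section of the generated model cards that follows is still an unfilled template. As a minimal sketch, assuming a local checkpoint directory such as `checkpoint-500` and ordinary `transformers`/`peft` usage (none of which is prescribed by this repo), the adapter could be attached to the `hfl/chinese-alpaca-2-1.3b` base model roughly like this:

```python
# Minimal sketch (assumed usage, not shipped with this repo): load the base model,
# attach one of the saved LoRA checkpoints, and generate a reply through the
# tokenizer's chat template.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "hfl/chinese-alpaca-2-1.3b"
adapter_path = "checkpoint-500"  # hypothetical local path; any checkpoint or the repo root should work

tokenizer = AutoTokenizer.from_pretrained(adapter_path, use_fast=False)
model = AutoModelForCausalLM.from_pretrained(base_id)
model = PeftModel.from_pretrained(model, adapter_path)  # applies the r=8 LoRA on q_proj/v_proj
model.eval()

messages = [{"role": "user", "content": "你好,请介绍一下你自己。"}]
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```

If a standalone model is preferred, the adapter can likely be folded into the base weights with `model.merge_and_unload()` before saving; this is standard PEFT behavior for LoRA adapters rather than something documented by this repo.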
+ +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ce59d72f30638abc6470079fc46d97c21dd9b9d4 --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc6a81ba16da3040dc8f987b3953f997ea002d279c55b1a455bf16d9b741d1e8 +size 2099272 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..aad26da08fd7264698e27092d22a00d8ed372b31 --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6d65d51cb7f7be7ea1dc66ab6fb909512ecada77e4eb38c86d926613b6ccff12 +size 4208302 diff --git a/checkpoint-500/rng_state.pth b/checkpoint-500/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..14976090b4da58d701f66845fdd790fbd4138524 --- /dev/null +++ b/checkpoint-500/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b7f6e3ef779b6d2aa0b9376ece96587db1ce1d789a4526b7d146f0155f413d +size 14244 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef610a81325f8e28f320a65009309eaf1ee311d2 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4db22f85d1cc41f0dad50b7443150040453aaada654020db42304ba5fb7c6a6f +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-500/tokenizer.model b/checkpoint-500/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-500/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0748573f971e218114e99fecf76fc28cebc1ce8 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,721 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15547867998600692, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 
1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + "step": 475 + }, + { + "epoch": 
0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 6734882641674240.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-600/README.md b/checkpoint-600/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-600/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. 
+ +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-600/adapter_config.json b/checkpoint-600/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-600/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-600/adapter_model.safetensors b/checkpoint-600/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..865c68246181451b05a7f0e1e45240b0ef786591 --- /dev/null +++ b/checkpoint-600/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec6b39a908aacd2318834cc4f8a720c19a8db28fd18792db522289592735a1ad +size 2099272 diff --git a/checkpoint-600/optimizer.pt b/checkpoint-600/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..acb1794b9077f485c2076bd60ee8e711e0788beb --- /dev/null +++ b/checkpoint-600/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:c177b9e5841ef74c12f1bcf3fc62942e696f5958905438ccc1487083e06592fd +size 4208302 diff --git a/checkpoint-600/rng_state.pth b/checkpoint-600/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..55ce195caac088df17b4aef698450a05f08155e0 --- /dev/null +++ b/checkpoint-600/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8337806e5c44ce711d9636e9c6a356ba5a3009d12ca530b6fdc463557f027171 +size 14244 diff --git a/checkpoint-600/scheduler.pt b/checkpoint-600/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..6311365e307c137be082e0a60309c9521b796c3e --- /dev/null +++ b/checkpoint-600/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b9e75e4472c974f751926764bde84f22df5de25c6e4a0448342940bc46aedcb +size 1064 diff --git a/checkpoint-600/special_tokens_map.json b/checkpoint-600/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-600/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-600/tokenizer.model b/checkpoint-600/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-600/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-600/tokenizer_config.json b/checkpoint-600/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-600/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-600/trainer_state.json b/checkpoint-600/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..bcc0116e4e0116c68cda931099cc32e00407862b --- /dev/null +++ b/checkpoint-600/trainer_state.json @@ -0,0 +1,861 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.1865744159832083, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 
1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + "step": 475 + }, + { + "epoch": 
0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 8075669697331200.0, + "train_batch_size": 2, + "trial_name": 
null, + "trial_params": null +} diff --git a/checkpoint-600/training_args.bin b/checkpoint-600/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-600/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-700/README.md b/checkpoint-700/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-700/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-700/adapter_config.json b/checkpoint-700/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-700/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-700/adapter_model.safetensors b/checkpoint-700/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..519010c05036fbfefd4f68b724a4cab3f8613d37 --- /dev/null +++ b/checkpoint-700/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06ca6573229cb446680565985ccf24a8351147a4edf2cfc1f7cc011a62d73564 +size 2099272 diff --git a/checkpoint-700/optimizer.pt b/checkpoint-700/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..36e3a37d6c821fb93ca878f9ea471e22b0fed49a --- /dev/null +++ b/checkpoint-700/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4ed15662356c964d79f923ef7abd0a52daf0bdb1b2281a961ef94b297806469 +size 4208302 diff --git a/checkpoint-700/rng_state.pth b/checkpoint-700/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..3aa193cfb710cbce7e0e5f0aa628c65f70ef170c --- /dev/null +++ b/checkpoint-700/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d2256cedcdcf481db016cd4710c79ea21043a9ffa511d02107ef4ae12f26c6 +size 14244 diff --git a/checkpoint-700/scheduler.pt b/checkpoint-700/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a6c68f89990ea6c342bb897e29231458a217de4 --- /dev/null +++ b/checkpoint-700/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e3948b82370fb4926bdeac6e49739187cc941849d68582f9a3e787358b0b457 +size 1064 diff --git a/checkpoint-700/special_tokens_map.json b/checkpoint-700/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-700/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-700/tokenizer.model b/checkpoint-700/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-700/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-700/tokenizer_config.json b/checkpoint-700/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-700/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-700/trainer_state.json b/checkpoint-700/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0767ae05e7ada5d329693b9453a6d4de971d6239 --- /dev/null +++ b/checkpoint-700/trainer_state.json @@ -0,0 +1,1001 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.21767015198040968, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 
1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + "step": 475 + }, + { + "epoch": 
0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + "grad_norm": 2.009913682937622, + 
"learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 9413869513605120.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-700/training_args.bin b/checkpoint-700/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-700/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-800/README.md b/checkpoint-800/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-800/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More 
Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-800/adapter_config.json b/checkpoint-800/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-800/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-800/adapter_model.safetensors b/checkpoint-800/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e2eb95ceed0046683758566699bf45cf72b882f8 --- /dev/null +++ b/checkpoint-800/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b49633f49bfaaa33680226ba7e593b872d26c76c02bab2bccf843da4a96db0f4 +size 2099272 diff --git a/checkpoint-800/optimizer.pt b/checkpoint-800/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c459361f2f3ea9a7eaef5d3ab71c45ef7a08e69 --- /dev/null +++ b/checkpoint-800/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9657b76e7d26778e03c8a161f0d330310b39906b799f62b65cf291f9e944ec0 +size 4208302 diff --git a/checkpoint-800/rng_state.pth b/checkpoint-800/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..0f0a85a9c207a534562cf3b0e4c1bff185f5c576 --- /dev/null +++ b/checkpoint-800/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c85a13b289667334e96b297491f81dd36f3f6c0aaebbb285549a5b298e8a6667 +size 14244 diff --git a/checkpoint-800/scheduler.pt b/checkpoint-800/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..f98cf0fe651ddf3567fd51c8ae4994213637588a --- /dev/null +++ b/checkpoint-800/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40325a00ffe4a46be0d94decbcacf24860f4d1c73596744fe87f05daf976c07b +size 1064 diff --git a/checkpoint-800/special_tokens_map.json b/checkpoint-800/special_tokens_map.json new file mode 100644 index 
0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-800/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-800/tokenizer.model b/checkpoint-800/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-800/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-800/tokenizer_config.json b/checkpoint-800/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-800/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-800/trainer_state.json b/checkpoint-800/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2ece309da91295224e67df51ca720d562eff65fd --- /dev/null +++ b/checkpoint-800/trainer_state.json @@ -0,0 +1,1141 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.24876588797761107, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 
1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + "step": 475 + }, + { + "epoch": 
0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + "grad_norm": 2.009913682937622, + 
"learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 4.37438886139515e-05, + "loss": 2.0689, 
+ "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.074142184472576e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-800/training_args.bin b/checkpoint-800/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-800/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/checkpoint-900/README.md b/checkpoint-900/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b21eb87c49b1ea11193cd8f664d04b06c22e6bb5 --- /dev/null +++ b/checkpoint-900/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: hfl/chinese-alpaca-2-1.3b +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made 
aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.9.0 \ No newline at end of file diff --git a/checkpoint-900/adapter_config.json b/checkpoint-900/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..67326eac9d1aeaece3cee6be49e088077b9cebe9 --- /dev/null +++ b/checkpoint-900/adapter_config.json @@ -0,0 +1,28 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "hfl/chinese-alpaca-2-1.3b", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-900/adapter_model.safetensors b/checkpoint-900/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3682096744cfd33ce29dcfc37fc84c4fa52aea28 --- /dev/null +++ b/checkpoint-900/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ce94ecc5ab6d3759e8ea5e03085ec03b9c2a860b5f97c6276d570119a70b613 +size 2099272 diff --git a/checkpoint-900/optimizer.pt b/checkpoint-900/optimizer.pt new file mode 100644 index 
0000000000000000000000000000000000000000..ac741897a97b0a0468113a0f72331ee1bc43c5bb --- /dev/null +++ b/checkpoint-900/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fb7ce91c15977fb46bfc4d6d25811ee80e88fe72c15709a09f6b5c8935ccea +size 4208302 diff --git a/checkpoint-900/rng_state.pth b/checkpoint-900/rng_state.pth new file mode 100644 index 0000000000000000000000000000000000000000..1b1e07051b86e94bd8d506559e5a65a7dc9aaa22 --- /dev/null +++ b/checkpoint-900/rng_state.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd244b4ac9acd153035e9e3d15a4607589d8225655e384d50928915a88f96274 +size 14244 diff --git a/checkpoint-900/scheduler.pt b/checkpoint-900/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ec24a4794e3cd9713adc315fc32ddc6b5a730510 --- /dev/null +++ b/checkpoint-900/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6949c1dce6aaba51a83cb179f28e554f3fe314da16271701fb1cbe14eb005cff +size 1064 diff --git a/checkpoint-900/special_tokens_map.json b/checkpoint-900/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/checkpoint-900/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-900/tokenizer.model b/checkpoint-900/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/checkpoint-900/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/checkpoint-900/tokenizer_config.json b/checkpoint-900/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/checkpoint-900/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/checkpoint-900/trainer_state.json b/checkpoint-900/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..56a11a2de1a41d799bd203e05a48ecb1f2d992e3 --- /dev/null +++ b/checkpoint-900/trainer_state.json @@ -0,0 +1,1281 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.27986162397481246, + "eval_steps": 500, + "global_step": 900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + 
"epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + "learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + 
"learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + "loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 
1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + "step": 475 + }, + { + "epoch": 
0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + "learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + "grad_norm": 2.009913682937622, + 
"learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 4.37438886139515e-05, + "loss": 2.0689, 
+ "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + "grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 2.0076, + "step": 870 + }, + { + "epoch": 
0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + "learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.210101614051328e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-900/training_args.bin b/checkpoint-900/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/checkpoint-900/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..94c4595c4eb202faff8e45d273ceffbe19fe3036 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,30 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "unk_token": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.model b/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..3df664fcc67f61d0c7e635a60d9f55cd0624158a --- /dev/null +++ b/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3b8844863b200dfcca971db228e96ce388290dfcf72c15d7a9d2f604bac787c +size 844403 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6912f7cdcb84c3039edad59aa64f70dc4344a517 --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,54 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": true, + "rstrip": false, + "single_word": false, + "special": true + }, + "32000": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% set system_message = 'You are a helpful assistant. 
你是一个乐于助人的助手。' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if loop.index0 == 0 and system_message is defined %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '<s>' + '[INST] ' + content + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ content + '</s>' }}{% endif %}{% endfor %}", + "clean_up_tokenization_spaces": false, + "eos_token": "</s>", + "legacy": true, + "model_max_length": 1000000000000000019884624838656, + "pad_token": "<pad>", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "<unk>", + "use_default_system_prompt": false, + "use_fast": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b5f4085a932af8484c59597eb4f5cc1bb81a42f8 --- /dev/null +++ b/train_results.json @@ -0,0 +1,7 @@ +{ + "epoch": 0.48, + "train_loss": 2.0602192145127516, + "train_runtime": 801.4891, + "train_samples_per_second": 64.198, + "train_steps_per_second": 4.011 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..7ee9f09e3b9bc99ef17371c007f04058a0fd7ae8 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,307 @@ +{"current_steps": 5, "total_steps": 3215, "loss": 2.0765, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999970160815579e-05, "epoch": 0.0, "percentage": 0.16, "elapsed_time": "0:00:02", "remaining_time": "0:31:24"} +{"current_steps": 10, "total_steps": 3215, "loss": 2.2297, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999880643974619e-05, "epoch": 0.0, "percentage": 0.31, "elapsed_time": "0:00:05", "remaining_time": "0:30:02"} +{"current_steps": 15, "total_steps": 3215, "loss": 2.1103, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9997314516140056e-05, "epoch": 0.0, "percentage": 0.47, "elapsed_time": "0:00:08", "remaining_time": "0:29:25"} +{"current_steps": 20, "total_steps": 3215, "loss": 2.0057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999522587295162e-05, "epoch": 0.01, "percentage": 0.62, "elapsed_time": "0:00:10", "remaining_time": "0:29:01"} +{"current_steps": 25, "total_steps": 3215, "loss": 2.1778, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.999254056003963e-05, "epoch": 0.01, "percentage": 0.78, "elapsed_time": "0:00:13", "remaining_time": "0:28:45"} +{"current_steps": 30, "total_steps": 3215, "loss": 2.2399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.99892586415061e-05, "epoch": 0.01, "percentage": 0.93, "elapsed_time": "0:00:16", "remaining_time": "0:28:34"} +{"current_steps": 35, "total_steps": 3215, "loss": 2.3215, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9985380195694856e-05, "epoch": 0.01, "percentage": 1.09, "elapsed_time": "0:00:18", "remaining_time": "0:28:22"} +{"current_steps": 40, "total_steps": 3215, "loss": 1.8295, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.998090531518962e-05, "epoch": 0.01, "percentage": 1.24, "elapsed_time": "0:00:21", "remaining_time": "0:28:20"} +{"current_steps": 45, "total_steps": 3215, "loss": 2.0195, 
"eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9975834106811834e-05, "epoch": 0.01, "percentage": 1.4, "elapsed_time": "0:00:24", "remaining_time": "0:28:10"} +{"current_steps": 50, "total_steps": 3215, "loss": 2.1257, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.997016669161806e-05, "epoch": 0.02, "percentage": 1.56, "elapsed_time": "0:00:26", "remaining_time": "0:27:52"} +{"current_steps": 55, "total_steps": 3215, "loss": 2.057, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.996390320489715e-05, "epoch": 0.02, "percentage": 1.71, "elapsed_time": "0:00:29", "remaining_time": "0:28:02"} +{"current_steps": 60, "total_steps": 3215, "loss": 2.0753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9957043796166966e-05, "epoch": 0.02, "percentage": 1.87, "elapsed_time": "0:00:31", "remaining_time": "0:27:56"} +{"current_steps": 65, "total_steps": 3215, "loss": 1.9736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.994958862917083e-05, "epoch": 0.02, "percentage": 2.02, "elapsed_time": "0:00:34", "remaining_time": "0:27:43"} +{"current_steps": 70, "total_steps": 3215, "loss": 2.1572, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.994153788187363e-05, "epoch": 0.02, "percentage": 2.18, "elapsed_time": "0:00:37", "remaining_time": "0:27:52"} +{"current_steps": 75, "total_steps": 3215, "loss": 2.1491, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.993289174645757e-05, "epoch": 0.02, "percentage": 2.33, "elapsed_time": "0:00:39", "remaining_time": "0:27:47"} +{"current_steps": 80, "total_steps": 3215, "loss": 1.945, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.992365042931752e-05, "epoch": 0.02, "percentage": 2.49, "elapsed_time": "0:00:42", "remaining_time": "0:27:37"} +{"current_steps": 85, "total_steps": 3215, "loss": 2.0811, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.991381415105619e-05, "epoch": 0.03, "percentage": 2.64, "elapsed_time": "0:00:45", "remaining_time": "0:27:47"} +{"current_steps": 90, "total_steps": 3215, "loss": 1.961, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.990338314647881e-05, "epoch": 0.03, "percentage": 2.8, "elapsed_time": "0:00:47", "remaining_time": "0:27:45"} +{"current_steps": 95, "total_steps": 3215, "loss": 2.0653, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.98923576645875e-05, "epoch": 0.03, "percentage": 2.95, "elapsed_time": "0:00:50", "remaining_time": "0:27:44"} +{"current_steps": 100, "total_steps": 3215, "loss": 1.9999, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9880737968575365e-05, "epoch": 0.03, "percentage": 3.11, "elapsed_time": "0:00:53", "remaining_time": "0:27:43"} +{"current_steps": 105, "total_steps": 3215, "loss": 2.2258, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.986852433582022e-05, "epoch": 0.03, "percentage": 3.27, "elapsed_time": "0:00:56", "remaining_time": "0:27:57"} +{"current_steps": 110, "total_steps": 3215, "loss": 2.1034, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.985571705787793e-05, "epoch": 0.03, "percentage": 3.42, "elapsed_time": "0:00:59", "remaining_time": "0:28:00"} +{"current_steps": 115, "total_steps": 3215, "loss": 2.1753, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 
4.9842316440475475e-05, "epoch": 0.04, "percentage": 3.58, "elapsed_time": "0:01:02", "remaining_time": "0:27:57"} +{"current_steps": 120, "total_steps": 3215, "loss": 2.1384, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9828322803503665e-05, "epoch": 0.04, "percentage": 3.73, "elapsed_time": "0:01:04", "remaining_time": "0:27:53"} +{"current_steps": 125, "total_steps": 3215, "loss": 2.0521, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.981373648100946e-05, "epoch": 0.04, "percentage": 3.89, "elapsed_time": "0:01:07", "remaining_time": "0:27:52"} +{"current_steps": 130, "total_steps": 3215, "loss": 1.9256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.979855782118802e-05, "epoch": 0.04, "percentage": 4.04, "elapsed_time": "0:01:10", "remaining_time": "0:27:48"} +{"current_steps": 135, "total_steps": 3215, "loss": 2.0882, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.978278718637443e-05, "epoch": 0.04, "percentage": 4.2, "elapsed_time": "0:01:13", "remaining_time": "0:27:47"} +{"current_steps": 140, "total_steps": 3215, "loss": 2.0724, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9766424953035e-05, "epoch": 0.04, "percentage": 4.35, "elapsed_time": "0:01:15", "remaining_time": "0:27:44"} +{"current_steps": 145, "total_steps": 3215, "loss": 2.1329, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.974947151175826e-05, "epoch": 0.05, "percentage": 4.51, "elapsed_time": "0:01:18", "remaining_time": "0:27:40"} +{"current_steps": 150, "total_steps": 3215, "loss": 2.082, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.973192726724572e-05, "epoch": 0.05, "percentage": 4.67, "elapsed_time": "0:01:21", "remaining_time": "0:27:38"} +{"current_steps": 155, "total_steps": 3215, "loss": 2.0366, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9713792638302145e-05, "epoch": 0.05, "percentage": 4.82, "elapsed_time": "0:01:23", "remaining_time": "0:27:36"} +{"current_steps": 160, "total_steps": 3215, "loss": 2.1481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.969506805782555e-05, "epoch": 0.05, "percentage": 4.98, "elapsed_time": "0:01:26", "remaining_time": "0:27:33"} +{"current_steps": 165, "total_steps": 3215, "loss": 2.032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.967575397279689e-05, "epoch": 0.05, "percentage": 5.13, "elapsed_time": "0:01:29", "remaining_time": "0:27:29"} +{"current_steps": 170, "total_steps": 3215, "loss": 2.0379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.965585084426943e-05, "epoch": 0.05, "percentage": 5.29, "elapsed_time": "0:01:31", "remaining_time": "0:27:27"} +{"current_steps": 175, "total_steps": 3215, "loss": 2.1444, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9635359147357655e-05, "epoch": 0.05, "percentage": 5.44, "elapsed_time": "0:01:34", "remaining_time": "0:27:23"} +{"current_steps": 180, "total_steps": 3215, "loss": 1.9164, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.961427937122598e-05, "epoch": 0.06, "percentage": 5.6, "elapsed_time": "0:01:37", "remaining_time": "0:27:20"} +{"current_steps": 185, "total_steps": 3215, "loss": 2.0084, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.959261201907707e-05, "epoch": 0.06, "percentage": 5.75, 
"elapsed_time": "0:01:40", "remaining_time": "0:27:18"} +{"current_steps": 190, "total_steps": 3215, "loss": 2.2032, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.957035760813982e-05, "epoch": 0.06, "percentage": 5.91, "elapsed_time": "0:01:42", "remaining_time": "0:27:12"} +{"current_steps": 195, "total_steps": 3215, "loss": 2.2101, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.954751666965701e-05, "epoch": 0.06, "percentage": 6.07, "elapsed_time": "0:01:45", "remaining_time": "0:27:14"} +{"current_steps": 200, "total_steps": 3215, "loss": 2.0472, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9524089748872615e-05, "epoch": 0.06, "percentage": 6.22, "elapsed_time": "0:01:48", "remaining_time": "0:27:12"} +{"current_steps": 205, "total_steps": 3215, "loss": 2.0987, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9500077405018807e-05, "epoch": 0.06, "percentage": 6.38, "elapsed_time": "0:01:51", "remaining_time": "0:27:18"} +{"current_steps": 210, "total_steps": 3215, "loss": 2.1765, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9475480211302583e-05, "epoch": 0.07, "percentage": 6.53, "elapsed_time": "0:01:54", "remaining_time": "0:27:15"} +{"current_steps": 215, "total_steps": 3215, "loss": 1.9926, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.945029875489212e-05, "epoch": 0.07, "percentage": 6.69, "elapsed_time": "0:01:56", "remaining_time": "0:27:12"} +{"current_steps": 220, "total_steps": 3215, "loss": 2.0124, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.94245336369027e-05, "epoch": 0.07, "percentage": 6.84, "elapsed_time": "0:01:59", "remaining_time": "0:27:13"} +{"current_steps": 225, "total_steps": 3215, "loss": 2.2229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.939818547238241e-05, "epoch": 0.07, "percentage": 7.0, "elapsed_time": "0:02:02", "remaining_time": "0:27:08"} +{"current_steps": 230, "total_steps": 3215, "loss": 2.2013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9371254890297446e-05, "epoch": 0.07, "percentage": 7.15, "elapsed_time": "0:02:05", "remaining_time": "0:27:04"} +{"current_steps": 235, "total_steps": 3215, "loss": 2.014, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.93437425335171e-05, "epoch": 0.07, "percentage": 7.31, "elapsed_time": "0:02:07", "remaining_time": "0:27:00"} +{"current_steps": 240, "total_steps": 3215, "loss": 2.1701, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9315649058798384e-05, "epoch": 0.07, "percentage": 7.47, "elapsed_time": "0:02:10", "remaining_time": "0:26:56"} +{"current_steps": 245, "total_steps": 3215, "loss": 2.1681, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.928697513677042e-05, "epoch": 0.08, "percentage": 7.62, "elapsed_time": "0:02:13", "remaining_time": "0:26:53"} +{"current_steps": 250, "total_steps": 3215, "loss": 2.1224, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.925772145191834e-05, "epoch": 0.08, "percentage": 7.78, "elapsed_time": "0:02:15", "remaining_time": "0:26:48"} +{"current_steps": 255, "total_steps": 3215, "loss": 2.0512, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9227888702567044e-05, "epoch": 0.08, "percentage": 7.93, "elapsed_time": "0:02:18", "remaining_time": "0:26:44"} 
+{"current_steps": 260, "total_steps": 3215, "loss": 2.1067, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9197477600864446e-05, "epoch": 0.08, "percentage": 8.09, "elapsed_time": "0:02:20", "remaining_time": "0:26:41"} +{"current_steps": 265, "total_steps": 3215, "loss": 1.8884, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.9166488872764526e-05, "epoch": 0.08, "percentage": 8.24, "elapsed_time": "0:02:23", "remaining_time": "0:26:37"} +{"current_steps": 270, "total_steps": 3215, "loss": 1.9345, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.913492325800999e-05, "epoch": 0.08, "percentage": 8.4, "elapsed_time": "0:02:26", "remaining_time": "0:26:33"} +{"current_steps": 275, "total_steps": 3215, "loss": 2.1928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.910278151011458e-05, "epoch": 0.09, "percentage": 8.55, "elapsed_time": "0:02:28", "remaining_time": "0:26:29"} +{"current_steps": 280, "total_steps": 3215, "loss": 2.0407, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.907006439634516e-05, "epoch": 0.09, "percentage": 8.71, "elapsed_time": "0:02:31", "remaining_time": "0:26:23"} +{"current_steps": 285, "total_steps": 3215, "loss": 2.2344, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.903677269770329e-05, "epoch": 0.09, "percentage": 8.86, "elapsed_time": "0:02:33", "remaining_time": "0:26:21"} +{"current_steps": 290, "total_steps": 3215, "loss": 2.1296, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.900290720890671e-05, "epoch": 0.09, "percentage": 9.02, "elapsed_time": "0:02:36", "remaining_time": "0:26:17"} +{"current_steps": 295, "total_steps": 3215, "loss": 2.152, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8968468738370244e-05, "epoch": 0.09, "percentage": 9.18, "elapsed_time": "0:02:38", "remaining_time": "0:26:10"} +{"current_steps": 300, "total_steps": 3215, "loss": 1.9623, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8933458108186606e-05, "epoch": 0.09, "percentage": 9.33, "elapsed_time": "0:02:41", "remaining_time": "0:26:06"} +{"current_steps": 305, "total_steps": 3215, "loss": 1.915, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.889787615410672e-05, "epoch": 0.09, "percentage": 9.49, "elapsed_time": "0:02:44", "remaining_time": "0:26:07"} +{"current_steps": 310, "total_steps": 3215, "loss": 1.9934, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.886172372551977e-05, "epoch": 0.1, "percentage": 9.64, "elapsed_time": "0:02:46", "remaining_time": "0:26:01"} +{"current_steps": 315, "total_steps": 3215, "loss": 2.1541, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.882500168543294e-05, "epoch": 0.1, "percentage": 9.8, "elapsed_time": "0:02:48", "remaining_time": "0:25:55"} +{"current_steps": 320, "total_steps": 3215, "loss": 2.1688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.878771091045082e-05, "epoch": 0.1, "percentage": 9.95, "elapsed_time": "0:02:51", "remaining_time": "0:25:51"} +{"current_steps": 325, "total_steps": 3215, "loss": 2.1387, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.874985229075446e-05, "epoch": 0.1, "percentage": 10.11, "elapsed_time": "0:02:54", "remaining_time": "0:25:51"} +{"current_steps": 330, "total_steps": 3215, "loss": 2.0215, 
"eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.871142673008012e-05, "epoch": 0.1, "percentage": 10.26, "elapsed_time": "0:02:56", "remaining_time": "0:25:45"} +{"current_steps": 335, "total_steps": 3215, "loss": 1.9491, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.867243514569772e-05, "epoch": 0.1, "percentage": 10.42, "elapsed_time": "0:02:59", "remaining_time": "0:25:44"} +{"current_steps": 340, "total_steps": 3215, "loss": 2.0151, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.863287846838891e-05, "epoch": 0.11, "percentage": 10.58, "elapsed_time": "0:03:02", "remaining_time": "0:25:41"} +{"current_steps": 345, "total_steps": 3215, "loss": 1.8906, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.85927576424249e-05, "epoch": 0.11, "percentage": 10.73, "elapsed_time": "0:03:04", "remaining_time": "0:25:37"} +{"current_steps": 350, "total_steps": 3215, "loss": 2.1844, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.855207362554385e-05, "epoch": 0.11, "percentage": 10.89, "elapsed_time": "0:03:07", "remaining_time": "0:25:33"} +{"current_steps": 355, "total_steps": 3215, "loss": 2.048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.851082738892809e-05, "epoch": 0.11, "percentage": 11.04, "elapsed_time": "0:03:10", "remaining_time": "0:25:31"} +{"current_steps": 360, "total_steps": 3215, "loss": 1.9537, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8469019917180846e-05, "epoch": 0.11, "percentage": 11.2, "elapsed_time": "0:03:12", "remaining_time": "0:25:28"} +{"current_steps": 365, "total_steps": 3215, "loss": 1.9731, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8426652208302814e-05, "epoch": 0.11, "percentage": 11.35, "elapsed_time": "0:03:15", "remaining_time": "0:25:22"} +{"current_steps": 370, "total_steps": 3215, "loss": 2.1395, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.83837252736683e-05, "epoch": 0.12, "percentage": 11.51, "elapsed_time": "0:03:17", "remaining_time": "0:25:21"} +{"current_steps": 375, "total_steps": 3215, "loss": 2.0016, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.834024013800108e-05, "epoch": 0.12, "percentage": 11.66, "elapsed_time": "0:03:20", "remaining_time": "0:25:19"} +{"current_steps": 380, "total_steps": 3215, "loss": 1.9632, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8296197839349944e-05, "epoch": 0.12, "percentage": 11.82, "elapsed_time": "0:03:23", "remaining_time": "0:25:15"} +{"current_steps": 385, "total_steps": 3215, "loss": 2.3302, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.825159942906389e-05, "epoch": 0.12, "percentage": 11.98, "elapsed_time": "0:03:25", "remaining_time": "0:25:13"} +{"current_steps": 390, "total_steps": 3215, "loss": 2.1517, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.820644597176709e-05, "epoch": 0.12, "percentage": 12.13, "elapsed_time": "0:03:28", "remaining_time": "0:25:10"} +{"current_steps": 395, "total_steps": 3215, "loss": 2.1229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.81607385453334e-05, "epoch": 0.12, "percentage": 12.29, "elapsed_time": "0:03:31", "remaining_time": "0:25:07"} +{"current_steps": 400, "total_steps": 3215, "loss": 2.1382, "eval_loss": null, "predict_loss": null, "reward": null, 
"learning_rate": 4.81144782408607e-05, "epoch": 0.12, "percentage": 12.44, "elapsed_time": "0:03:33", "remaining_time": "0:25:05"} +{"current_steps": 405, "total_steps": 3215, "loss": 1.9614, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.8067666162644774e-05, "epoch": 0.13, "percentage": 12.6, "elapsed_time": "0:03:37", "remaining_time": "0:25:06"} +{"current_steps": 410, "total_steps": 3215, "loss": 2.1399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.802030342815304e-05, "epoch": 0.13, "percentage": 12.75, "elapsed_time": "0:03:39", "remaining_time": "0:25:02"} +{"current_steps": 415, "total_steps": 3215, "loss": 1.9034, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7972391167997754e-05, "epoch": 0.13, "percentage": 12.91, "elapsed_time": "0:03:42", "remaining_time": "0:24:59"} +{"current_steps": 420, "total_steps": 3215, "loss": 2.0075, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7923930525909156e-05, "epoch": 0.13, "percentage": 13.06, "elapsed_time": "0:03:45", "remaining_time": "0:24:57"} +{"current_steps": 425, "total_steps": 3215, "loss": 2.0105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7874922658708065e-05, "epoch": 0.13, "percentage": 13.22, "elapsed_time": "0:03:47", "remaining_time": "0:24:52"} +{"current_steps": 430, "total_steps": 3215, "loss": 2.0242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.782536873627832e-05, "epoch": 0.13, "percentage": 13.37, "elapsed_time": "0:03:50", "remaining_time": "0:24:50"} +{"current_steps": 435, "total_steps": 3215, "loss": 2.0267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.777526994153882e-05, "epoch": 0.14, "percentage": 13.53, "elapsed_time": "0:03:52", "remaining_time": "0:24:47"} +{"current_steps": 440, "total_steps": 3215, "loss": 1.9119, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7724627470415307e-05, "epoch": 0.14, "percentage": 13.69, "elapsed_time": "0:03:55", "remaining_time": "0:24:42"} +{"current_steps": 445, "total_steps": 3215, "loss": 2.2653, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7673442531811796e-05, "epoch": 0.14, "percentage": 13.84, "elapsed_time": "0:03:57", "remaining_time": "0:24:40"} +{"current_steps": 450, "total_steps": 3215, "loss": 2.0017, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.762171634758177e-05, "epoch": 0.14, "percentage": 14.0, "elapsed_time": "0:04:00", "remaining_time": "0:24:37"} +{"current_steps": 455, "total_steps": 3215, "loss": 2.1408, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.7569450152498927e-05, "epoch": 0.14, "percentage": 14.15, "elapsed_time": "0:04:03", "remaining_time": "0:24:34"} +{"current_steps": 460, "total_steps": 3215, "loss": 2.0935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.751664519422778e-05, "epoch": 0.14, "percentage": 14.31, "elapsed_time": "0:04:05", "remaining_time": "0:24:31"} +{"current_steps": 465, "total_steps": 3215, "loss": 2.1142, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.746330273329386e-05, "epoch": 0.14, "percentage": 14.46, "elapsed_time": "0:04:08", "remaining_time": "0:24:29"} +{"current_steps": 470, "total_steps": 3215, "loss": 2.1289, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.740942404305356e-05, "epoch": 
0.15, "percentage": 14.62, "elapsed_time": "0:04:11", "remaining_time": "0:24:26"} +{"current_steps": 475, "total_steps": 3215, "loss": 1.9741, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.735501040966383e-05, "epoch": 0.15, "percentage": 14.77, "elapsed_time": "0:04:13", "remaining_time": "0:24:24"} +{"current_steps": 480, "total_steps": 3215, "loss": 2.088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.730006313205143e-05, "epoch": 0.15, "percentage": 14.93, "elapsed_time": "0:04:16", "remaining_time": "0:24:22"} +{"current_steps": 485, "total_steps": 3215, "loss": 2.2079, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.724458352188192e-05, "epoch": 0.15, "percentage": 15.09, "elapsed_time": "0:04:19", "remaining_time": "0:24:19"} +{"current_steps": 490, "total_steps": 3215, "loss": 2.048, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.718857290352835e-05, "epoch": 0.15, "percentage": 15.24, "elapsed_time": "0:04:21", "remaining_time": "0:24:14"} +{"current_steps": 495, "total_steps": 3215, "loss": 2.2569, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.713203261403966e-05, "epoch": 0.15, "percentage": 15.4, "elapsed_time": "0:04:24", "remaining_time": "0:24:11"} +{"current_steps": 500, "total_steps": 3215, "loss": 1.9574, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.707496400310874e-05, "epoch": 0.16, "percentage": 15.55, "elapsed_time": "0:04:26", "remaining_time": "0:24:09"} +{"current_steps": 505, "total_steps": 3215, "loss": 2.0951, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.701736843304025e-05, "epoch": 0.16, "percentage": 15.71, "elapsed_time": "0:04:29", "remaining_time": "0:24:08"} +{"current_steps": 510, "total_steps": 3215, "loss": 2.0253, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.695924727871805e-05, "epoch": 0.16, "percentage": 15.86, "elapsed_time": "0:04:32", "remaining_time": "0:24:05"} +{"current_steps": 515, "total_steps": 3215, "loss": 2.0602, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.690060192757242e-05, "epoch": 0.16, "percentage": 16.02, "elapsed_time": "0:04:35", "remaining_time": "0:24:02"} +{"current_steps": 520, "total_steps": 3215, "loss": 2.0386, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.684143377954691e-05, "epoch": 0.16, "percentage": 16.17, "elapsed_time": "0:04:37", "remaining_time": "0:23:57"} +{"current_steps": 525, "total_steps": 3215, "loss": 2.073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6781744247064955e-05, "epoch": 0.16, "percentage": 16.33, "elapsed_time": "0:04:39", "remaining_time": "0:23:54"} +{"current_steps": 530, "total_steps": 3215, "loss": 2.1443, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6721534754996125e-05, "epoch": 0.16, "percentage": 16.49, "elapsed_time": "0:04:42", "remaining_time": "0:23:51"} +{"current_steps": 535, "total_steps": 3215, "loss": 2.0288, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.666080674062213e-05, "epoch": 0.17, "percentage": 16.64, "elapsed_time": "0:04:45", "remaining_time": "0:23:48"} +{"current_steps": 540, "total_steps": 3215, "loss": 2.0609, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.659956165360251e-05, "epoch": 0.17, "percentage": 16.8, "elapsed_time": "0:04:47", 
"remaining_time": "0:23:44"} +{"current_steps": 545, "total_steps": 3215, "loss": 1.9539, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6537800955940005e-05, "epoch": 0.17, "percentage": 16.95, "elapsed_time": "0:04:49", "remaining_time": "0:23:40"} +{"current_steps": 550, "total_steps": 3215, "loss": 2.149, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.647552612194572e-05, "epoch": 0.17, "percentage": 17.11, "elapsed_time": "0:04:52", "remaining_time": "0:23:37"} +{"current_steps": 555, "total_steps": 3215, "loss": 1.9722, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.641273863820383e-05, "epoch": 0.17, "percentage": 17.26, "elapsed_time": "0:04:55", "remaining_time": "0:23:35"} +{"current_steps": 560, "total_steps": 3215, "loss": 2.0729, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.634944000353622e-05, "epoch": 0.17, "percentage": 17.42, "elapsed_time": "0:04:57", "remaining_time": "0:23:31"} +{"current_steps": 565, "total_steps": 3215, "loss": 1.9507, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.628563172896655e-05, "epoch": 0.18, "percentage": 17.57, "elapsed_time": "0:05:00", "remaining_time": "0:23:29"} +{"current_steps": 570, "total_steps": 3215, "loss": 2.1643, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.6221315337684353e-05, "epoch": 0.18, "percentage": 17.73, "elapsed_time": "0:05:03", "remaining_time": "0:23:26"} +{"current_steps": 575, "total_steps": 3215, "loss": 2.1839, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.615649236500854e-05, "epoch": 0.18, "percentage": 17.88, "elapsed_time": "0:05:05", "remaining_time": "0:23:22"} +{"current_steps": 580, "total_steps": 3215, "loss": 2.0976, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.609116435835083e-05, "epoch": 0.18, "percentage": 18.04, "elapsed_time": "0:05:08", "remaining_time": "0:23:20"} +{"current_steps": 585, "total_steps": 3215, "loss": 2.1474, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.602533287717877e-05, "epoch": 0.18, "percentage": 18.2, "elapsed_time": "0:05:10", "remaining_time": "0:23:17"} +{"current_steps": 590, "total_steps": 3215, "loss": 2.1873, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5958999492978524e-05, "epoch": 0.18, "percentage": 18.35, "elapsed_time": "0:05:13", "remaining_time": "0:23:14"} +{"current_steps": 595, "total_steps": 3215, "loss": 2.1744, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.589216578921737e-05, "epoch": 0.19, "percentage": 18.51, "elapsed_time": "0:05:16", "remaining_time": "0:23:11"} +{"current_steps": 600, "total_steps": 3215, "loss": 1.9982, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.582483336130586e-05, "epoch": 0.19, "percentage": 18.66, "elapsed_time": "0:05:18", "remaining_time": "0:23:09"} +{"current_steps": 605, "total_steps": 3215, "loss": 2.1234, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.575700381655979e-05, "epoch": 0.19, "percentage": 18.82, "elapsed_time": "0:05:22", "remaining_time": "0:23:09"} +{"current_steps": 610, "total_steps": 3215, "loss": 1.9478, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5688678774161796e-05, "epoch": 0.19, "percentage": 18.97, "elapsed_time": "0:05:24", "remaining_time": "0:23:07"} +{"current_steps": 615, 
"total_steps": 3215, "loss": 1.8268, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.561985986512271e-05, "epoch": 0.19, "percentage": 19.13, "elapsed_time": "0:05:27", "remaining_time": "0:23:04"} +{"current_steps": 620, "total_steps": 3215, "loss": 1.9887, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.555054873224263e-05, "epoch": 0.19, "percentage": 19.28, "elapsed_time": "0:05:29", "remaining_time": "0:23:00"} +{"current_steps": 625, "total_steps": 3215, "loss": 2.0777, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.54807470300717e-05, "epoch": 0.19, "percentage": 19.44, "elapsed_time": "0:05:32", "remaining_time": "0:22:59"} +{"current_steps": 630, "total_steps": 3215, "loss": 2.0566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5410456424870596e-05, "epoch": 0.2, "percentage": 19.6, "elapsed_time": "0:05:35", "remaining_time": "0:22:56"} +{"current_steps": 635, "total_steps": 3215, "loss": 2.047, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5339678594570795e-05, "epoch": 0.2, "percentage": 19.75, "elapsed_time": "0:05:38", "remaining_time": "0:22:53"} +{"current_steps": 640, "total_steps": 3215, "loss": 1.962, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.526841522873449e-05, "epoch": 0.2, "percentage": 19.91, "elapsed_time": "0:05:40", "remaining_time": "0:22:50"} +{"current_steps": 645, "total_steps": 3215, "loss": 2.0972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.519666802851422e-05, "epoch": 0.2, "percentage": 20.06, "elapsed_time": "0:05:43", "remaining_time": "0:22:47"} +{"current_steps": 650, "total_steps": 3215, "loss": 2.0041, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.5124438706612376e-05, "epoch": 0.2, "percentage": 20.22, "elapsed_time": "0:05:45", "remaining_time": "0:22:43"} +{"current_steps": 655, "total_steps": 3215, "loss": 2.1229, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.505172898724018e-05, "epoch": 0.2, "percentage": 20.37, "elapsed_time": "0:05:48", "remaining_time": "0:22:41"} +{"current_steps": 660, "total_steps": 3215, "loss": 2.0195, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.497854060607662e-05, "epoch": 0.21, "percentage": 20.53, "elapsed_time": "0:05:50", "remaining_time": "0:22:37"} +{"current_steps": 665, "total_steps": 3215, "loss": 2.0745, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.490487531022699e-05, "epoch": 0.21, "percentage": 20.68, "elapsed_time": "0:05:53", "remaining_time": "0:22:34"} +{"current_steps": 670, "total_steps": 3215, "loss": 2.1068, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4830734858181145e-05, "epoch": 0.21, "percentage": 20.84, "elapsed_time": "0:05:55", "remaining_time": "0:22:31"} +{"current_steps": 675, "total_steps": 3215, "loss": 1.8088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.47561210197716e-05, "epoch": 0.21, "percentage": 21.0, "elapsed_time": "0:05:58", "remaining_time": "0:22:28"} +{"current_steps": 680, "total_steps": 3215, "loss": 2.0995, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4681035576131215e-05, "epoch": 0.21, "percentage": 21.15, "elapsed_time": "0:06:01", "remaining_time": "0:22:25"} +{"current_steps": 685, "total_steps": 3215, "loss": 2.0541, "eval_loss": null, 
"predict_loss": null, "reward": null, "learning_rate": 4.46054803196507e-05, "epoch": 0.21, "percentage": 21.31, "elapsed_time": "0:06:03", "remaining_time": "0:22:23"} +{"current_steps": 690, "total_steps": 3215, "loss": 2.166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.452945705393586e-05, "epoch": 0.21, "percentage": 21.46, "elapsed_time": "0:06:05", "remaining_time": "0:22:18"} +{"current_steps": 695, "total_steps": 3215, "loss": 2.0784, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.445296759376449e-05, "epoch": 0.22, "percentage": 21.62, "elapsed_time": "0:06:07", "remaining_time": "0:22:13"} +{"current_steps": 700, "total_steps": 3215, "loss": 2.2087, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.437601376504307e-05, "epoch": 0.22, "percentage": 21.77, "elapsed_time": "0:06:10", "remaining_time": "0:22:11"} +{"current_steps": 705, "total_steps": 3215, "loss": 2.1199, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.4298597404763186e-05, "epoch": 0.22, "percentage": 21.93, "elapsed_time": "0:06:14", "remaining_time": "0:22:11"} +{"current_steps": 710, "total_steps": 3215, "loss": 2.0355, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.422072036095768e-05, "epoch": 0.22, "percentage": 22.08, "elapsed_time": "0:06:16", "remaining_time": "0:22:10"} +{"current_steps": 715, "total_steps": 3215, "loss": 2.0011, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.414238449265654e-05, "epoch": 0.22, "percentage": 22.24, "elapsed_time": "0:06:19", "remaining_time": "0:22:07"} +{"current_steps": 720, "total_steps": 3215, "loss": 2.0368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.406359166984249e-05, "epoch": 0.22, "percentage": 22.4, "elapsed_time": "0:06:22", "remaining_time": "0:22:03"} +{"current_steps": 725, "total_steps": 3215, "loss": 1.9983, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.39843437734064e-05, "epoch": 0.23, "percentage": 22.55, "elapsed_time": "0:06:24", "remaining_time": "0:22:02"} +{"current_steps": 730, "total_steps": 3215, "loss": 2.021, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.390464269510233e-05, "epoch": 0.23, "percentage": 22.71, "elapsed_time": "0:06:27", "remaining_time": "0:22:00"} +{"current_steps": 735, "total_steps": 3215, "loss": 1.9743, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.382449033750244e-05, "epoch": 0.23, "percentage": 22.86, "elapsed_time": "0:06:30", "remaining_time": "0:21:56"} +{"current_steps": 740, "total_steps": 3215, "loss": 2.0689, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.37438886139515e-05, "epoch": 0.23, "percentage": 23.02, "elapsed_time": "0:06:32", "remaining_time": "0:21:53"} +{"current_steps": 745, "total_steps": 3215, "loss": 2.0838, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3662839448521264e-05, "epoch": 0.23, "percentage": 23.17, "elapsed_time": "0:06:35", "remaining_time": "0:21:50"} +{"current_steps": 750, "total_steps": 3215, "loss": 2.0835, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.358134477596454e-05, "epoch": 0.23, "percentage": 23.33, "elapsed_time": "0:06:37", "remaining_time": "0:21:47"} +{"current_steps": 755, "total_steps": 3215, "loss": 2.0916, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 
4.3499406541668966e-05, "epoch": 0.23, "percentage": 23.48, "elapsed_time": "0:06:40", "remaining_time": "0:21:43"} +{"current_steps": 760, "total_steps": 3215, "loss": 1.972, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3417026701610616e-05, "epoch": 0.24, "percentage": 23.64, "elapsed_time": "0:06:43", "remaining_time": "0:21:41"} +{"current_steps": 765, "total_steps": 3215, "loss": 1.927, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3334207222307275e-05, "epoch": 0.24, "percentage": 23.79, "elapsed_time": "0:06:45", "remaining_time": "0:21:39"} +{"current_steps": 770, "total_steps": 3215, "loss": 2.1192, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.325095008077154e-05, "epoch": 0.24, "percentage": 23.95, "elapsed_time": "0:06:48", "remaining_time": "0:21:35"} +{"current_steps": 775, "total_steps": 3215, "loss": 2.0774, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.316725726446353e-05, "epoch": 0.24, "percentage": 24.11, "elapsed_time": "0:06:50", "remaining_time": "0:21:32"} +{"current_steps": 780, "total_steps": 3215, "loss": 2.0847, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.3083130771243586e-05, "epoch": 0.24, "percentage": 24.26, "elapsed_time": "0:06:52", "remaining_time": "0:21:29"} +{"current_steps": 785, "total_steps": 3215, "loss": 2.0485, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.299857260932445e-05, "epoch": 0.24, "percentage": 24.42, "elapsed_time": "0:06:55", "remaining_time": "0:21:26"} +{"current_steps": 790, "total_steps": 3215, "loss": 2.1008, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2913584797223397e-05, "epoch": 0.25, "percentage": 24.57, "elapsed_time": "0:06:58", "remaining_time": "0:21:23"} +{"current_steps": 795, "total_steps": 3215, "loss": 1.9209, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2828169363714016e-05, "epoch": 0.25, "percentage": 24.73, "elapsed_time": "0:07:00", "remaining_time": "0:21:19"} +{"current_steps": 800, "total_steps": 3215, "loss": 1.9722, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.274232834777782e-05, "epoch": 0.25, "percentage": 24.88, "elapsed_time": "0:07:03", "remaining_time": "0:21:17"} +{"current_steps": 805, "total_steps": 3215, "loss": 1.9176, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2656063798555515e-05, "epoch": 0.25, "percentage": 25.04, "elapsed_time": "0:07:06", "remaining_time": "0:21:16"} +{"current_steps": 810, "total_steps": 3215, "loss": 1.9929, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.256937777529815e-05, "epoch": 0.25, "percentage": 25.19, "elapsed_time": "0:07:08", "remaining_time": "0:21:12"} +{"current_steps": 815, "total_steps": 3215, "loss": 2.166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2482272347317906e-05, "epoch": 0.25, "percentage": 25.35, "elapsed_time": "0:07:11", "remaining_time": "0:21:10"} +{"current_steps": 820, "total_steps": 3215, "loss": 2.1334, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2394749593938733e-05, "epoch": 0.25, "percentage": 25.51, "elapsed_time": "0:07:13", "remaining_time": "0:21:07"} +{"current_steps": 825, "total_steps": 3215, "loss": 2.0853, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.230681160444669e-05, "epoch": 0.26, 
"percentage": 25.66, "elapsed_time": "0:07:16", "remaining_time": "0:21:04"} +{"current_steps": 830, "total_steps": 3215, "loss": 2.1802, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.221846047804009e-05, "epoch": 0.26, "percentage": 25.82, "elapsed_time": "0:07:18", "remaining_time": "0:21:01"} +{"current_steps": 835, "total_steps": 3215, "loss": 2.0739, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.2129698323779366e-05, "epoch": 0.26, "percentage": 25.97, "elapsed_time": "0:07:21", "remaining_time": "0:20:58"} +{"current_steps": 840, "total_steps": 3215, "loss": 2.0238, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.204052726053676e-05, "epoch": 0.26, "percentage": 26.13, "elapsed_time": "0:07:23", "remaining_time": "0:20:54"} +{"current_steps": 845, "total_steps": 3215, "loss": 2.1557, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.195094941694571e-05, "epoch": 0.26, "percentage": 26.28, "elapsed_time": "0:07:26", "remaining_time": "0:20:51"} +{"current_steps": 850, "total_steps": 3215, "loss": 2.1666, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1860966931350054e-05, "epoch": 0.26, "percentage": 26.44, "elapsed_time": "0:07:28", "remaining_time": "0:20:48"} +{"current_steps": 855, "total_steps": 3215, "loss": 2.105, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1770581951752976e-05, "epoch": 0.27, "percentage": 26.59, "elapsed_time": "0:07:31", "remaining_time": "0:20:45"} +{"current_steps": 860, "total_steps": 3215, "loss": 1.9656, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1679796635765735e-05, "epoch": 0.27, "percentage": 26.75, "elapsed_time": "0:07:33", "remaining_time": "0:20:42"} +{"current_steps": 865, "total_steps": 3215, "loss": 2.0166, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.158861315055617e-05, "epoch": 0.27, "percentage": 26.91, "elapsed_time": "0:07:36", "remaining_time": "0:20:39"} +{"current_steps": 870, "total_steps": 3215, "loss": 2.0076, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1497033672796924e-05, "epoch": 0.27, "percentage": 27.06, "elapsed_time": "0:07:38", "remaining_time": "0:20:36"} +{"current_steps": 875, "total_steps": 3215, "loss": 2.1594, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.140506038861356e-05, "epoch": 0.27, "percentage": 27.22, "elapsed_time": "0:07:41", "remaining_time": "0:20:33"} +{"current_steps": 880, "total_steps": 3215, "loss": 2.1416, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.131269549353229e-05, "epoch": 0.27, "percentage": 27.37, "elapsed_time": "0:07:43", "remaining_time": "0:20:30"} +{"current_steps": 885, "total_steps": 3215, "loss": 2.1242, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.1219941192427644e-05, "epoch": 0.28, "percentage": 27.53, "elapsed_time": "0:07:46", "remaining_time": "0:20:27"} +{"current_steps": 890, "total_steps": 3215, "loss": 2.02, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.112679969946977e-05, "epoch": 0.28, "percentage": 27.68, "elapsed_time": "0:07:48", "remaining_time": "0:20:24"} +{"current_steps": 895, "total_steps": 3215, "loss": 2.0438, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.103327323807162e-05, "epoch": 0.28, "percentage": 27.84, "elapsed_time": "0:07:51", 
"remaining_time": "0:20:21"} +{"current_steps": 900, "total_steps": 3215, "loss": 1.9806, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.093936404083585e-05, "epoch": 0.28, "percentage": 27.99, "elapsed_time": "0:07:53", "remaining_time": "0:20:18"} +{"current_steps": 905, "total_steps": 3215, "loss": 2.1476, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0845074349501544e-05, "epoch": 0.28, "percentage": 28.15, "elapsed_time": "0:07:56", "remaining_time": "0:20:16"} +{"current_steps": 910, "total_steps": 3215, "loss": 1.9672, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.0750406414890695e-05, "epoch": 0.28, "percentage": 28.3, "elapsed_time": "0:07:59", "remaining_time": "0:20:13"} +{"current_steps": 915, "total_steps": 3215, "loss": 1.9984, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.065536249685448e-05, "epoch": 0.28, "percentage": 28.46, "elapsed_time": "0:08:01", "remaining_time": "0:20:11"} +{"current_steps": 920, "total_steps": 3215, "loss": 2.1162, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.055994486421929e-05, "epoch": 0.29, "percentage": 28.62, "elapsed_time": "0:08:04", "remaining_time": "0:20:08"} +{"current_steps": 925, "total_steps": 3215, "loss": 2.0435, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.04641557947326e-05, "epoch": 0.29, "percentage": 28.77, "elapsed_time": "0:08:06", "remaining_time": "0:20:04"} +{"current_steps": 930, "total_steps": 3215, "loss": 2.0431, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.036799757500856e-05, "epoch": 0.29, "percentage": 28.93, "elapsed_time": "0:08:09", "remaining_time": "0:20:01"} +{"current_steps": 935, "total_steps": 3215, "loss": 2.2021, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.027147250047348e-05, "epoch": 0.29, "percentage": 29.08, "elapsed_time": "0:08:11", "remaining_time": "0:19:59"} +{"current_steps": 940, "total_steps": 3215, "loss": 1.997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.017458287531094e-05, "epoch": 0.29, "percentage": 29.24, "elapsed_time": "0:08:14", "remaining_time": "0:19:56"} +{"current_steps": 945, "total_steps": 3215, "loss": 1.946, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 4.007733101240685e-05, "epoch": 0.29, "percentage": 29.39, "elapsed_time": "0:08:16", "remaining_time": "0:19:53"} +{"current_steps": 950, "total_steps": 3215, "loss": 2.0723, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.997971923329426e-05, "epoch": 0.3, "percentage": 29.55, "elapsed_time": "0:08:19", "remaining_time": "0:19:49"} +{"current_steps": 955, "total_steps": 3215, "loss": 2.034, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.988174986809783e-05, "epoch": 0.3, "percentage": 29.7, "elapsed_time": "0:08:21", "remaining_time": "0:19:47"} +{"current_steps": 960, "total_steps": 3215, "loss": 1.9736, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9783425255478355e-05, "epoch": 0.3, "percentage": 29.86, "elapsed_time": "0:08:24", "remaining_time": "0:19:44"} +{"current_steps": 965, "total_steps": 3215, "loss": 1.9878, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.968474774257682e-05, "epoch": 0.3, "percentage": 30.02, "elapsed_time": "0:08:26", "remaining_time": "0:19:41"} +{"current_steps": 970, 
"total_steps": 3215, "loss": 2.117, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.9585719684958446e-05, "epoch": 0.3, "percentage": 30.17, "elapsed_time": "0:08:29", "remaining_time": "0:19:38"} +{"current_steps": 975, "total_steps": 3215, "loss": 2.0585, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.948634344655639e-05, "epoch": 0.3, "percentage": 30.33, "elapsed_time": "0:08:31", "remaining_time": "0:19:35"} +{"current_steps": 980, "total_steps": 3215, "loss": 2.0409, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.938662139961538e-05, "epoch": 0.3, "percentage": 30.48, "elapsed_time": "0:08:33", "remaining_time": "0:19:32"} +{"current_steps": 985, "total_steps": 3215, "loss": 2.0369, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.928655592463508e-05, "epoch": 0.31, "percentage": 30.64, "elapsed_time": "0:08:36", "remaining_time": "0:19:29"} +{"current_steps": 990, "total_steps": 3215, "loss": 1.967, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.918614941031319e-05, "epoch": 0.31, "percentage": 30.79, "elapsed_time": "0:08:38", "remaining_time": "0:19:26"} +{"current_steps": 995, "total_steps": 3215, "loss": 2.0037, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.908540425348852e-05, "epoch": 0.31, "percentage": 30.95, "elapsed_time": "0:08:41", "remaining_time": "0:19:23"} +{"current_steps": 1000, "total_steps": 3215, "loss": 1.9991, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8984322859083725e-05, "epoch": 0.31, "percentage": 31.1, "elapsed_time": "0:08:44", "remaining_time": "0:19:20"} +{"current_steps": 1005, "total_steps": 3215, "loss": 2.0448, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8882907640047896e-05, "epoch": 0.31, "percentage": 31.26, "elapsed_time": "0:08:47", "remaining_time": "0:19:19"} +{"current_steps": 1010, "total_steps": 3215, "loss": 2.0791, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.878116101729897e-05, "epoch": 0.31, "percentage": 31.42, "elapsed_time": "0:08:49", "remaining_time": "0:19:16"} +{"current_steps": 1015, "total_steps": 3215, "loss": 1.9997, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.867908541966594e-05, "epoch": 0.32, "percentage": 31.57, "elapsed_time": "0:08:51", "remaining_time": "0:19:13"} +{"current_steps": 1020, "total_steps": 3215, "loss": 2.0481, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.857668328383088e-05, "epoch": 0.32, "percentage": 31.73, "elapsed_time": "0:08:54", "remaining_time": "0:19:10"} +{"current_steps": 1025, "total_steps": 3215, "loss": 2.2664, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.847395705427075e-05, "epoch": 0.32, "percentage": 31.88, "elapsed_time": "0:08:57", "remaining_time": "0:19:07"} +{"current_steps": 1030, "total_steps": 3215, "loss": 1.9752, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.837090918319909e-05, "epoch": 0.32, "percentage": 32.04, "elapsed_time": "0:08:59", "remaining_time": "0:19:04"} +{"current_steps": 1035, "total_steps": 3215, "loss": 2.1332, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.8267542130507436e-05, "epoch": 0.32, "percentage": 32.19, "elapsed_time": "0:09:02", "remaining_time": "0:19:02"} +{"current_steps": 1040, "total_steps": 3215, "loss": 2.0432, 
"eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.816385836370663e-05, "epoch": 0.32, "percentage": 32.35, "elapsed_time": "0:09:04", "remaining_time": "0:18:59"} +{"current_steps": 1045, "total_steps": 3215, "loss": 1.9618, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.805986035786789e-05, "epoch": 0.32, "percentage": 32.5, "elapsed_time": "0:09:07", "remaining_time": "0:18:56"} +{"current_steps": 1050, "total_steps": 3215, "loss": 2.0267, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.795555059556378e-05, "epoch": 0.33, "percentage": 32.66, "elapsed_time": "0:09:09", "remaining_time": "0:18:53"} +{"current_steps": 1055, "total_steps": 3215, "loss": 2.1075, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7850931566808866e-05, "epoch": 0.33, "percentage": 32.81, "elapsed_time": "0:09:12", "remaining_time": "0:18:51"} +{"current_steps": 1060, "total_steps": 3215, "loss": 2.156, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7746005769000363e-05, "epoch": 0.33, "percentage": 32.97, "elapsed_time": "0:09:14", "remaining_time": "0:18:48"} +{"current_steps": 1065, "total_steps": 3215, "loss": 1.9615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.764077570685844e-05, "epoch": 0.33, "percentage": 33.13, "elapsed_time": "0:09:17", "remaining_time": "0:18:45"} +{"current_steps": 1070, "total_steps": 3215, "loss": 2.0928, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.753524389236648e-05, "epoch": 0.33, "percentage": 33.28, "elapsed_time": "0:09:20", "remaining_time": "0:18:42"} +{"current_steps": 1075, "total_steps": 3215, "loss": 2.1074, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.742941284471111e-05, "epoch": 0.33, "percentage": 33.44, "elapsed_time": "0:09:22", "remaining_time": "0:18:39"} +{"current_steps": 1080, "total_steps": 3215, "loss": 1.9666, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7323285090222054e-05, "epoch": 0.34, "percentage": 33.59, "elapsed_time": "0:09:25", "remaining_time": "0:18:37"} +{"current_steps": 1085, "total_steps": 3215, "loss": 2.0468, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.721686316231181e-05, "epoch": 0.34, "percentage": 33.75, "elapsed_time": "0:09:28", "remaining_time": "0:18:35"} +{"current_steps": 1090, "total_steps": 3215, "loss": 2.0624, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.7110149601415215e-05, "epoch": 0.34, "percentage": 33.9, "elapsed_time": "0:09:30", "remaining_time": "0:18:32"} +{"current_steps": 1095, "total_steps": 3215, "loss": 1.9888, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.700314695492876e-05, "epoch": 0.34, "percentage": 34.06, "elapsed_time": "0:09:33", "remaining_time": "0:18:29"} +{"current_steps": 1100, "total_steps": 3215, "loss": 2.1013, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6895857777149825e-05, "epoch": 0.34, "percentage": 34.21, "elapsed_time": "0:09:35", "remaining_time": "0:18:27"} +{"current_steps": 1105, "total_steps": 3215, "loss": 1.875, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6788284629215624e-05, "epoch": 0.34, "percentage": 34.37, "elapsed_time": "0:09:38", "remaining_time": "0:18:25"} +{"current_steps": 1110, "total_steps": 3215, "loss": 1.9096, "eval_loss": null, "predict_loss": 
null, "reward": null, "learning_rate": 3.668043007904219e-05, "epoch": 0.35, "percentage": 34.53, "elapsed_time": "0:09:41", "remaining_time": "0:18:22"} +{"current_steps": 1115, "total_steps": 3215, "loss": 2.1859, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6572296701262966e-05, "epoch": 0.35, "percentage": 34.68, "elapsed_time": "0:09:44", "remaining_time": "0:18:19"} +{"current_steps": 1120, "total_steps": 3215, "loss": 2.2092, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.646388707716738e-05, "epoch": 0.35, "percentage": 34.84, "elapsed_time": "0:09:46", "remaining_time": "0:18:17"} +{"current_steps": 1125, "total_steps": 3215, "loss": 2.0026, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.635520379463926e-05, "epoch": 0.35, "percentage": 34.99, "elapsed_time": "0:09:49", "remaining_time": "0:18:14"} +{"current_steps": 1130, "total_steps": 3215, "loss": 2.2112, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6246249448095004e-05, "epoch": 0.35, "percentage": 35.15, "elapsed_time": "0:09:51", "remaining_time": "0:18:11"} +{"current_steps": 1135, "total_steps": 3215, "loss": 2.0221, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6137026638421696e-05, "epoch": 0.35, "percentage": 35.3, "elapsed_time": "0:09:54", "remaining_time": "0:18:09"} +{"current_steps": 1140, "total_steps": 3215, "loss": 1.9106, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.6027537972914974e-05, "epoch": 0.35, "percentage": 35.46, "elapsed_time": "0:09:57", "remaining_time": "0:18:06"} +{"current_steps": 1145, "total_steps": 3215, "loss": 2.0673, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5917786065216826e-05, "epoch": 0.36, "percentage": 35.61, "elapsed_time": "0:09:59", "remaining_time": "0:18:03"} +{"current_steps": 1150, "total_steps": 3215, "loss": 2.1463, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.580777353525318e-05, "epoch": 0.36, "percentage": 35.77, "elapsed_time": "0:10:01", "remaining_time": "0:18:00"} +{"current_steps": 1155, "total_steps": 3215, "loss": 2.0255, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5697503009171385e-05, "epoch": 0.36, "percentage": 35.93, "elapsed_time": "0:10:04", "remaining_time": "0:17:58"} +{"current_steps": 1160, "total_steps": 3215, "loss": 2.1348, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.558697711927748e-05, "epoch": 0.36, "percentage": 36.08, "elapsed_time": "0:10:07", "remaining_time": "0:17:55"} +{"current_steps": 1165, "total_steps": 3215, "loss": 2.1457, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.54761985039734e-05, "epoch": 0.36, "percentage": 36.24, "elapsed_time": "0:10:09", "remaining_time": "0:17:52"} +{"current_steps": 1170, "total_steps": 3215, "loss": 2.1256, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.5365169807693966e-05, "epoch": 0.36, "percentage": 36.39, "elapsed_time": "0:10:12", "remaining_time": "0:17:50"} +{"current_steps": 1175, "total_steps": 3215, "loss": 1.9587, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.525389368084379e-05, "epoch": 0.37, "percentage": 36.55, "elapsed_time": "0:10:14", "remaining_time": "0:17:47"} +{"current_steps": 1180, "total_steps": 3215, "loss": 1.8965, "eval_loss": null, "predict_loss": null, "reward": null, 
"learning_rate": 3.514237277973393e-05, "epoch": 0.37, "percentage": 36.7, "elapsed_time": "0:10:17", "remaining_time": "0:17:44"} +{"current_steps": 1185, "total_steps": 3215, "loss": 1.9669, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.503060976651862e-05, "epoch": 0.37, "percentage": 36.86, "elapsed_time": "0:10:20", "remaining_time": "0:17:42"} +{"current_steps": 1190, "total_steps": 3215, "loss": 2.003, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.491860730913156e-05, "epoch": 0.37, "percentage": 37.01, "elapsed_time": "0:10:22", "remaining_time": "0:17:39"} +{"current_steps": 1195, "total_steps": 3215, "loss": 2.1487, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.480636808122235e-05, "epoch": 0.37, "percentage": 37.17, "elapsed_time": "0:10:25", "remaining_time": "0:17:36"} +{"current_steps": 1200, "total_steps": 3215, "loss": 2.0686, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.469389476209259e-05, "epoch": 0.37, "percentage": 37.33, "elapsed_time": "0:10:27", "remaining_time": "0:17:33"} +{"current_steps": 1205, "total_steps": 3215, "loss": 2.0284, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.458119003663199e-05, "epoch": 0.37, "percentage": 37.48, "elapsed_time": "0:10:30", "remaining_time": "0:17:32"} +{"current_steps": 1210, "total_steps": 3215, "loss": 2.0555, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.446825659525421e-05, "epoch": 0.38, "percentage": 37.64, "elapsed_time": "0:10:33", "remaining_time": "0:17:30"} +{"current_steps": 1215, "total_steps": 3215, "loss": 1.9375, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.435509713383268e-05, "epoch": 0.38, "percentage": 37.79, "elapsed_time": "0:10:36", "remaining_time": "0:17:27"} +{"current_steps": 1220, "total_steps": 3215, "loss": 2.0271, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.424171435363623e-05, "epoch": 0.38, "percentage": 37.95, "elapsed_time": "0:10:38", "remaining_time": "0:17:24"} +{"current_steps": 1225, "total_steps": 3215, "loss": 2.1897, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.412811096126461e-05, "epoch": 0.38, "percentage": 38.1, "elapsed_time": "0:10:41", "remaining_time": "0:17:21"} +{"current_steps": 1230, "total_steps": 3215, "loss": 1.9978, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.401428966858387e-05, "epoch": 0.38, "percentage": 38.26, "elapsed_time": "0:10:44", "remaining_time": "0:17:19"} +{"current_steps": 1235, "total_steps": 3215, "loss": 2.0688, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.390025319266167e-05, "epoch": 0.38, "percentage": 38.41, "elapsed_time": "0:10:46", "remaining_time": "0:17:16"} +{"current_steps": 1240, "total_steps": 3215, "loss": 2.0396, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3786004255702336e-05, "epoch": 0.39, "percentage": 38.57, "elapsed_time": "0:10:49", "remaining_time": "0:17:13"} +{"current_steps": 1245, "total_steps": 3215, "loss": 1.9566, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3671545584981954e-05, "epoch": 0.39, "percentage": 38.72, "elapsed_time": "0:10:51", "remaining_time": "0:17:10"} +{"current_steps": 1250, "total_steps": 3215, "loss": 2.0474, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.355687991278324e-05, 
"epoch": 0.39, "percentage": 38.88, "elapsed_time": "0:10:54", "remaining_time": "0:17:08"} +{"current_steps": 1255, "total_steps": 3215, "loss": 2.2163, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3442009976330305e-05, "epoch": 0.39, "percentage": 39.04, "elapsed_time": "0:10:56", "remaining_time": "0:17:05"} +{"current_steps": 1260, "total_steps": 3215, "loss": 2.1088, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.332693851772331e-05, "epoch": 0.39, "percentage": 39.19, "elapsed_time": "0:10:59", "remaining_time": "0:17:02"} +{"current_steps": 1265, "total_steps": 3215, "loss": 1.8947, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3211668283873035e-05, "epoch": 0.39, "percentage": 39.35, "elapsed_time": "0:11:01", "remaining_time": "0:17:00"} +{"current_steps": 1270, "total_steps": 3215, "loss": 2.1748, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.3096202026435304e-05, "epoch": 0.39, "percentage": 39.5, "elapsed_time": "0:11:04", "remaining_time": "0:16:57"} +{"current_steps": 1275, "total_steps": 3215, "loss": 1.9218, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.298054250174527e-05, "epoch": 0.4, "percentage": 39.66, "elapsed_time": "0:11:06", "remaining_time": "0:16:54"} +{"current_steps": 1280, "total_steps": 3215, "loss": 2.2723, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2864692470751654e-05, "epoch": 0.4, "percentage": 39.81, "elapsed_time": "0:11:09", "remaining_time": "0:16:51"} +{"current_steps": 1285, "total_steps": 3215, "loss": 2.1456, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.27486546989508e-05, "epoch": 0.4, "percentage": 39.97, "elapsed_time": "0:11:11", "remaining_time": "0:16:49"} +{"current_steps": 1290, "total_steps": 3215, "loss": 1.8877, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.263243195632068e-05, "epoch": 0.4, "percentage": 40.12, "elapsed_time": "0:11:14", "remaining_time": "0:16:46"} +{"current_steps": 1295, "total_steps": 3215, "loss": 2.0615, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2516027017254785e-05, "epoch": 0.4, "percentage": 40.28, "elapsed_time": "0:11:16", "remaining_time": "0:16:43"} +{"current_steps": 1300, "total_steps": 3215, "loss": 2.0402, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.239944266049587e-05, "epoch": 0.4, "percentage": 40.44, "elapsed_time": "0:11:19", "remaining_time": "0:16:40"} +{"current_steps": 1305, "total_steps": 3215, "loss": 2.0728, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.228268166906962e-05, "epoch": 0.41, "percentage": 40.59, "elapsed_time": "0:11:22", "remaining_time": "0:16:39"} +{"current_steps": 1310, "total_steps": 3215, "loss": 2.1815, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.2165746830218254e-05, "epoch": 0.41, "percentage": 40.75, "elapsed_time": "0:11:25", "remaining_time": "0:16:36"} +{"current_steps": 1315, "total_steps": 3215, "loss": 1.8935, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.204864093533394e-05, "epoch": 0.41, "percentage": 40.9, "elapsed_time": "0:11:27", "remaining_time": "0:16:34"} +{"current_steps": 1320, "total_steps": 3215, "loss": 1.9567, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.193136677989221e-05, "epoch": 0.41, "percentage": 41.06, 
"elapsed_time": "0:11:30", "remaining_time": "0:16:31"} +{"current_steps": 1325, "total_steps": 3215, "loss": 2.055, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.181392716338516e-05, "epoch": 0.41, "percentage": 41.21, "elapsed_time": "0:11:33", "remaining_time": "0:16:28"} +{"current_steps": 1330, "total_steps": 3215, "loss": 1.8794, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1696324889254716e-05, "epoch": 0.41, "percentage": 41.37, "elapsed_time": "0:11:35", "remaining_time": "0:16:25"} +{"current_steps": 1335, "total_steps": 3215, "loss": 2.0299, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.15785627648256e-05, "epoch": 0.42, "percentage": 41.52, "elapsed_time": "0:11:38", "remaining_time": "0:16:23"} +{"current_steps": 1340, "total_steps": 3215, "loss": 1.9342, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.146064360123846e-05, "epoch": 0.42, "percentage": 41.68, "elapsed_time": "0:11:40", "remaining_time": "0:16:20"} +{"current_steps": 1345, "total_steps": 3215, "loss": 2.0399, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.1342570213382594e-05, "epoch": 0.42, "percentage": 41.84, "elapsed_time": "0:11:43", "remaining_time": "0:16:17"} +{"current_steps": 1350, "total_steps": 3215, "loss": 2.1419, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.122434541982888e-05, "epoch": 0.42, "percentage": 41.99, "elapsed_time": "0:11:45", "remaining_time": "0:16:15"} +{"current_steps": 1355, "total_steps": 3215, "loss": 2.2932, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.110597204276247e-05, "epoch": 0.42, "percentage": 42.15, "elapsed_time": "0:11:48", "remaining_time": "0:16:12"} +{"current_steps": 1360, "total_steps": 3215, "loss": 1.8989, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.098745290791539e-05, "epoch": 0.42, "percentage": 42.3, "elapsed_time": "0:11:50", "remaining_time": "0:16:09"} +{"current_steps": 1365, "total_steps": 3215, "loss": 2.1214, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.086879084449907e-05, "epoch": 0.42, "percentage": 42.46, "elapsed_time": "0:11:53", "remaining_time": "0:16:07"} +{"current_steps": 1370, "total_steps": 3215, "loss": 2.2538, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.074998868513688e-05, "epoch": 0.43, "percentage": 42.61, "elapsed_time": "0:11:55", "remaining_time": "0:16:04"} +{"current_steps": 1375, "total_steps": 3215, "loss": 2.0974, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0631049265796465e-05, "epoch": 0.43, "percentage": 42.77, "elapsed_time": "0:11:58", "remaining_time": "0:16:01"} +{"current_steps": 1380, "total_steps": 3215, "loss": 2.054, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.051197542572203e-05, "epoch": 0.43, "percentage": 42.92, "elapsed_time": "0:12:00", "remaining_time": "0:15:58"} +{"current_steps": 1385, "total_steps": 3215, "loss": 1.9798, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0392770007366584e-05, "epoch": 0.43, "percentage": 43.08, "elapsed_time": "0:12:03", "remaining_time": "0:15:56"} +{"current_steps": 1390, "total_steps": 3215, "loss": 2.0796, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0273435856324112e-05, "epoch": 0.43, "percentage": 43.23, "elapsed_time": "0:12:06", 
"remaining_time": "0:15:53"} +{"current_steps": 1395, "total_steps": 3215, "loss": 1.9116, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0153975821261605e-05, "epoch": 0.43, "percentage": 43.39, "elapsed_time": "0:12:08", "remaining_time": "0:15:50"} +{"current_steps": 1400, "total_steps": 3215, "loss": 2.0235, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 3.0034392753851066e-05, "epoch": 0.44, "percentage": 43.55, "elapsed_time": "0:12:11", "remaining_time": "0:15:48"} +{"current_steps": 1405, "total_steps": 3215, "loss": 2.1455, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9914689508701476e-05, "epoch": 0.44, "percentage": 43.7, "elapsed_time": "0:12:14", "remaining_time": "0:15:46"} +{"current_steps": 1410, "total_steps": 3215, "loss": 2.0355, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.979486894329058e-05, "epoch": 0.44, "percentage": 43.86, "elapsed_time": "0:12:17", "remaining_time": "0:15:43"} +{"current_steps": 1415, "total_steps": 3215, "loss": 2.0379, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9674933917896747e-05, "epoch": 0.44, "percentage": 44.01, "elapsed_time": "0:12:19", "remaining_time": "0:15:41"} +{"current_steps": 1420, "total_steps": 3215, "loss": 2.0802, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9554887295530647e-05, "epoch": 0.44, "percentage": 44.17, "elapsed_time": "0:12:22", "remaining_time": "0:15:38"} +{"current_steps": 1425, "total_steps": 3215, "loss": 2.1044, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.943473194186693e-05, "epoch": 0.44, "percentage": 44.32, "elapsed_time": "0:12:24", "remaining_time": "0:15:35"} +{"current_steps": 1430, "total_steps": 3215, "loss": 2.0121, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.9314470725175792e-05, "epoch": 0.44, "percentage": 44.48, "elapsed_time": "0:12:27", "remaining_time": "0:15:32"} +{"current_steps": 1435, "total_steps": 3215, "loss": 2.0717, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.919410651625455e-05, "epoch": 0.45, "percentage": 44.63, "elapsed_time": "0:12:30", "remaining_time": "0:15:30"} +{"current_steps": 1440, "total_steps": 3215, "loss": 1.9522, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.907364218835904e-05, "epoch": 0.45, "percentage": 44.79, "elapsed_time": "0:12:32", "remaining_time": "0:15:27"} +{"current_steps": 1445, "total_steps": 3215, "loss": 1.9593, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8953080617135115e-05, "epoch": 0.45, "percentage": 44.95, "elapsed_time": "0:12:35", "remaining_time": "0:15:25"} +{"current_steps": 1450, "total_steps": 3215, "loss": 1.8073, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8832424680549937e-05, "epoch": 0.45, "percentage": 45.1, "elapsed_time": "0:12:37", "remaining_time": "0:15:22"} +{"current_steps": 1455, "total_steps": 3215, "loss": 2.0042, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8711677258823306e-05, "epoch": 0.45, "percentage": 45.26, "elapsed_time": "0:12:40", "remaining_time": "0:15:19"} +{"current_steps": 1460, "total_steps": 3215, "loss": 1.9931, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.859084123435887e-05, "epoch": 0.45, "percentage": 45.41, "elapsed_time": "0:12:42", "remaining_time": "0:15:16"} 
+{"current_steps": 1465, "total_steps": 3215, "loss": 2.1533, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.84699194916754e-05, "epoch": 0.46, "percentage": 45.57, "elapsed_time": "0:12:45", "remaining_time": "0:15:14"} +{"current_steps": 1470, "total_steps": 3215, "loss": 2.029, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.834891491733781e-05, "epoch": 0.46, "percentage": 45.72, "elapsed_time": "0:12:48", "remaining_time": "0:15:11"} +{"current_steps": 1475, "total_steps": 3215, "loss": 2.0241, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.822783039988836e-05, "epoch": 0.46, "percentage": 45.88, "elapsed_time": "0:12:50", "remaining_time": "0:15:08"} +{"current_steps": 1480, "total_steps": 3215, "loss": 2.0959, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.8106668829777645e-05, "epoch": 0.46, "percentage": 46.03, "elapsed_time": "0:12:53", "remaining_time": "0:15:06"} +{"current_steps": 1485, "total_steps": 3215, "loss": 1.8718, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7985433099295618e-05, "epoch": 0.46, "percentage": 46.19, "elapsed_time": "0:12:55", "remaining_time": "0:15:03"} +{"current_steps": 1490, "total_steps": 3215, "loss": 2.2397, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7864126102502524e-05, "epoch": 0.46, "percentage": 46.35, "elapsed_time": "0:12:58", "remaining_time": "0:15:01"} +{"current_steps": 1495, "total_steps": 3215, "loss": 2.1083, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.774275073515985e-05, "epoch": 0.46, "percentage": 46.5, "elapsed_time": "0:13:00", "remaining_time": "0:14:58"} +{"current_steps": 1500, "total_steps": 3215, "loss": 2.0764, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7621309894661167e-05, "epoch": 0.47, "percentage": 46.66, "elapsed_time": "0:13:03", "remaining_time": "0:14:55"} +{"current_steps": 1505, "total_steps": 3215, "loss": 2.0955, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7499806479962997e-05, "epoch": 0.47, "percentage": 46.81, "elapsed_time": "0:13:06", "remaining_time": "0:14:53"} +{"current_steps": 1510, "total_steps": 3215, "loss": 2.0449, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7378243391515558e-05, "epoch": 0.47, "percentage": 46.97, "elapsed_time": "0:13:09", "remaining_time": "0:14:50"} +{"current_steps": 1515, "total_steps": 3215, "loss": 1.8368, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7256623531193605e-05, "epoch": 0.47, "percentage": 47.12, "elapsed_time": "0:13:11", "remaining_time": "0:14:48"} +{"current_steps": 1520, "total_steps": 3215, "loss": 2.024, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7134949802227073e-05, "epoch": 0.47, "percentage": 47.28, "elapsed_time": "0:13:14", "remaining_time": "0:14:45"} +{"current_steps": 1525, "total_steps": 3215, "loss": 2.0699, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.7013225109131836e-05, "epoch": 0.47, "percentage": 47.43, "elapsed_time": "0:13:16", "remaining_time": "0:14:42"} +{"current_steps": 1530, "total_steps": 3215, "loss": 1.953, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": 2.689145235764035e-05, "epoch": 0.48, "percentage": 47.59, "elapsed_time": "0:13:19", "remaining_time": "0:14:40"} +{"current_steps": 1534, 
"total_steps": 3215, "loss": null, "eval_loss": null, "predict_loss": null, "reward": null, "learning_rate": null, "epoch": 0.48, "percentage": 47.71, "elapsed_time": "0:13:21", "remaining_time": "0:14:38"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..904cb416e190d2053dec9a2ce80c8d85cbfe5c5b --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,2172 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4770085901970692, + "eval_steps": 500, + "global_step": 1534, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.47774845361709595, + "learning_rate": 4.999970160815579e-05, + "loss": 2.0765, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 0.6051416397094727, + "learning_rate": 4.999880643974619e-05, + "loss": 2.2297, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 0.6161717772483826, + "learning_rate": 4.9997314516140056e-05, + "loss": 2.1103, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.4686434268951416, + "learning_rate": 4.999522587295162e-05, + "loss": 2.0057, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.8412289023399353, + "learning_rate": 4.999254056003963e-05, + "loss": 2.1778, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 0.5333625078201294, + "learning_rate": 4.99892586415061e-05, + "loss": 2.2399, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 0.821148157119751, + "learning_rate": 4.9985380195694856e-05, + "loss": 2.3215, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 0.8403909206390381, + "learning_rate": 4.998090531518962e-05, + "loss": 1.8295, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.6633398532867432, + "learning_rate": 4.9975834106811834e-05, + "loss": 2.0195, + "step": 45 + }, + { + "epoch": 0.02, + "grad_norm": 0.6386868357658386, + "learning_rate": 4.997016669161806e-05, + "loss": 2.1257, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.7762248516082764, + "learning_rate": 4.996390320489715e-05, + "loss": 2.057, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 1.3192856311798096, + "learning_rate": 4.9957043796166966e-05, + "loss": 2.0753, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 0.9797518849372864, + "learning_rate": 4.994958862917083e-05, + "loss": 1.9736, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 1.000693440437317, + "learning_rate": 4.994153788187363e-05, + "loss": 2.1572, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.6852813959121704, + "learning_rate": 4.993289174645757e-05, + "loss": 2.1491, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 1.0075691938400269, + "learning_rate": 4.992365042931752e-05, + "loss": 1.945, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 1.1973133087158203, + "learning_rate": 4.991381415105619e-05, + "loss": 2.0811, + "step": 85 + }, + { + "epoch": 0.03, + "grad_norm": 0.9927239418029785, + "learning_rate": 4.990338314647881e-05, + "loss": 1.961, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 0.9499759674072266, + "learning_rate": 4.98923576645875e-05, + "loss": 2.0653, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 0.7233040928840637, + "learning_rate": 4.9880737968575365e-05, + "loss": 1.9999, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 1.55235755443573, + "learning_rate": 4.986852433582022e-05, + "loss": 2.2258, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.9007890820503235, + 
"learning_rate": 4.985571705787793e-05, + "loss": 2.1034, + "step": 110 + }, + { + "epoch": 0.04, + "grad_norm": 0.6774860620498657, + "learning_rate": 4.9842316440475475e-05, + "loss": 2.1753, + "step": 115 + }, + { + "epoch": 0.04, + "grad_norm": 0.7676737308502197, + "learning_rate": 4.9828322803503665e-05, + "loss": 2.1384, + "step": 120 + }, + { + "epoch": 0.04, + "grad_norm": 0.9624544978141785, + "learning_rate": 4.981373648100946e-05, + "loss": 2.0521, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 0.9315722584724426, + "learning_rate": 4.979855782118802e-05, + "loss": 1.9256, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.9035864472389221, + "learning_rate": 4.978278718637443e-05, + "loss": 2.0882, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.7997236251831055, + "learning_rate": 4.9766424953035e-05, + "loss": 2.0724, + "step": 140 + }, + { + "epoch": 0.05, + "grad_norm": 1.0692921876907349, + "learning_rate": 4.974947151175826e-05, + "loss": 2.1329, + "step": 145 + }, + { + "epoch": 0.05, + "grad_norm": 0.9506180286407471, + "learning_rate": 4.973192726724572e-05, + "loss": 2.082, + "step": 150 + }, + { + "epoch": 0.05, + "grad_norm": 0.8647387027740479, + "learning_rate": 4.9713792638302145e-05, + "loss": 2.0366, + "step": 155 + }, + { + "epoch": 0.05, + "grad_norm": 1.105302095413208, + "learning_rate": 4.969506805782555e-05, + "loss": 2.1481, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.7593303918838501, + "learning_rate": 4.967575397279689e-05, + "loss": 2.032, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7521979808807373, + "learning_rate": 4.965585084426943e-05, + "loss": 2.0379, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.947120726108551, + "learning_rate": 4.9635359147357655e-05, + "loss": 2.1444, + "step": 175 + }, + { + "epoch": 0.06, + "grad_norm": 1.2184454202651978, + "learning_rate": 4.961427937122598e-05, + "loss": 1.9164, + "step": 180 + }, + { + "epoch": 0.06, + "grad_norm": 1.221663475036621, + "learning_rate": 4.959261201907707e-05, + "loss": 2.0084, + "step": 185 + }, + { + "epoch": 0.06, + "grad_norm": 1.0457361936569214, + "learning_rate": 4.957035760813982e-05, + "loss": 2.2032, + "step": 190 + }, + { + "epoch": 0.06, + "grad_norm": 0.8834909200668335, + "learning_rate": 4.954751666965701e-05, + "loss": 2.2101, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.791902482509613, + "learning_rate": 4.9524089748872615e-05, + "loss": 2.0472, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 1.2905739545822144, + "learning_rate": 4.9500077405018807e-05, + "loss": 2.0987, + "step": 205 + }, + { + "epoch": 0.07, + "grad_norm": 0.8612006306648254, + "learning_rate": 4.9475480211302583e-05, + "loss": 2.1765, + "step": 210 + }, + { + "epoch": 0.07, + "grad_norm": 1.3128459453582764, + "learning_rate": 4.945029875489212e-05, + "loss": 1.9926, + "step": 215 + }, + { + "epoch": 0.07, + "grad_norm": 0.9610918164253235, + "learning_rate": 4.94245336369027e-05, + "loss": 2.0124, + "step": 220 + }, + { + "epoch": 0.07, + "grad_norm": 0.873160183429718, + "learning_rate": 4.939818547238241e-05, + "loss": 2.2229, + "step": 225 + }, + { + "epoch": 0.07, + "grad_norm": 1.5535285472869873, + "learning_rate": 4.9371254890297446e-05, + "loss": 2.2013, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 1.1951836347579956, + "learning_rate": 4.93437425335171e-05, + "loss": 2.014, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7874170541763306, + "learning_rate": 4.9315649058798384e-05, + 
"loss": 2.1701, + "step": 240 + }, + { + "epoch": 0.08, + "grad_norm": 1.3503323793411255, + "learning_rate": 4.928697513677042e-05, + "loss": 2.1681, + "step": 245 + }, + { + "epoch": 0.08, + "grad_norm": 1.3091179132461548, + "learning_rate": 4.925772145191834e-05, + "loss": 2.1224, + "step": 250 + }, + { + "epoch": 0.08, + "grad_norm": 1.4428555965423584, + "learning_rate": 4.9227888702567044e-05, + "loss": 2.0512, + "step": 255 + }, + { + "epoch": 0.08, + "grad_norm": 0.8234395980834961, + "learning_rate": 4.9197477600864446e-05, + "loss": 2.1067, + "step": 260 + }, + { + "epoch": 0.08, + "grad_norm": 1.9094969034194946, + "learning_rate": 4.9166488872764526e-05, + "loss": 1.8884, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 1.0074087381362915, + "learning_rate": 4.913492325800999e-05, + "loss": 1.9345, + "step": 270 + }, + { + "epoch": 0.09, + "grad_norm": 1.0867297649383545, + "learning_rate": 4.910278151011458e-05, + "loss": 2.1928, + "step": 275 + }, + { + "epoch": 0.09, + "grad_norm": 0.6842357516288757, + "learning_rate": 4.907006439634516e-05, + "loss": 2.0407, + "step": 280 + }, + { + "epoch": 0.09, + "grad_norm": 0.8409023284912109, + "learning_rate": 4.903677269770329e-05, + "loss": 2.2344, + "step": 285 + }, + { + "epoch": 0.09, + "grad_norm": 0.8119503259658813, + "learning_rate": 4.900290720890671e-05, + "loss": 2.1296, + "step": 290 + }, + { + "epoch": 0.09, + "grad_norm": 0.9938147068023682, + "learning_rate": 4.8968468738370244e-05, + "loss": 2.152, + "step": 295 + }, + { + "epoch": 0.09, + "grad_norm": 0.9865244030952454, + "learning_rate": 4.8933458108186606e-05, + "loss": 1.9623, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 1.3944802284240723, + "learning_rate": 4.889787615410672e-05, + "loss": 1.915, + "step": 305 + }, + { + "epoch": 0.1, + "grad_norm": 1.3749767541885376, + "learning_rate": 4.886172372551977e-05, + "loss": 1.9934, + "step": 310 + }, + { + "epoch": 0.1, + "grad_norm": 0.9024938941001892, + "learning_rate": 4.882500168543294e-05, + "loss": 2.1541, + "step": 315 + }, + { + "epoch": 0.1, + "grad_norm": 1.1978263854980469, + "learning_rate": 4.878771091045082e-05, + "loss": 2.1688, + "step": 320 + }, + { + "epoch": 0.1, + "grad_norm": 0.8360010981559753, + "learning_rate": 4.874985229075446e-05, + "loss": 2.1387, + "step": 325 + }, + { + "epoch": 0.1, + "grad_norm": 0.7683364152908325, + "learning_rate": 4.871142673008012e-05, + "loss": 2.0215, + "step": 330 + }, + { + "epoch": 0.1, + "grad_norm": 1.4230670928955078, + "learning_rate": 4.867243514569772e-05, + "loss": 1.9491, + "step": 335 + }, + { + "epoch": 0.11, + "grad_norm": 0.8198773860931396, + "learning_rate": 4.863287846838891e-05, + "loss": 2.0151, + "step": 340 + }, + { + "epoch": 0.11, + "grad_norm": 1.467207908630371, + "learning_rate": 4.85927576424249e-05, + "loss": 1.8906, + "step": 345 + }, + { + "epoch": 0.11, + "grad_norm": 0.9537095427513123, + "learning_rate": 4.855207362554385e-05, + "loss": 2.1844, + "step": 350 + }, + { + "epoch": 0.11, + "grad_norm": 1.0757155418395996, + "learning_rate": 4.851082738892809e-05, + "loss": 2.048, + "step": 355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6884938478469849, + "learning_rate": 4.8469019917180846e-05, + "loss": 1.9537, + "step": 360 + }, + { + "epoch": 0.11, + "grad_norm": 1.4680182933807373, + "learning_rate": 4.8426652208302814e-05, + "loss": 1.9731, + "step": 365 + }, + { + "epoch": 0.12, + "grad_norm": 1.1778632402420044, + "learning_rate": 4.83837252736683e-05, + "loss": 2.1395, + "step": 370 + }, + { + "epoch": 
0.12, + "grad_norm": 1.2865056991577148, + "learning_rate": 4.834024013800108e-05, + "loss": 2.0016, + "step": 375 + }, + { + "epoch": 0.12, + "grad_norm": 1.055177092552185, + "learning_rate": 4.8296197839349944e-05, + "loss": 1.9632, + "step": 380 + }, + { + "epoch": 0.12, + "grad_norm": 1.0041871070861816, + "learning_rate": 4.825159942906389e-05, + "loss": 2.3302, + "step": 385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0026438236236572, + "learning_rate": 4.820644597176709e-05, + "loss": 2.1517, + "step": 390 + }, + { + "epoch": 0.12, + "grad_norm": 1.3532180786132812, + "learning_rate": 4.81607385453334e-05, + "loss": 2.1229, + "step": 395 + }, + { + "epoch": 0.12, + "grad_norm": 0.7670988440513611, + "learning_rate": 4.81144782408607e-05, + "loss": 2.1382, + "step": 400 + }, + { + "epoch": 0.13, + "grad_norm": 1.0405700206756592, + "learning_rate": 4.8067666162644774e-05, + "loss": 1.9614, + "step": 405 + }, + { + "epoch": 0.13, + "grad_norm": 1.2252662181854248, + "learning_rate": 4.802030342815304e-05, + "loss": 2.1399, + "step": 410 + }, + { + "epoch": 0.13, + "grad_norm": 1.237946629524231, + "learning_rate": 4.7972391167997754e-05, + "loss": 1.9034, + "step": 415 + }, + { + "epoch": 0.13, + "grad_norm": 0.8064705729484558, + "learning_rate": 4.7923930525909156e-05, + "loss": 2.0075, + "step": 420 + }, + { + "epoch": 0.13, + "grad_norm": 0.8717565536499023, + "learning_rate": 4.7874922658708065e-05, + "loss": 2.0105, + "step": 425 + }, + { + "epoch": 0.13, + "grad_norm": 1.6693098545074463, + "learning_rate": 4.782536873627832e-05, + "loss": 2.0242, + "step": 430 + }, + { + "epoch": 0.14, + "grad_norm": 0.82447350025177, + "learning_rate": 4.777526994153882e-05, + "loss": 2.0267, + "step": 435 + }, + { + "epoch": 0.14, + "grad_norm": 0.9926588535308838, + "learning_rate": 4.7724627470415307e-05, + "loss": 1.9119, + "step": 440 + }, + { + "epoch": 0.14, + "grad_norm": 1.0924450159072876, + "learning_rate": 4.7673442531811796e-05, + "loss": 2.2653, + "step": 445 + }, + { + "epoch": 0.14, + "grad_norm": 1.1592103242874146, + "learning_rate": 4.762171634758177e-05, + "loss": 2.0017, + "step": 450 + }, + { + "epoch": 0.14, + "grad_norm": 0.9172110557556152, + "learning_rate": 4.7569450152498927e-05, + "loss": 2.1408, + "step": 455 + }, + { + "epoch": 0.14, + "grad_norm": 1.1897525787353516, + "learning_rate": 4.751664519422778e-05, + "loss": 2.0935, + "step": 460 + }, + { + "epoch": 0.14, + "grad_norm": 0.8793094158172607, + "learning_rate": 4.746330273329386e-05, + "loss": 2.1142, + "step": 465 + }, + { + "epoch": 0.15, + "grad_norm": 1.4337489604949951, + "learning_rate": 4.740942404305356e-05, + "loss": 2.1289, + "step": 470 + }, + { + "epoch": 0.15, + "grad_norm": 1.0251764059066772, + "learning_rate": 4.735501040966383e-05, + "loss": 1.9741, + "step": 475 + }, + { + "epoch": 0.15, + "grad_norm": 1.2659822702407837, + "learning_rate": 4.730006313205143e-05, + "loss": 2.088, + "step": 480 + }, + { + "epoch": 0.15, + "grad_norm": 0.8884140849113464, + "learning_rate": 4.724458352188192e-05, + "loss": 2.2079, + "step": 485 + }, + { + "epoch": 0.15, + "grad_norm": 1.1937768459320068, + "learning_rate": 4.718857290352835e-05, + "loss": 2.048, + "step": 490 + }, + { + "epoch": 0.15, + "grad_norm": 0.9741552472114563, + "learning_rate": 4.713203261403966e-05, + "loss": 2.2569, + "step": 495 + }, + { + "epoch": 0.16, + "grad_norm": 0.7996780872344971, + "learning_rate": 4.707496400310874e-05, + "loss": 1.9574, + "step": 500 + }, + { + "epoch": 0.16, + "grad_norm": 1.8182051181793213, + 
"learning_rate": 4.701736843304025e-05, + "loss": 2.0951, + "step": 505 + }, + { + "epoch": 0.16, + "grad_norm": 1.507320761680603, + "learning_rate": 4.695924727871805e-05, + "loss": 2.0253, + "step": 510 + }, + { + "epoch": 0.16, + "grad_norm": 0.759121835231781, + "learning_rate": 4.690060192757242e-05, + "loss": 2.0602, + "step": 515 + }, + { + "epoch": 0.16, + "grad_norm": 1.5943195819854736, + "learning_rate": 4.684143377954691e-05, + "loss": 2.0386, + "step": 520 + }, + { + "epoch": 0.16, + "grad_norm": 0.8568710088729858, + "learning_rate": 4.6781744247064955e-05, + "loss": 2.073, + "step": 525 + }, + { + "epoch": 0.16, + "grad_norm": 1.3352620601654053, + "learning_rate": 4.6721534754996125e-05, + "loss": 2.1443, + "step": 530 + }, + { + "epoch": 0.17, + "grad_norm": 1.3417474031448364, + "learning_rate": 4.666080674062213e-05, + "loss": 2.0288, + "step": 535 + }, + { + "epoch": 0.17, + "grad_norm": 1.5334464311599731, + "learning_rate": 4.659956165360251e-05, + "loss": 2.0609, + "step": 540 + }, + { + "epoch": 0.17, + "grad_norm": 0.9658721089363098, + "learning_rate": 4.6537800955940005e-05, + "loss": 1.9539, + "step": 545 + }, + { + "epoch": 0.17, + "grad_norm": 1.9197947978973389, + "learning_rate": 4.647552612194572e-05, + "loss": 2.149, + "step": 550 + }, + { + "epoch": 0.17, + "grad_norm": 0.8512137532234192, + "learning_rate": 4.641273863820383e-05, + "loss": 1.9722, + "step": 555 + }, + { + "epoch": 0.17, + "grad_norm": 1.827289342880249, + "learning_rate": 4.634944000353622e-05, + "loss": 2.0729, + "step": 560 + }, + { + "epoch": 0.18, + "grad_norm": 1.088416337966919, + "learning_rate": 4.628563172896655e-05, + "loss": 1.9507, + "step": 565 + }, + { + "epoch": 0.18, + "grad_norm": 1.3566908836364746, + "learning_rate": 4.6221315337684353e-05, + "loss": 2.1643, + "step": 570 + }, + { + "epoch": 0.18, + "grad_norm": 1.3541293144226074, + "learning_rate": 4.615649236500854e-05, + "loss": 2.1839, + "step": 575 + }, + { + "epoch": 0.18, + "grad_norm": 0.991269588470459, + "learning_rate": 4.609116435835083e-05, + "loss": 2.0976, + "step": 580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0280535221099854, + "learning_rate": 4.602533287717877e-05, + "loss": 2.1474, + "step": 585 + }, + { + "epoch": 0.18, + "grad_norm": 1.013123631477356, + "learning_rate": 4.5958999492978524e-05, + "loss": 2.1873, + "step": 590 + }, + { + "epoch": 0.19, + "grad_norm": 1.1753040552139282, + "learning_rate": 4.589216578921737e-05, + "loss": 2.1744, + "step": 595 + }, + { + "epoch": 0.19, + "grad_norm": 1.1839090585708618, + "learning_rate": 4.582483336130586e-05, + "loss": 1.9982, + "step": 600 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724798440933228, + "learning_rate": 4.575700381655979e-05, + "loss": 2.1234, + "step": 605 + }, + { + "epoch": 0.19, + "grad_norm": 2.009913682937622, + "learning_rate": 4.5688678774161796e-05, + "loss": 1.9478, + "step": 610 + }, + { + "epoch": 0.19, + "grad_norm": 0.9897060394287109, + "learning_rate": 4.561985986512271e-05, + "loss": 1.8268, + "step": 615 + }, + { + "epoch": 0.19, + "grad_norm": 0.8881808519363403, + "learning_rate": 4.555054873224263e-05, + "loss": 1.9887, + "step": 620 + }, + { + "epoch": 0.19, + "grad_norm": 1.155900001525879, + "learning_rate": 4.54807470300717e-05, + "loss": 2.0777, + "step": 625 + }, + { + "epoch": 0.2, + "grad_norm": 0.8782421350479126, + "learning_rate": 4.5410456424870596e-05, + "loss": 2.0566, + "step": 630 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324674367904663, + "learning_rate": 4.5339678594570795e-05, + "loss": 
2.047, + "step": 635 + }, + { + "epoch": 0.2, + "grad_norm": 1.9805939197540283, + "learning_rate": 4.526841522873449e-05, + "loss": 1.962, + "step": 640 + }, + { + "epoch": 0.2, + "grad_norm": 1.4999943971633911, + "learning_rate": 4.519666802851422e-05, + "loss": 2.0972, + "step": 645 + }, + { + "epoch": 0.2, + "grad_norm": 1.4504961967468262, + "learning_rate": 4.5124438706612376e-05, + "loss": 2.0041, + "step": 650 + }, + { + "epoch": 0.2, + "grad_norm": 0.9078169465065002, + "learning_rate": 4.505172898724018e-05, + "loss": 2.1229, + "step": 655 + }, + { + "epoch": 0.21, + "grad_norm": 1.1635804176330566, + "learning_rate": 4.497854060607662e-05, + "loss": 2.0195, + "step": 660 + }, + { + "epoch": 0.21, + "grad_norm": 1.46576726436615, + "learning_rate": 4.490487531022699e-05, + "loss": 2.0745, + "step": 665 + }, + { + "epoch": 0.21, + "grad_norm": 1.2094652652740479, + "learning_rate": 4.4830734858181145e-05, + "loss": 2.1068, + "step": 670 + }, + { + "epoch": 0.21, + "grad_norm": 1.4738895893096924, + "learning_rate": 4.47561210197716e-05, + "loss": 1.8088, + "step": 675 + }, + { + "epoch": 0.21, + "grad_norm": 1.23384690284729, + "learning_rate": 4.4681035576131215e-05, + "loss": 2.0995, + "step": 680 + }, + { + "epoch": 0.21, + "grad_norm": 0.8332946300506592, + "learning_rate": 4.46054803196507e-05, + "loss": 2.0541, + "step": 685 + }, + { + "epoch": 0.21, + "grad_norm": 0.9207485318183899, + "learning_rate": 4.452945705393586e-05, + "loss": 2.166, + "step": 690 + }, + { + "epoch": 0.22, + "grad_norm": 1.292945146560669, + "learning_rate": 4.445296759376449e-05, + "loss": 2.0784, + "step": 695 + }, + { + "epoch": 0.22, + "grad_norm": 0.9874763488769531, + "learning_rate": 4.437601376504307e-05, + "loss": 2.2087, + "step": 700 + }, + { + "epoch": 0.22, + "grad_norm": 0.9427415132522583, + "learning_rate": 4.4298597404763186e-05, + "loss": 2.1199, + "step": 705 + }, + { + "epoch": 0.22, + "grad_norm": 1.7369529008865356, + "learning_rate": 4.422072036095768e-05, + "loss": 2.0355, + "step": 710 + }, + { + "epoch": 0.22, + "grad_norm": 1.2423696517944336, + "learning_rate": 4.414238449265654e-05, + "loss": 2.0011, + "step": 715 + }, + { + "epoch": 0.22, + "grad_norm": 1.2304831743240356, + "learning_rate": 4.406359166984249e-05, + "loss": 2.0368, + "step": 720 + }, + { + "epoch": 0.23, + "grad_norm": 0.9090413451194763, + "learning_rate": 4.39843437734064e-05, + "loss": 1.9983, + "step": 725 + }, + { + "epoch": 0.23, + "grad_norm": 1.2729507684707642, + "learning_rate": 4.390464269510233e-05, + "loss": 2.021, + "step": 730 + }, + { + "epoch": 0.23, + "grad_norm": 1.3009227514266968, + "learning_rate": 4.382449033750244e-05, + "loss": 1.9743, + "step": 735 + }, + { + "epoch": 0.23, + "grad_norm": 1.5456056594848633, + "learning_rate": 4.37438886139515e-05, + "loss": 2.0689, + "step": 740 + }, + { + "epoch": 0.23, + "grad_norm": 1.3235007524490356, + "learning_rate": 4.3662839448521264e-05, + "loss": 2.0838, + "step": 745 + }, + { + "epoch": 0.23, + "grad_norm": 2.2074007987976074, + "learning_rate": 4.358134477596454e-05, + "loss": 2.0835, + "step": 750 + }, + { + "epoch": 0.23, + "grad_norm": 1.403738021850586, + "learning_rate": 4.3499406541668966e-05, + "loss": 2.0916, + "step": 755 + }, + { + "epoch": 0.24, + "grad_norm": 1.0940325260162354, + "learning_rate": 4.3417026701610616e-05, + "loss": 1.972, + "step": 760 + }, + { + "epoch": 0.24, + "grad_norm": 1.666353702545166, + "learning_rate": 4.3334207222307275e-05, + "loss": 1.927, + "step": 765 + }, + { + "epoch": 0.24, + 
"grad_norm": 1.0777515172958374, + "learning_rate": 4.325095008077154e-05, + "loss": 2.1192, + "step": 770 + }, + { + "epoch": 0.24, + "grad_norm": 1.7218186855316162, + "learning_rate": 4.316725726446353e-05, + "loss": 2.0774, + "step": 775 + }, + { + "epoch": 0.24, + "grad_norm": 1.356753945350647, + "learning_rate": 4.3083130771243586e-05, + "loss": 2.0847, + "step": 780 + }, + { + "epoch": 0.24, + "grad_norm": 0.9967429637908936, + "learning_rate": 4.299857260932445e-05, + "loss": 2.0485, + "step": 785 + }, + { + "epoch": 0.25, + "grad_norm": 1.6216442584991455, + "learning_rate": 4.2913584797223397e-05, + "loss": 2.1008, + "step": 790 + }, + { + "epoch": 0.25, + "grad_norm": 1.2556742429733276, + "learning_rate": 4.2828169363714016e-05, + "loss": 1.9209, + "step": 795 + }, + { + "epoch": 0.25, + "grad_norm": 1.1800439357757568, + "learning_rate": 4.274232834777782e-05, + "loss": 1.9722, + "step": 800 + }, + { + "epoch": 0.25, + "grad_norm": 1.1313499212265015, + "learning_rate": 4.2656063798555515e-05, + "loss": 1.9176, + "step": 805 + }, + { + "epoch": 0.25, + "grad_norm": 1.137534737586975, + "learning_rate": 4.256937777529815e-05, + "loss": 1.9929, + "step": 810 + }, + { + "epoch": 0.25, + "grad_norm": 1.0575093030929565, + "learning_rate": 4.2482272347317906e-05, + "loss": 2.166, + "step": 815 + }, + { + "epoch": 0.25, + "grad_norm": 1.5939594507217407, + "learning_rate": 4.2394749593938733e-05, + "loss": 2.1334, + "step": 820 + }, + { + "epoch": 0.26, + "grad_norm": 1.1045507192611694, + "learning_rate": 4.230681160444669e-05, + "loss": 2.0853, + "step": 825 + }, + { + "epoch": 0.26, + "grad_norm": 1.3480136394500732, + "learning_rate": 4.221846047804009e-05, + "loss": 2.1802, + "step": 830 + }, + { + "epoch": 0.26, + "grad_norm": 1.1822657585144043, + "learning_rate": 4.2129698323779366e-05, + "loss": 2.0739, + "step": 835 + }, + { + "epoch": 0.26, + "grad_norm": 1.1771117448806763, + "learning_rate": 4.204052726053676e-05, + "loss": 2.0238, + "step": 840 + }, + { + "epoch": 0.26, + "grad_norm": 1.4757814407348633, + "learning_rate": 4.195094941694571e-05, + "loss": 2.1557, + "step": 845 + }, + { + "epoch": 0.26, + "grad_norm": 0.9095075726509094, + "learning_rate": 4.1860966931350054e-05, + "loss": 2.1666, + "step": 850 + }, + { + "epoch": 0.27, + "grad_norm": 1.1039543151855469, + "learning_rate": 4.1770581951752976e-05, + "loss": 2.105, + "step": 855 + }, + { + "epoch": 0.27, + "grad_norm": 0.8517205119132996, + "learning_rate": 4.1679796635765735e-05, + "loss": 1.9656, + "step": 860 + }, + { + "epoch": 0.27, + "grad_norm": 1.239492654800415, + "learning_rate": 4.158861315055617e-05, + "loss": 2.0166, + "step": 865 + }, + { + "epoch": 0.27, + "grad_norm": 1.1358321905136108, + "learning_rate": 4.1497033672796924e-05, + "loss": 2.0076, + "step": 870 + }, + { + "epoch": 0.27, + "grad_norm": 1.6215249300003052, + "learning_rate": 4.140506038861356e-05, + "loss": 2.1594, + "step": 875 + }, + { + "epoch": 0.27, + "grad_norm": 1.0528080463409424, + "learning_rate": 4.131269549353229e-05, + "loss": 2.1416, + "step": 880 + }, + { + "epoch": 0.28, + "grad_norm": 0.8976901769638062, + "learning_rate": 4.1219941192427644e-05, + "loss": 2.1242, + "step": 885 + }, + { + "epoch": 0.28, + "grad_norm": 1.263594388961792, + "learning_rate": 4.112679969946977e-05, + "loss": 2.02, + "step": 890 + }, + { + "epoch": 0.28, + "grad_norm": 1.4173017740249634, + "learning_rate": 4.103327323807162e-05, + "loss": 2.0438, + "step": 895 + }, + { + "epoch": 0.28, + "grad_norm": 1.876170039176941, + 
"learning_rate": 4.093936404083585e-05, + "loss": 1.9806, + "step": 900 + }, + { + "epoch": 0.28, + "grad_norm": 1.4649231433868408, + "learning_rate": 4.0845074349501544e-05, + "loss": 2.1476, + "step": 905 + }, + { + "epoch": 0.28, + "grad_norm": 1.0446043014526367, + "learning_rate": 4.0750406414890695e-05, + "loss": 1.9672, + "step": 910 + }, + { + "epoch": 0.28, + "grad_norm": 1.0225305557250977, + "learning_rate": 4.065536249685448e-05, + "loss": 1.9984, + "step": 915 + }, + { + "epoch": 0.29, + "grad_norm": 1.0120617151260376, + "learning_rate": 4.055994486421929e-05, + "loss": 2.1162, + "step": 920 + }, + { + "epoch": 0.29, + "grad_norm": 1.0469881296157837, + "learning_rate": 4.04641557947326e-05, + "loss": 2.0435, + "step": 925 + }, + { + "epoch": 0.29, + "grad_norm": 1.2435941696166992, + "learning_rate": 4.036799757500856e-05, + "loss": 2.0431, + "step": 930 + }, + { + "epoch": 0.29, + "grad_norm": 1.0055103302001953, + "learning_rate": 4.027147250047348e-05, + "loss": 2.2021, + "step": 935 + }, + { + "epoch": 0.29, + "grad_norm": 1.1212949752807617, + "learning_rate": 4.017458287531094e-05, + "loss": 1.997, + "step": 940 + }, + { + "epoch": 0.29, + "grad_norm": 1.1048357486724854, + "learning_rate": 4.007733101240685e-05, + "loss": 1.946, + "step": 945 + }, + { + "epoch": 0.3, + "grad_norm": 1.4721689224243164, + "learning_rate": 3.997971923329426e-05, + "loss": 2.0723, + "step": 950 + }, + { + "epoch": 0.3, + "grad_norm": 1.3793156147003174, + "learning_rate": 3.988174986809783e-05, + "loss": 2.034, + "step": 955 + }, + { + "epoch": 0.3, + "grad_norm": 0.9013482928276062, + "learning_rate": 3.9783425255478355e-05, + "loss": 1.9736, + "step": 960 + }, + { + "epoch": 0.3, + "grad_norm": 0.9192422032356262, + "learning_rate": 3.968474774257682e-05, + "loss": 1.9878, + "step": 965 + }, + { + "epoch": 0.3, + "grad_norm": 1.9304206371307373, + "learning_rate": 3.9585719684958446e-05, + "loss": 2.117, + "step": 970 + }, + { + "epoch": 0.3, + "grad_norm": 1.0435137748718262, + "learning_rate": 3.948634344655639e-05, + "loss": 2.0585, + "step": 975 + }, + { + "epoch": 0.3, + "grad_norm": 1.4636590480804443, + "learning_rate": 3.938662139961538e-05, + "loss": 2.0409, + "step": 980 + }, + { + "epoch": 0.31, + "grad_norm": 1.8014529943466187, + "learning_rate": 3.928655592463508e-05, + "loss": 2.0369, + "step": 985 + }, + { + "epoch": 0.31, + "grad_norm": 1.2412620782852173, + "learning_rate": 3.918614941031319e-05, + "loss": 1.967, + "step": 990 + }, + { + "epoch": 0.31, + "grad_norm": 1.3581103086471558, + "learning_rate": 3.908540425348852e-05, + "loss": 2.0037, + "step": 995 + }, + { + "epoch": 0.31, + "grad_norm": 1.2377780675888062, + "learning_rate": 3.8984322859083725e-05, + "loss": 1.9991, + "step": 1000 + }, + { + "epoch": 0.31, + "grad_norm": 0.9209259748458862, + "learning_rate": 3.8882907640047896e-05, + "loss": 2.0448, + "step": 1005 + }, + { + "epoch": 0.31, + "grad_norm": 1.0150959491729736, + "learning_rate": 3.878116101729897e-05, + "loss": 2.0791, + "step": 1010 + }, + { + "epoch": 0.32, + "grad_norm": 1.5959141254425049, + "learning_rate": 3.867908541966594e-05, + "loss": 1.9997, + "step": 1015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3945012092590332, + "learning_rate": 3.857668328383088e-05, + "loss": 2.0481, + "step": 1020 + }, + { + "epoch": 0.32, + "grad_norm": 1.2361671924591064, + "learning_rate": 3.847395705427075e-05, + "loss": 2.2664, + "step": 1025 + }, + { + "epoch": 0.32, + "grad_norm": 1.9661719799041748, + "learning_rate": 3.837090918319909e-05, + 
"loss": 1.9752, + "step": 1030 + }, + { + "epoch": 0.32, + "grad_norm": 1.6995949745178223, + "learning_rate": 3.8267542130507436e-05, + "loss": 2.1332, + "step": 1035 + }, + { + "epoch": 0.32, + "grad_norm": 1.1248412132263184, + "learning_rate": 3.816385836370663e-05, + "loss": 2.0432, + "step": 1040 + }, + { + "epoch": 0.32, + "grad_norm": 0.8734235763549805, + "learning_rate": 3.805986035786789e-05, + "loss": 1.9618, + "step": 1045 + }, + { + "epoch": 0.33, + "grad_norm": 1.322766661643982, + "learning_rate": 3.795555059556378e-05, + "loss": 2.0267, + "step": 1050 + }, + { + "epoch": 0.33, + "grad_norm": 1.0396028757095337, + "learning_rate": 3.7850931566808866e-05, + "loss": 2.1075, + "step": 1055 + }, + { + "epoch": 0.33, + "grad_norm": 0.9574625492095947, + "learning_rate": 3.7746005769000363e-05, + "loss": 2.156, + "step": 1060 + }, + { + "epoch": 0.33, + "grad_norm": 1.4480133056640625, + "learning_rate": 3.764077570685844e-05, + "loss": 1.9615, + "step": 1065 + }, + { + "epoch": 0.33, + "grad_norm": 1.5908560752868652, + "learning_rate": 3.753524389236648e-05, + "loss": 2.0928, + "step": 1070 + }, + { + "epoch": 0.33, + "grad_norm": 1.2628813982009888, + "learning_rate": 3.742941284471111e-05, + "loss": 2.1074, + "step": 1075 + }, + { + "epoch": 0.34, + "grad_norm": 1.2687503099441528, + "learning_rate": 3.7323285090222054e-05, + "loss": 1.9666, + "step": 1080 + }, + { + "epoch": 0.34, + "grad_norm": 1.2571731805801392, + "learning_rate": 3.721686316231181e-05, + "loss": 2.0468, + "step": 1085 + }, + { + "epoch": 0.34, + "grad_norm": 1.007453441619873, + "learning_rate": 3.7110149601415215e-05, + "loss": 2.0624, + "step": 1090 + }, + { + "epoch": 0.34, + "grad_norm": 1.2390377521514893, + "learning_rate": 3.700314695492876e-05, + "loss": 1.9888, + "step": 1095 + }, + { + "epoch": 0.34, + "grad_norm": 1.0878371000289917, + "learning_rate": 3.6895857777149825e-05, + "loss": 2.1013, + "step": 1100 + }, + { + "epoch": 0.34, + "grad_norm": 0.8759217262268066, + "learning_rate": 3.6788284629215624e-05, + "loss": 1.875, + "step": 1105 + }, + { + "epoch": 0.35, + "grad_norm": 1.1345970630645752, + "learning_rate": 3.668043007904219e-05, + "loss": 1.9096, + "step": 1110 + }, + { + "epoch": 0.35, + "grad_norm": 1.253629446029663, + "learning_rate": 3.6572296701262966e-05, + "loss": 2.1859, + "step": 1115 + }, + { + "epoch": 0.35, + "grad_norm": 0.9796190857887268, + "learning_rate": 3.646388707716738e-05, + "loss": 2.2092, + "step": 1120 + }, + { + "epoch": 0.35, + "grad_norm": 1.3893767595291138, + "learning_rate": 3.635520379463926e-05, + "loss": 2.0026, + "step": 1125 + }, + { + "epoch": 0.35, + "grad_norm": 0.8778309226036072, + "learning_rate": 3.6246249448095004e-05, + "loss": 2.2112, + "step": 1130 + }, + { + "epoch": 0.35, + "grad_norm": 1.2479698657989502, + "learning_rate": 3.6137026638421696e-05, + "loss": 2.0221, + "step": 1135 + }, + { + "epoch": 0.35, + "grad_norm": 1.3813824653625488, + "learning_rate": 3.6027537972914974e-05, + "loss": 1.9106, + "step": 1140 + }, + { + "epoch": 0.36, + "grad_norm": 1.2043218612670898, + "learning_rate": 3.5917786065216826e-05, + "loss": 2.0673, + "step": 1145 + }, + { + "epoch": 0.36, + "grad_norm": 1.5337340831756592, + "learning_rate": 3.580777353525318e-05, + "loss": 2.1463, + "step": 1150 + }, + { + "epoch": 0.36, + "grad_norm": 1.155813455581665, + "learning_rate": 3.5697503009171385e-05, + "loss": 2.0255, + "step": 1155 + }, + { + "epoch": 0.36, + "grad_norm": 1.034644365310669, + "learning_rate": 3.558697711927748e-05, + "loss": 
2.1348, + "step": 1160 + }, + { + "epoch": 0.36, + "grad_norm": 1.0959795713424683, + "learning_rate": 3.54761985039734e-05, + "loss": 2.1457, + "step": 1165 + }, + { + "epoch": 0.36, + "grad_norm": 1.1938838958740234, + "learning_rate": 3.5365169807693966e-05, + "loss": 2.1256, + "step": 1170 + }, + { + "epoch": 0.37, + "grad_norm": 0.8162047863006592, + "learning_rate": 3.525389368084379e-05, + "loss": 1.9587, + "step": 1175 + }, + { + "epoch": 0.37, + "grad_norm": 0.9358930587768555, + "learning_rate": 3.514237277973393e-05, + "loss": 1.8965, + "step": 1180 + }, + { + "epoch": 0.37, + "grad_norm": 0.9210988879203796, + "learning_rate": 3.503060976651862e-05, + "loss": 1.9669, + "step": 1185 + }, + { + "epoch": 0.37, + "grad_norm": 1.4641343355178833, + "learning_rate": 3.491860730913156e-05, + "loss": 2.003, + "step": 1190 + }, + { + "epoch": 0.37, + "grad_norm": 1.2458257675170898, + "learning_rate": 3.480636808122235e-05, + "loss": 2.1487, + "step": 1195 + }, + { + "epoch": 0.37, + "grad_norm": 1.6770122051239014, + "learning_rate": 3.469389476209259e-05, + "loss": 2.0686, + "step": 1200 + }, + { + "epoch": 0.37, + "grad_norm": 0.9083845019340515, + "learning_rate": 3.458119003663199e-05, + "loss": 2.0284, + "step": 1205 + }, + { + "epoch": 0.38, + "grad_norm": 1.2679696083068848, + "learning_rate": 3.446825659525421e-05, + "loss": 2.0555, + "step": 1210 + }, + { + "epoch": 0.38, + "grad_norm": 1.3823720216751099, + "learning_rate": 3.435509713383268e-05, + "loss": 1.9375, + "step": 1215 + }, + { + "epoch": 0.38, + "grad_norm": 1.5862077474594116, + "learning_rate": 3.424171435363623e-05, + "loss": 2.0271, + "step": 1220 + }, + { + "epoch": 0.38, + "grad_norm": 2.0107533931732178, + "learning_rate": 3.412811096126461e-05, + "loss": 2.1897, + "step": 1225 + }, + { + "epoch": 0.38, + "grad_norm": 1.4544458389282227, + "learning_rate": 3.401428966858387e-05, + "loss": 1.9978, + "step": 1230 + }, + { + "epoch": 0.38, + "grad_norm": 1.188170075416565, + "learning_rate": 3.390025319266167e-05, + "loss": 2.0688, + "step": 1235 + }, + { + "epoch": 0.39, + "grad_norm": 1.1016322374343872, + "learning_rate": 3.3786004255702336e-05, + "loss": 2.0396, + "step": 1240 + }, + { + "epoch": 0.39, + "grad_norm": 1.6623334884643555, + "learning_rate": 3.3671545584981954e-05, + "loss": 1.9566, + "step": 1245 + }, + { + "epoch": 0.39, + "grad_norm": 0.9161584377288818, + "learning_rate": 3.355687991278324e-05, + "loss": 2.0474, + "step": 1250 + }, + { + "epoch": 0.39, + "grad_norm": 0.9911025166511536, + "learning_rate": 3.3442009976330305e-05, + "loss": 2.2163, + "step": 1255 + }, + { + "epoch": 0.39, + "grad_norm": 1.1504255533218384, + "learning_rate": 3.332693851772331e-05, + "loss": 2.1088, + "step": 1260 + }, + { + "epoch": 0.39, + "grad_norm": 0.9544184803962708, + "learning_rate": 3.3211668283873035e-05, + "loss": 1.8947, + "step": 1265 + }, + { + "epoch": 0.39, + "grad_norm": 1.4625756740570068, + "learning_rate": 3.3096202026435304e-05, + "loss": 2.1748, + "step": 1270 + }, + { + "epoch": 0.4, + "grad_norm": 1.3267475366592407, + "learning_rate": 3.298054250174527e-05, + "loss": 1.9218, + "step": 1275 + }, + { + "epoch": 0.4, + "grad_norm": 0.9869363903999329, + "learning_rate": 3.2864692470751654e-05, + "loss": 2.2723, + "step": 1280 + }, + { + "epoch": 0.4, + "grad_norm": 1.5177838802337646, + "learning_rate": 3.27486546989508e-05, + "loss": 2.1456, + "step": 1285 + }, + { + "epoch": 0.4, + "grad_norm": 1.1998714208602905, + "learning_rate": 3.263243195632068e-05, + "loss": 1.8877, + "step": 
1290 + }, + { + "epoch": 0.4, + "grad_norm": 1.2112164497375488, + "learning_rate": 3.2516027017254785e-05, + "loss": 2.0615, + "step": 1295 + }, + { + "epoch": 0.4, + "grad_norm": 1.0616129636764526, + "learning_rate": 3.239944266049587e-05, + "loss": 2.0402, + "step": 1300 + }, + { + "epoch": 0.41, + "grad_norm": 1.4537287950515747, + "learning_rate": 3.228268166906962e-05, + "loss": 2.0728, + "step": 1305 + }, + { + "epoch": 0.41, + "grad_norm": 1.3899391889572144, + "learning_rate": 3.2165746830218254e-05, + "loss": 2.1815, + "step": 1310 + }, + { + "epoch": 0.41, + "grad_norm": 1.332529067993164, + "learning_rate": 3.204864093533394e-05, + "loss": 1.8935, + "step": 1315 + }, + { + "epoch": 0.41, + "grad_norm": 1.4466496706008911, + "learning_rate": 3.193136677989221e-05, + "loss": 1.9567, + "step": 1320 + }, + { + "epoch": 0.41, + "grad_norm": 1.1781721115112305, + "learning_rate": 3.181392716338516e-05, + "loss": 2.055, + "step": 1325 + }, + { + "epoch": 0.41, + "grad_norm": 0.9411901831626892, + "learning_rate": 3.1696324889254716e-05, + "loss": 1.8794, + "step": 1330 + }, + { + "epoch": 0.42, + "grad_norm": 1.2628341913223267, + "learning_rate": 3.15785627648256e-05, + "loss": 2.0299, + "step": 1335 + }, + { + "epoch": 0.42, + "grad_norm": 1.4857370853424072, + "learning_rate": 3.146064360123846e-05, + "loss": 1.9342, + "step": 1340 + }, + { + "epoch": 0.42, + "grad_norm": 1.661470651626587, + "learning_rate": 3.1342570213382594e-05, + "loss": 2.0399, + "step": 1345 + }, + { + "epoch": 0.42, + "grad_norm": 1.522845983505249, + "learning_rate": 3.122434541982888e-05, + "loss": 2.1419, + "step": 1350 + }, + { + "epoch": 0.42, + "grad_norm": 1.5679118633270264, + "learning_rate": 3.110597204276247e-05, + "loss": 2.2932, + "step": 1355 + }, + { + "epoch": 0.42, + "grad_norm": 1.3367788791656494, + "learning_rate": 3.098745290791539e-05, + "loss": 1.8989, + "step": 1360 + }, + { + "epoch": 0.42, + "grad_norm": 1.3873472213745117, + "learning_rate": 3.086879084449907e-05, + "loss": 2.1214, + "step": 1365 + }, + { + "epoch": 0.43, + "grad_norm": 1.2957035303115845, + "learning_rate": 3.074998868513688e-05, + "loss": 2.2538, + "step": 1370 + }, + { + "epoch": 0.43, + "grad_norm": 1.122176170349121, + "learning_rate": 3.0631049265796465e-05, + "loss": 2.0974, + "step": 1375 + }, + { + "epoch": 0.43, + "grad_norm": 1.0422618389129639, + "learning_rate": 3.051197542572203e-05, + "loss": 2.054, + "step": 1380 + }, + { + "epoch": 0.43, + "grad_norm": 1.1926140785217285, + "learning_rate": 3.0392770007366584e-05, + "loss": 1.9798, + "step": 1385 + }, + { + "epoch": 0.43, + "grad_norm": 0.8764025568962097, + "learning_rate": 3.0273435856324112e-05, + "loss": 2.0796, + "step": 1390 + }, + { + "epoch": 0.43, + "grad_norm": 0.8200764656066895, + "learning_rate": 3.0153975821261605e-05, + "loss": 1.9116, + "step": 1395 + }, + { + "epoch": 0.44, + "grad_norm": 1.0340498685836792, + "learning_rate": 3.0034392753851066e-05, + "loss": 2.0235, + "step": 1400 + }, + { + "epoch": 0.44, + "grad_norm": 1.0799012184143066, + "learning_rate": 2.9914689508701476e-05, + "loss": 2.1455, + "step": 1405 + }, + { + "epoch": 0.44, + "grad_norm": 1.301015853881836, + "learning_rate": 2.979486894329058e-05, + "loss": 2.0355, + "step": 1410 + }, + { + "epoch": 0.44, + "grad_norm": 1.2926914691925049, + "learning_rate": 2.9674933917896747e-05, + "loss": 2.0379, + "step": 1415 + }, + { + "epoch": 0.44, + "grad_norm": 1.4712942838668823, + "learning_rate": 2.9554887295530647e-05, + "loss": 2.0802, + "step": 1420 + }, + { + 
"epoch": 0.44, + "grad_norm": 1.1957335472106934, + "learning_rate": 2.943473194186693e-05, + "loss": 2.1044, + "step": 1425 + }, + { + "epoch": 0.44, + "grad_norm": 1.568293571472168, + "learning_rate": 2.9314470725175792e-05, + "loss": 2.0121, + "step": 1430 + }, + { + "epoch": 0.45, + "grad_norm": 1.4844893217086792, + "learning_rate": 2.919410651625455e-05, + "loss": 2.0717, + "step": 1435 + }, + { + "epoch": 0.45, + "grad_norm": 1.3942641019821167, + "learning_rate": 2.907364218835904e-05, + "loss": 1.9522, + "step": 1440 + }, + { + "epoch": 0.45, + "grad_norm": 0.7795314788818359, + "learning_rate": 2.8953080617135115e-05, + "loss": 1.9593, + "step": 1445 + }, + { + "epoch": 0.45, + "grad_norm": 1.751107931137085, + "learning_rate": 2.8832424680549937e-05, + "loss": 1.8073, + "step": 1450 + }, + { + "epoch": 0.45, + "grad_norm": 1.2202279567718506, + "learning_rate": 2.8711677258823306e-05, + "loss": 2.0042, + "step": 1455 + }, + { + "epoch": 0.45, + "grad_norm": 1.5163853168487549, + "learning_rate": 2.859084123435887e-05, + "loss": 1.9931, + "step": 1460 + }, + { + "epoch": 0.46, + "grad_norm": 0.94038987159729, + "learning_rate": 2.84699194916754e-05, + "loss": 2.1533, + "step": 1465 + }, + { + "epoch": 0.46, + "grad_norm": 1.4618102312088013, + "learning_rate": 2.834891491733781e-05, + "loss": 2.029, + "step": 1470 + }, + { + "epoch": 0.46, + "grad_norm": 0.9747155904769897, + "learning_rate": 2.822783039988836e-05, + "loss": 2.0241, + "step": 1475 + }, + { + "epoch": 0.46, + "grad_norm": 1.0887038707733154, + "learning_rate": 2.8106668829777645e-05, + "loss": 2.0959, + "step": 1480 + }, + { + "epoch": 0.46, + "grad_norm": 1.2170171737670898, + "learning_rate": 2.7985433099295618e-05, + "loss": 1.8718, + "step": 1485 + }, + { + "epoch": 0.46, + "grad_norm": 1.1366883516311646, + "learning_rate": 2.7864126102502524e-05, + "loss": 2.2397, + "step": 1490 + }, + { + "epoch": 0.46, + "grad_norm": 1.1206785440444946, + "learning_rate": 2.774275073515985e-05, + "loss": 2.1083, + "step": 1495 + }, + { + "epoch": 0.47, + "grad_norm": 1.126807451248169, + "learning_rate": 2.7621309894661167e-05, + "loss": 2.0764, + "step": 1500 + }, + { + "epoch": 0.47, + "grad_norm": 1.0077627897262573, + "learning_rate": 2.7499806479962997e-05, + "loss": 2.0955, + "step": 1505 + }, + { + "epoch": 0.47, + "grad_norm": 0.9740080833435059, + "learning_rate": 2.7378243391515558e-05, + "loss": 2.0449, + "step": 1510 + }, + { + "epoch": 0.47, + "grad_norm": 1.100853681564331, + "learning_rate": 2.7256623531193605e-05, + "loss": 1.8368, + "step": 1515 + }, + { + "epoch": 0.47, + "grad_norm": 1.147560954093933, + "learning_rate": 2.7134949802227073e-05, + "loss": 2.024, + "step": 1520 + }, + { + "epoch": 0.47, + "grad_norm": 0.8977387547492981, + "learning_rate": 2.7013225109131836e-05, + "loss": 2.0699, + "step": 1525 + }, + { + "epoch": 0.48, + "grad_norm": 1.5398712158203125, + "learning_rate": 2.689145235764035e-05, + "loss": 1.953, + "step": 1530 + }, + { + "epoch": 0.48, + "step": 1534, + "total_flos": 2.060795874115584e+16, + "train_loss": 2.0602192145127516, + "train_runtime": 801.4891, + "train_samples_per_second": 64.198, + "train_steps_per_second": 4.011 + } + ], + "logging_steps": 5, + "max_steps": 3215, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 2.060795874115584e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 
0000000000000000000000000000000000000000..dbb5f752acaebe6718c05e59ced2a70ce6dfb6a0 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e80b13ff00898b4493d440a3c1a1eb234c0ae541cbca8a8b1befef97a354c9 +size 5112
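
The records above follow the Hugging Face Trainer's `trainer_state.json` layout: one entry every `logging_steps` (5) optimizer steps, each carrying `step`, `epoch`, `loss`, `learning_rate`, and `grad_norm`, with a final summary entry holding `train_loss`, `train_runtime`, and throughput. That summary stops at step 1534 of the scheduled 3215 `max_steps` (epoch 0.48), so the trace covers roughly half of the planned single epoch. Below is a minimal sketch, not part of this repository, for plotting the loss curve from one of the saved checkpoints; it assumes the standard `log_history` key written by the Trainer and an illustrative local path (`checkpoint-100/trainer_state.json`).

```python
import json
import matplotlib.pyplot as plt

# Load the state file written by the HF Trainer; every checkpoint-*/ directory
# saved with save_steps=100 contains its own copy. The path below is illustrative.
with open("checkpoint-100/trainer_state.json") as f:
    state = json.load(f)

# log_history holds one record per logging step (logging_steps=5 here).
# The final summary entry has "train_loss" instead of "loss", so filtering on
# "loss" keeps only the per-step records.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]

plt.plot(steps, losses)
plt.xlabel("optimizer step")
plt.ylabel("training loss")
plt.title("LoRA fine-tuning loss (alpaca_zh)")
plt.savefig("loss_curve.png")
```

The same loop can be pointed at `learning_rate` to check the cosine decay recorded above, or at `grad_norm` to spot outlier updates such as the 2.01 spike at step 1225.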