diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..cc1259d472ba4bdf769bb49f1d2cb573809a800c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-1086/tokenizer.json filter=lfs diff=lfs merge=lfs -text +checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..19dec661f75227aa6a7568fd3c4c31d5ca436efe --- /dev/null +++ b/README.md @@ -0,0 +1,62 @@ +--- +library_name: peft +license: other +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +tags: +- llama-factory +- lora +- generated_from_trainer +model-index: +- name: DeepSeek-R1-Distill-Qwen-32B + results: [] +--- + + + +# DeepSeek-R1-Distill-Qwen-32B + +This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on the alpaca_thinking dataset. + +## Model description + +More information needed + +## Intended uses & limitations + +More information needed + +## Training and evaluation data + +More information needed + +## Training procedure + +### Training hyperparameters + +The following hyperparameters were used during training: +- learning_rate: 0.0001 +- train_batch_size: 1 +- eval_batch_size: 8 +- seed: 42 +- distributed_type: multi-GPU +- num_devices: 4 +- gradient_accumulation_steps: 8 +- total_train_batch_size: 32 +- total_eval_batch_size: 32 +- optimizer: Use adamw_torch with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments +- lr_scheduler_type: cosine +- lr_scheduler_warmup_ratio: 0.1 +- num_epochs: 3.0 + +### Training results + + + +### Framework versions + +- PEFT 0.12.0 +- Transformers 4.49.0 +- Pytorch 2.5.1+cu124 +- Datasets 3.2.0 +- Tokenizers 0.21.0 \ No newline at end of file diff --git a/adapter_config.json b/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..5a8646ffc043b2e177162e505e4ddf95a34ee35d --- /dev/null +++ b/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "pissa_niter_16", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/adapter_model.safetensors b/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..476a84d19c4420d1d508ed6ebb6e7ce24ac1217b --- /dev/null +++ b/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2b60858f8196e9c42a81145ebed482335b25f140656f334689c5cd28feb329 +size 268555264 diff 
--git a/all_results.json b/all_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6f40b85634398802ef051edb9b192fe596a15ba6 --- /dev/null +++ b/all_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 2.9937888198757765, + "total_flos": 6.17252944434797e+18, + "train_loss": 0.3891050570797086, + "train_runtime": 6764.7829, + "train_samples_per_second": 5.14, + "train_steps_per_second": 0.161 +} \ No newline at end of file diff --git a/checkpoint-1000/README.md b/checkpoint-1000/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d823a0d9f612b6fd128ea17ebbddb8df140520ef --- /dev/null +++ b/checkpoint-1000/README.md @@ -0,0 +1,202 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.12.0 \ No newline at end of file diff --git a/checkpoint-1000/adapter_config.json b/checkpoint-1000/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..652c79f18ba64f20d9cd4fc1eff31c4b47afb1c6 --- /dev/null +++ b/checkpoint-1000/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "pissa_niter_16", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1000/adapter_model.safetensors b/checkpoint-1000/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bc3d36841ae3d8d34206aa710a640da75f6b0dfa --- /dev/null +++ b/checkpoint-1000/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e2f0472d2f83d4810a9f24f4b41cb5e2509fd4645e31a85981bfa3ddc9ac9e6 +size 268555264 diff --git a/checkpoint-1000/optimizer.pt b/checkpoint-1000/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..23ddc2178ea6bde9bccfe4ab185f2bc35c3c4494 --- /dev/null +++ b/checkpoint-1000/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858a7d2c7117fc10a925bae6ddadff03e49c6742e566c172c0005a2591dd963b +size 537626770 diff --git a/checkpoint-1000/rng_state_0.pth b/checkpoint-1000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a7c482c30381cd512ccc35fe322d8a34fbf5207 --- /dev/null +++ b/checkpoint-1000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:308f94f9a5c24e1bad5c393d56ae7af7782600f4e791d9c6ac35b22fff2105b6 +size 15024 diff --git a/checkpoint-1000/rng_state_1.pth b/checkpoint-1000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b862c21b28bbd89ce6b4fb681d41be05f175599 --- /dev/null +++ b/checkpoint-1000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b056f3c23cb32dc77a2ec9e7651e0b64e4440e21f0fdf969b86bfc56a1cbdf06 +size 
15024 diff --git a/checkpoint-1000/rng_state_2.pth b/checkpoint-1000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d86ce886844e0298f058d67065e5eeb27ffe7e48 --- /dev/null +++ b/checkpoint-1000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f8a05714bc528f4885a2816181652f2303b3e8150f89b56aaee6bec56aa520 +size 15024 diff --git a/checkpoint-1000/rng_state_3.pth b/checkpoint-1000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..10733f5da657367adf3f67760028644c0839660f --- /dev/null +++ b/checkpoint-1000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f755bd3c330281961e5c03af9d10ce8c1e1678619d384f6f1fd5fd7dce2ff50 +size 15024 diff --git a/checkpoint-1000/scheduler.pt b/checkpoint-1000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..501b4491679d616789f8e0bc3fe01e337bbc5907 --- /dev/null +++ b/checkpoint-1000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2602cbf6571d5d825f6bf2d7375d253f1e29c737b5ba79bcc221ad05bf6a6b4 +size 1064 diff --git a/checkpoint-1000/special_tokens_map.json b/checkpoint-1000/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/checkpoint-1000/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1000/tokenizer.json b/checkpoint-1000/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c --- /dev/null +++ b/checkpoint-1000/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/checkpoint-1000/tokenizer_config.json b/checkpoint-1000/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a414ab9b6f7fec711d4c1346f5847dd0d5bd0ff --- /dev/null +++ b/checkpoint-1000/tokenizer_config.json @@ -0,0 +1,197 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + 
tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 2048, + "pad_token": "<|end▁of▁sentence|>", + "padding_side": "right", + "sp_model_kwargs": {}, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-1000/trainer_state.json b/checkpoint-1000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..53fb03d060735d857fc8e921fba71d7e28b1dc1b --- /dev/null +++ b/checkpoint-1000/trainer_state.json @@ -0,0 +1,733 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.756383712905452, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.027605244996549344, + "grad_norm": 1.6335422992706299, + "learning_rate": 9.174311926605506e-06, + "loss": 0.8102, + "step": 10 + }, + { + "epoch": 0.05521048999309869, + "grad_norm": 0.8111785054206848, + "learning_rate": 1.834862385321101e-05, + "loss": 0.6999, + "step": 20 + }, + { + "epoch": 0.08281573498964803, + "grad_norm": 0.4619831144809723, + "learning_rate": 2.7522935779816515e-05, + "loss": 0.5682, + "step": 30 + }, + { + "epoch": 0.11042097998619738, + "grad_norm": 0.4434720575809479, + "learning_rate": 3.669724770642202e-05, + "loss": 0.5232, + "step": 40 + }, + { + "epoch": 0.13802622498274672, + "grad_norm": 0.44054797291755676, + "learning_rate": 4.587155963302753e-05, + "loss": 0.5084, + "step": 50 + }, + { + "epoch": 0.16563146997929606, + "grad_norm": 0.42256447672843933, + "learning_rate": 5.504587155963303e-05, + "loss": 0.477, + "step": 60 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.4349405765533447, + "learning_rate": 6.422018348623854e-05, + "loss": 0.4841, + "step": 70 + }, + { + "epoch": 0.22084195997239475, + "grad_norm": 0.4515930712223053, + "learning_rate": 7.339449541284404e-05, + "loss": 0.4704, + "step": 80 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.45412737131118774, + "learning_rate": 8.256880733944955e-05, + "loss": 0.4718, + "step": 90 + }, + { + "epoch": 0.27605244996549344, + "grad_norm": 0.49010995030403137, + "learning_rate": 9.174311926605506e-05, + "loss": 0.4496, + "step": 100 + }, + { + "epoch": 0.3036576949620428, + "grad_norm": 0.4931396245956421, + "learning_rate": 9.999974150612772e-05, + "loss": 
0.4524, + "step": 110 + }, + { + "epoch": 0.33126293995859213, + "grad_norm": 1.1270735263824463, + "learning_rate": 9.996872547536591e-05, + "loss": 0.4503, + "step": 120 + }, + { + "epoch": 0.3588681849551415, + "grad_norm": 0.48991507291793823, + "learning_rate": 9.988604741439287e-05, + "loss": 0.4399, + "step": 130 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.45801088213920593, + "learning_rate": 9.975179280300506e-05, + "loss": 0.4524, + "step": 140 + }, + { + "epoch": 0.4140786749482402, + "grad_norm": 0.420897901058197, + "learning_rate": 9.956610044533896e-05, + "loss": 0.4281, + "step": 150 + }, + { + "epoch": 0.4416839199447895, + "grad_norm": 0.4336962103843689, + "learning_rate": 9.932916232636318e-05, + "loss": 0.4305, + "step": 160 + }, + { + "epoch": 0.4692891649413389, + "grad_norm": 0.44120800495147705, + "learning_rate": 9.904122341338765e-05, + "loss": 0.4208, + "step": 170 + }, + { + "epoch": 0.4968944099378882, + "grad_norm": 0.9154078364372253, + "learning_rate": 9.870258140279503e-05, + "loss": 0.4436, + "step": 180 + }, + { + "epoch": 0.5244996549344375, + "grad_norm": 0.4551916718482971, + "learning_rate": 9.831358641225624e-05, + "loss": 0.4288, + "step": 190 + }, + { + "epoch": 0.5521048999309869, + "grad_norm": 0.4513665437698364, + "learning_rate": 9.787464061874825e-05, + "loss": 0.4384, + "step": 200 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.43779632449150085, + "learning_rate": 9.738619784274833e-05, + "loss": 0.4178, + "step": 210 + }, + { + "epoch": 0.6073153899240856, + "grad_norm": 0.4170076847076416, + "learning_rate": 9.684876307903494e-05, + "loss": 0.42, + "step": 220 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.4370488226413727, + "learning_rate": 9.626289197457994e-05, + "loss": 0.4296, + "step": 230 + }, + { + "epoch": 0.6625258799171843, + "grad_norm": 0.42547333240509033, + "learning_rate": 9.562919025407236e-05, + "loss": 0.4264, + "step": 240 + }, + { + "epoch": 0.6901311249137336, + "grad_norm": 0.4317057430744171, + "learning_rate": 9.494831309366723e-05, + "loss": 0.4052, + "step": 250 + }, + { + "epoch": 0.717736369910283, + "grad_norm": 0.40589675307273865, + "learning_rate": 9.422096444360735e-05, + "loss": 0.41, + "step": 260 + }, + { + "epoch": 0.7453416149068323, + "grad_norm": 0.44671744108200073, + "learning_rate": 9.34478963004181e-05, + "loss": 0.4162, + "step": 270 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.41162508726119995, + "learning_rate": 9.262990792942768e-05, + "loss": 0.4183, + "step": 280 + }, + { + "epoch": 0.800552104899931, + "grad_norm": 0.483149915933609, + "learning_rate": 9.176784503841697e-05, + "loss": 0.4174, + "step": 290 + }, + { + "epoch": 0.8281573498964804, + "grad_norm": 0.4605332612991333, + "learning_rate": 9.086259890325297e-05, + "loss": 0.4191, + "step": 300 + }, + { + "epoch": 0.8557625948930296, + "grad_norm": 0.4153307378292084, + "learning_rate": 8.991510544640991e-05, + "loss": 0.4253, + "step": 310 + }, + { + "epoch": 0.883367839889579, + "grad_norm": 0.43806084990501404, + "learning_rate": 8.892634426933106e-05, + "loss": 0.4265, + "step": 320 + }, + { + "epoch": 0.9109730848861284, + "grad_norm": 0.45412200689315796, + "learning_rate": 8.78973376396311e-05, + "loss": 0.4365, + "step": 330 + }, + { + "epoch": 0.9385783298826778, + "grad_norm": 0.3769752085208893, + "learning_rate": 8.682914943418676e-05, + "loss": 0.4058, + "step": 340 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 0.4275883436203003, + "learning_rate": 
8.572288403920792e-05, + "loss": 0.4078, + "step": 350 + }, + { + "epoch": 0.9937888198757764, + "grad_norm": 0.43371307849884033, + "learning_rate": 8.45796852084268e-05, + "loss": 0.4063, + "step": 360 + }, + { + "epoch": 1.0193236714975846, + "grad_norm": 0.4527032673358917, + "learning_rate": 8.340073488058552e-05, + "loss": 0.3742, + "step": 370 + }, + { + "epoch": 1.0469289164941338, + "grad_norm": 0.5205631256103516, + "learning_rate": 8.218725195744463e-05, + "loss": 0.3809, + "step": 380 + }, + { + "epoch": 1.0745341614906831, + "grad_norm": 0.4031950533390045, + "learning_rate": 8.094049104357609e-05, + "loss": 0.3823, + "step": 390 + }, + { + "epoch": 1.1021394064872325, + "grad_norm": 0.41949087381362915, + "learning_rate": 7.966174114924351e-05, + "loss": 0.3765, + "step": 400 + }, + { + "epoch": 1.129744651483782, + "grad_norm": 0.43814027309417725, + "learning_rate": 7.83523243577109e-05, + "loss": 0.3751, + "step": 410 + }, + { + "epoch": 1.1573498964803313, + "grad_norm": 0.4457204341888428, + "learning_rate": 7.70135944583575e-05, + "loss": 0.3869, + "step": 420 + }, + { + "epoch": 1.1849551414768806, + "grad_norm": 0.41421836614608765, + "learning_rate": 7.56469355470122e-05, + "loss": 0.3634, + "step": 430 + }, + { + "epoch": 1.21256038647343, + "grad_norm": 0.4416670799255371, + "learning_rate": 7.425376059495442e-05, + "loss": 0.3768, + "step": 440 + }, + { + "epoch": 1.2401656314699794, + "grad_norm": 0.44710710644721985, + "learning_rate": 7.283550998806108e-05, + "loss": 0.3669, + "step": 450 + }, + { + "epoch": 1.2677708764665288, + "grad_norm": 0.39852890372276306, + "learning_rate": 7.139365003760999e-05, + "loss": 0.3824, + "step": 460 + }, + { + "epoch": 1.295376121463078, + "grad_norm": 0.4412725269794464, + "learning_rate": 6.992967146427913e-05, + "loss": 0.3646, + "step": 470 + }, + { + "epoch": 1.3229813664596273, + "grad_norm": 0.41978228092193604, + "learning_rate": 6.844508785690964e-05, + "loss": 0.3755, + "step": 480 + }, + { + "epoch": 1.3505866114561766, + "grad_norm": 0.4214731752872467, + "learning_rate": 6.694143410762542e-05, + "loss": 0.3841, + "step": 490 + }, + { + "epoch": 1.378191856452726, + "grad_norm": 0.4128514230251312, + "learning_rate": 6.54202648249278e-05, + "loss": 0.3839, + "step": 500 + }, + { + "epoch": 1.4057971014492754, + "grad_norm": 0.3899001181125641, + "learning_rate": 6.388315272640544e-05, + "loss": 0.3726, + "step": 510 + }, + { + "epoch": 1.4334023464458248, + "grad_norm": 0.4347754120826721, + "learning_rate": 6.233168701272167e-05, + "loss": 0.3722, + "step": 520 + }, + { + "epoch": 1.4610075914423741, + "grad_norm": 0.3798378109931946, + "learning_rate": 6.076747172456015e-05, + "loss": 0.3623, + "step": 530 + }, + { + "epoch": 1.4886128364389233, + "grad_norm": 0.3879692256450653, + "learning_rate": 5.919212408422753e-05, + "loss": 0.3684, + "step": 540 + }, + { + "epoch": 1.5162180814354729, + "grad_norm": 0.4210754930973053, + "learning_rate": 5.76072728236279e-05, + "loss": 0.3674, + "step": 550 + }, + { + "epoch": 1.543823326432022, + "grad_norm": 0.4184245467185974, + "learning_rate": 5.6014556500337534e-05, + "loss": 0.3602, + "step": 560 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.43027910590171814, + "learning_rate": 5.44156218035211e-05, + "loss": 0.3872, + "step": 570 + }, + { + "epoch": 1.5990338164251208, + "grad_norm": 0.38721945881843567, + "learning_rate": 5.28121218514406e-05, + "loss": 0.3678, + "step": 580 + }, + { + "epoch": 1.6266390614216701, + "grad_norm": 0.4199799597263336, + 
"learning_rate": 5.1205714482317455e-05, + "loss": 0.3652, + "step": 590 + }, + { + "epoch": 1.6542443064182195, + "grad_norm": 0.40728333592414856, + "learning_rate": 4.95980605403146e-05, + "loss": 0.3786, + "step": 600 + }, + { + "epoch": 1.6818495514147687, + "grad_norm": 0.41107377409935, + "learning_rate": 4.79908221584108e-05, + "loss": 0.3715, + "step": 610 + }, + { + "epoch": 1.7094547964113183, + "grad_norm": 0.45491889119148254, + "learning_rate": 4.638566103994258e-05, + "loss": 0.386, + "step": 620 + }, + { + "epoch": 1.7370600414078674, + "grad_norm": 0.4167945683002472, + "learning_rate": 4.478423674059015e-05, + "loss": 0.3723, + "step": 630 + }, + { + "epoch": 1.764665286404417, + "grad_norm": 0.4188650846481323, + "learning_rate": 4.318820495258396e-05, + "loss": 0.3794, + "step": 640 + }, + { + "epoch": 1.7922705314009661, + "grad_norm": 0.45200666785240173, + "learning_rate": 4.159921579290546e-05, + "loss": 0.3641, + "step": 650 + }, + { + "epoch": 1.8198757763975155, + "grad_norm": 0.42524534463882446, + "learning_rate": 4.0018912097252234e-05, + "loss": 0.3727, + "step": 660 + }, + { + "epoch": 1.847481021394065, + "grad_norm": 0.4238753318786621, + "learning_rate": 3.8448927721530967e-05, + "loss": 0.3666, + "step": 670 + }, + { + "epoch": 1.8750862663906143, + "grad_norm": 0.3949458599090576, + "learning_rate": 3.6890885852634635e-05, + "loss": 0.3707, + "step": 680 + }, + { + "epoch": 1.9026915113871636, + "grad_norm": 0.4040445387363434, + "learning_rate": 3.534639733025017e-05, + "loss": 0.3793, + "step": 690 + }, + { + "epoch": 1.9302967563837128, + "grad_norm": 0.42878955602645874, + "learning_rate": 3.3817058981431784e-05, + "loss": 0.3623, + "step": 700 + }, + { + "epoch": 1.9579020013802624, + "grad_norm": 0.42626291513442993, + "learning_rate": 3.230445196966181e-05, + "loss": 0.3564, + "step": 710 + }, + { + "epoch": 1.9855072463768115, + "grad_norm": 0.43052035570144653, + "learning_rate": 3.081014016010584e-05, + "loss": 0.3681, + "step": 720 + }, + { + "epoch": 2.0110420979986197, + "grad_norm": 0.4627828896045685, + "learning_rate": 2.9335668502752394e-05, + "loss": 0.359, + "step": 730 + }, + { + "epoch": 2.0386473429951693, + "grad_norm": 0.45345333218574524, + "learning_rate": 2.7882561435108824e-05, + "loss": 0.3189, + "step": 740 + }, + { + "epoch": 2.0662525879917184, + "grad_norm": 0.40497517585754395, + "learning_rate": 2.6452321306104634e-05, + "loss": 0.3409, + "step": 750 + }, + { + "epoch": 2.0938578329882676, + "grad_norm": 0.4666087329387665, + "learning_rate": 2.5046426822832175e-05, + "loss": 0.3354, + "step": 760 + }, + { + "epoch": 2.121463077984817, + "grad_norm": 0.38220757246017456, + "learning_rate": 2.3666331521730024e-05, + "loss": 0.3366, + "step": 770 + }, + { + "epoch": 2.1490683229813663, + "grad_norm": 0.4605223536491394, + "learning_rate": 2.2313462265790196e-05, + "loss": 0.3231, + "step": 780 + }, + { + "epoch": 2.176673567977916, + "grad_norm": 0.558403730392456, + "learning_rate": 2.098921776934269e-05, + "loss": 0.3333, + "step": 790 + }, + { + "epoch": 2.204278812974465, + "grad_norm": 0.45217105746269226, + "learning_rate": 1.96949671519424e-05, + "loss": 0.3401, + "step": 800 + }, + { + "epoch": 2.2318840579710146, + "grad_norm": 0.4413389563560486, + "learning_rate": 1.843204852285389e-05, + "loss": 0.3453, + "step": 810 + }, + { + "epoch": 2.259489302967564, + "grad_norm": 0.3977566063404083, + "learning_rate": 1.7201767597597196e-05, + "loss": 0.338, + "step": 820 + }, + { + "epoch": 2.287094547964113, + 
"grad_norm": 0.4817161560058594, + "learning_rate": 1.60053963479852e-05, + "loss": 0.3334, + "step": 830 + }, + { + "epoch": 2.3146997929606625, + "grad_norm": 0.4438902735710144, + "learning_rate": 1.4844171687048058e-05, + "loss": 0.3359, + "step": 840 + }, + { + "epoch": 2.3423050379572117, + "grad_norm": 0.45830076932907104, + "learning_rate": 1.371929419020459e-05, + "loss": 0.3534, + "step": 850 + }, + { + "epoch": 2.3699102829537613, + "grad_norm": 0.48253732919692993, + "learning_rate": 1.2631926854002574e-05, + "loss": 0.3247, + "step": 860 + }, + { + "epoch": 2.3975155279503104, + "grad_norm": 0.4572385549545288, + "learning_rate": 1.1583193893711475e-05, + "loss": 0.3309, + "step": 870 + }, + { + "epoch": 2.42512077294686, + "grad_norm": 0.4570174217224121, + "learning_rate": 1.0574179581010468e-05, + "loss": 0.3408, + "step": 880 + }, + { + "epoch": 2.452726017943409, + "grad_norm": 0.5289928913116455, + "learning_rate": 9.60592712297379e-06, + "loss": 0.3338, + "step": 890 + }, + { + "epoch": 2.4803312629399588, + "grad_norm": 0.49394240975379944, + "learning_rate": 8.679437583512168e-06, + "loss": 0.3398, + "step": 900 + }, + { + "epoch": 2.507936507936508, + "grad_norm": 0.412822425365448, + "learning_rate": 7.795668848385623e-06, + "loss": 0.333, + "step": 910 + }, + { + "epoch": 2.5355417529330575, + "grad_norm": 0.4305315911769867, + "learning_rate": 6.95553463485748e-06, + "loss": 0.342, + "step": 920 + }, + { + "epoch": 2.5631469979296067, + "grad_norm": 0.43158090114593506, + "learning_rate": 6.159903547013746e-06, + "loss": 0.3335, + "step": 930 + }, + { + "epoch": 2.590752242926156, + "grad_norm": 0.4319579005241394, + "learning_rate": 5.409598177724401e-06, + "loss": 0.3426, + "step": 940 + }, + { + "epoch": 2.6183574879227054, + "grad_norm": 0.4702156186103821, + "learning_rate": 4.7053942581750385e-06, + "loss": 0.3463, + "step": 950 + }, + { + "epoch": 2.6459627329192545, + "grad_norm": 0.38157370686531067, + "learning_rate": 4.048019855848273e-06, + "loss": 0.3331, + "step": 960 + }, + { + "epoch": 2.673567977915804, + "grad_norm": 0.4141283631324768, + "learning_rate": 3.438154621784029e-06, + "loss": 0.3422, + "step": 970 + }, + { + "epoch": 2.7011732229123533, + "grad_norm": 0.42628729343414307, + "learning_rate": 2.8764290878969756e-06, + "loss": 0.3262, + "step": 980 + }, + { + "epoch": 2.728778467908903, + "grad_norm": 0.4850899577140808, + "learning_rate": 2.3634240150775646e-06, + "loss": 0.3303, + "step": 990 + }, + { + "epoch": 2.756383712905452, + "grad_norm": 0.4277842938899994, + "learning_rate": 1.8996697927507468e-06, + "loss": 0.3446, + "step": 1000 + } + ], + "logging_steps": 10, + "max_steps": 1086, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.683655210640081e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1000/training_args.bin b/checkpoint-1000/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd86c75750949f0ca2ee56bc27dadb57430a90de --- /dev/null +++ b/checkpoint-1000/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52a6484c213110d668e89b1ff8d77bac863e0460a3e92ff200a8df3f14879a5 +size 5688 diff --git a/checkpoint-1086/README.md 
b/checkpoint-1086/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d823a0d9f612b6fd128ea17ebbddb8df140520ef --- /dev/null +++ b/checkpoint-1086/README.md @@ -0,0 +1,202 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.12.0 \ No newline at end of file diff --git a/checkpoint-1086/adapter_config.json b/checkpoint-1086/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..652c79f18ba64f20d9cd4fc1eff31c4b47afb1c6 --- /dev/null +++ b/checkpoint-1086/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "pissa_niter_16", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-1086/adapter_model.safetensors b/checkpoint-1086/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..476a84d19c4420d1d508ed6ebb6e7ce24ac1217b --- /dev/null +++ b/checkpoint-1086/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2b60858f8196e9c42a81145ebed482335b25f140656f334689c5cd28feb329 +size 268555264 diff --git a/checkpoint-1086/optimizer.pt b/checkpoint-1086/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f838dc40133181a49acc50142de136902a66da8 --- /dev/null +++ b/checkpoint-1086/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe095d3033abf7b29e4e0ba117b60632305190fb88b5c72cb944e0725c597755 +size 537626770 diff --git a/checkpoint-1086/rng_state_0.pth b/checkpoint-1086/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..5a7c482c30381cd512ccc35fe322d8a34fbf5207 --- /dev/null +++ b/checkpoint-1086/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:308f94f9a5c24e1bad5c393d56ae7af7782600f4e791d9c6ac35b22fff2105b6 +size 15024 diff --git a/checkpoint-1086/rng_state_1.pth b/checkpoint-1086/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..7b862c21b28bbd89ce6b4fb681d41be05f175599 --- /dev/null +++ b/checkpoint-1086/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b056f3c23cb32dc77a2ec9e7651e0b64e4440e21f0fdf969b86bfc56a1cbdf06 +size 
15024 diff --git a/checkpoint-1086/rng_state_2.pth b/checkpoint-1086/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..d86ce886844e0298f058d67065e5eeb27ffe7e48 --- /dev/null +++ b/checkpoint-1086/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f8a05714bc528f4885a2816181652f2303b3e8150f89b56aaee6bec56aa520 +size 15024 diff --git a/checkpoint-1086/rng_state_3.pth b/checkpoint-1086/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..10733f5da657367adf3f67760028644c0839660f --- /dev/null +++ b/checkpoint-1086/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f755bd3c330281961e5c03af9d10ce8c1e1678619d384f6f1fd5fd7dce2ff50 +size 15024 diff --git a/checkpoint-1086/scheduler.pt b/checkpoint-1086/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..51a4f7baffe19964a4cdff00ef358614ffef4608 --- /dev/null +++ b/checkpoint-1086/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2ec3825cfc68e7c0813fa8f1ff9533ce9c3275fb09281f03c2130ac24e05f2d +size 1064 diff --git a/checkpoint-1086/special_tokens_map.json b/checkpoint-1086/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/checkpoint-1086/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-1086/tokenizer.json b/checkpoint-1086/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c --- /dev/null +++ b/checkpoint-1086/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/checkpoint-1086/tokenizer_config.json b/checkpoint-1086/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a414ab9b6f7fec711d4c1346f5847dd0d5bd0ff --- /dev/null +++ b/checkpoint-1086/tokenizer_config.json @@ -0,0 +1,197 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": 
false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + 
tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 2048, + "pad_token": "<|end▁of▁sentence|>", + "padding_side": "right", + "sp_model_kwargs": {}, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-1086/trainer_state.json b/checkpoint-1086/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..9dff5814e4b39dc5512660ab9005aaf49076d2a6 --- /dev/null +++ b/checkpoint-1086/trainer_state.json @@ -0,0 +1,789 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9937888198757765, + "eval_steps": 500, + "global_step": 1086, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.027605244996549344, + "grad_norm": 1.6335422992706299, + "learning_rate": 9.174311926605506e-06, + "loss": 0.8102, + "step": 10 + }, + { + "epoch": 0.05521048999309869, + "grad_norm": 0.8111785054206848, + "learning_rate": 1.834862385321101e-05, + "loss": 0.6999, + "step": 20 + }, + { + "epoch": 0.08281573498964803, + "grad_norm": 0.4619831144809723, + "learning_rate": 2.7522935779816515e-05, + "loss": 0.5682, + "step": 30 + }, + { + "epoch": 0.11042097998619738, + "grad_norm": 0.4434720575809479, + "learning_rate": 3.669724770642202e-05, + "loss": 0.5232, + "step": 40 + }, + { + "epoch": 0.13802622498274672, + "grad_norm": 0.44054797291755676, + "learning_rate": 4.587155963302753e-05, + "loss": 0.5084, + "step": 50 + }, + { + "epoch": 0.16563146997929606, + "grad_norm": 0.42256447672843933, + "learning_rate": 5.504587155963303e-05, + "loss": 0.477, + "step": 60 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.4349405765533447, + "learning_rate": 6.422018348623854e-05, + "loss": 0.4841, + "step": 70 + }, + { + "epoch": 0.22084195997239475, + "grad_norm": 0.4515930712223053, + "learning_rate": 7.339449541284404e-05, + "loss": 0.4704, + "step": 80 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.45412737131118774, + "learning_rate": 8.256880733944955e-05, + "loss": 0.4718, + "step": 90 + }, + { + "epoch": 0.27605244996549344, + "grad_norm": 0.49010995030403137, + "learning_rate": 9.174311926605506e-05, + "loss": 0.4496, + "step": 100 + }, + { + "epoch": 0.3036576949620428, + "grad_norm": 0.4931396245956421, + "learning_rate": 9.999974150612772e-05, + "loss": 
0.4524, + "step": 110 + }, + { + "epoch": 0.33126293995859213, + "grad_norm": 1.1270735263824463, + "learning_rate": 9.996872547536591e-05, + "loss": 0.4503, + "step": 120 + }, + { + "epoch": 0.3588681849551415, + "grad_norm": 0.48991507291793823, + "learning_rate": 9.988604741439287e-05, + "loss": 0.4399, + "step": 130 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.45801088213920593, + "learning_rate": 9.975179280300506e-05, + "loss": 0.4524, + "step": 140 + }, + { + "epoch": 0.4140786749482402, + "grad_norm": 0.420897901058197, + "learning_rate": 9.956610044533896e-05, + "loss": 0.4281, + "step": 150 + }, + { + "epoch": 0.4416839199447895, + "grad_norm": 0.4336962103843689, + "learning_rate": 9.932916232636318e-05, + "loss": 0.4305, + "step": 160 + }, + { + "epoch": 0.4692891649413389, + "grad_norm": 0.44120800495147705, + "learning_rate": 9.904122341338765e-05, + "loss": 0.4208, + "step": 170 + }, + { + "epoch": 0.4968944099378882, + "grad_norm": 0.9154078364372253, + "learning_rate": 9.870258140279503e-05, + "loss": 0.4436, + "step": 180 + }, + { + "epoch": 0.5244996549344375, + "grad_norm": 0.4551916718482971, + "learning_rate": 9.831358641225624e-05, + "loss": 0.4288, + "step": 190 + }, + { + "epoch": 0.5521048999309869, + "grad_norm": 0.4513665437698364, + "learning_rate": 9.787464061874825e-05, + "loss": 0.4384, + "step": 200 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.43779632449150085, + "learning_rate": 9.738619784274833e-05, + "loss": 0.4178, + "step": 210 + }, + { + "epoch": 0.6073153899240856, + "grad_norm": 0.4170076847076416, + "learning_rate": 9.684876307903494e-05, + "loss": 0.42, + "step": 220 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.4370488226413727, + "learning_rate": 9.626289197457994e-05, + "loss": 0.4296, + "step": 230 + }, + { + "epoch": 0.6625258799171843, + "grad_norm": 0.42547333240509033, + "learning_rate": 9.562919025407236e-05, + "loss": 0.4264, + "step": 240 + }, + { + "epoch": 0.6901311249137336, + "grad_norm": 0.4317057430744171, + "learning_rate": 9.494831309366723e-05, + "loss": 0.4052, + "step": 250 + }, + { + "epoch": 0.717736369910283, + "grad_norm": 0.40589675307273865, + "learning_rate": 9.422096444360735e-05, + "loss": 0.41, + "step": 260 + }, + { + "epoch": 0.7453416149068323, + "grad_norm": 0.44671744108200073, + "learning_rate": 9.34478963004181e-05, + "loss": 0.4162, + "step": 270 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.41162508726119995, + "learning_rate": 9.262990792942768e-05, + "loss": 0.4183, + "step": 280 + }, + { + "epoch": 0.800552104899931, + "grad_norm": 0.483149915933609, + "learning_rate": 9.176784503841697e-05, + "loss": 0.4174, + "step": 290 + }, + { + "epoch": 0.8281573498964804, + "grad_norm": 0.4605332612991333, + "learning_rate": 9.086259890325297e-05, + "loss": 0.4191, + "step": 300 + }, + { + "epoch": 0.8557625948930296, + "grad_norm": 0.4153307378292084, + "learning_rate": 8.991510544640991e-05, + "loss": 0.4253, + "step": 310 + }, + { + "epoch": 0.883367839889579, + "grad_norm": 0.43806084990501404, + "learning_rate": 8.892634426933106e-05, + "loss": 0.4265, + "step": 320 + }, + { + "epoch": 0.9109730848861284, + "grad_norm": 0.45412200689315796, + "learning_rate": 8.78973376396311e-05, + "loss": 0.4365, + "step": 330 + }, + { + "epoch": 0.9385783298826778, + "grad_norm": 0.3769752085208893, + "learning_rate": 8.682914943418676e-05, + "loss": 0.4058, + "step": 340 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 0.4275883436203003, + "learning_rate": 
8.572288403920792e-05, + "loss": 0.4078, + "step": 350 + }, + { + "epoch": 0.9937888198757764, + "grad_norm": 0.43371307849884033, + "learning_rate": 8.45796852084268e-05, + "loss": 0.4063, + "step": 360 + }, + { + "epoch": 1.0193236714975846, + "grad_norm": 0.4527032673358917, + "learning_rate": 8.340073488058552e-05, + "loss": 0.3742, + "step": 370 + }, + { + "epoch": 1.0469289164941338, + "grad_norm": 0.5205631256103516, + "learning_rate": 8.218725195744463e-05, + "loss": 0.3809, + "step": 380 + }, + { + "epoch": 1.0745341614906831, + "grad_norm": 0.4031950533390045, + "learning_rate": 8.094049104357609e-05, + "loss": 0.3823, + "step": 390 + }, + { + "epoch": 1.1021394064872325, + "grad_norm": 0.41949087381362915, + "learning_rate": 7.966174114924351e-05, + "loss": 0.3765, + "step": 400 + }, + { + "epoch": 1.129744651483782, + "grad_norm": 0.43814027309417725, + "learning_rate": 7.83523243577109e-05, + "loss": 0.3751, + "step": 410 + }, + { + "epoch": 1.1573498964803313, + "grad_norm": 0.4457204341888428, + "learning_rate": 7.70135944583575e-05, + "loss": 0.3869, + "step": 420 + }, + { + "epoch": 1.1849551414768806, + "grad_norm": 0.41421836614608765, + "learning_rate": 7.56469355470122e-05, + "loss": 0.3634, + "step": 430 + }, + { + "epoch": 1.21256038647343, + "grad_norm": 0.4416670799255371, + "learning_rate": 7.425376059495442e-05, + "loss": 0.3768, + "step": 440 + }, + { + "epoch": 1.2401656314699794, + "grad_norm": 0.44710710644721985, + "learning_rate": 7.283550998806108e-05, + "loss": 0.3669, + "step": 450 + }, + { + "epoch": 1.2677708764665288, + "grad_norm": 0.39852890372276306, + "learning_rate": 7.139365003760999e-05, + "loss": 0.3824, + "step": 460 + }, + { + "epoch": 1.295376121463078, + "grad_norm": 0.4412725269794464, + "learning_rate": 6.992967146427913e-05, + "loss": 0.3646, + "step": 470 + }, + { + "epoch": 1.3229813664596273, + "grad_norm": 0.41978228092193604, + "learning_rate": 6.844508785690964e-05, + "loss": 0.3755, + "step": 480 + }, + { + "epoch": 1.3505866114561766, + "grad_norm": 0.4214731752872467, + "learning_rate": 6.694143410762542e-05, + "loss": 0.3841, + "step": 490 + }, + { + "epoch": 1.378191856452726, + "grad_norm": 0.4128514230251312, + "learning_rate": 6.54202648249278e-05, + "loss": 0.3839, + "step": 500 + }, + { + "epoch": 1.4057971014492754, + "grad_norm": 0.3899001181125641, + "learning_rate": 6.388315272640544e-05, + "loss": 0.3726, + "step": 510 + }, + { + "epoch": 1.4334023464458248, + "grad_norm": 0.4347754120826721, + "learning_rate": 6.233168701272167e-05, + "loss": 0.3722, + "step": 520 + }, + { + "epoch": 1.4610075914423741, + "grad_norm": 0.3798378109931946, + "learning_rate": 6.076747172456015e-05, + "loss": 0.3623, + "step": 530 + }, + { + "epoch": 1.4886128364389233, + "grad_norm": 0.3879692256450653, + "learning_rate": 5.919212408422753e-05, + "loss": 0.3684, + "step": 540 + }, + { + "epoch": 1.5162180814354729, + "grad_norm": 0.4210754930973053, + "learning_rate": 5.76072728236279e-05, + "loss": 0.3674, + "step": 550 + }, + { + "epoch": 1.543823326432022, + "grad_norm": 0.4184245467185974, + "learning_rate": 5.6014556500337534e-05, + "loss": 0.3602, + "step": 560 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.43027910590171814, + "learning_rate": 5.44156218035211e-05, + "loss": 0.3872, + "step": 570 + }, + { + "epoch": 1.5990338164251208, + "grad_norm": 0.38721945881843567, + "learning_rate": 5.28121218514406e-05, + "loss": 0.3678, + "step": 580 + }, + { + "epoch": 1.6266390614216701, + "grad_norm": 0.4199799597263336, + 
"learning_rate": 5.1205714482317455e-05, + "loss": 0.3652, + "step": 590 + }, + { + "epoch": 1.6542443064182195, + "grad_norm": 0.40728333592414856, + "learning_rate": 4.95980605403146e-05, + "loss": 0.3786, + "step": 600 + }, + { + "epoch": 1.6818495514147687, + "grad_norm": 0.41107377409935, + "learning_rate": 4.79908221584108e-05, + "loss": 0.3715, + "step": 610 + }, + { + "epoch": 1.7094547964113183, + "grad_norm": 0.45491889119148254, + "learning_rate": 4.638566103994258e-05, + "loss": 0.386, + "step": 620 + }, + { + "epoch": 1.7370600414078674, + "grad_norm": 0.4167945683002472, + "learning_rate": 4.478423674059015e-05, + "loss": 0.3723, + "step": 630 + }, + { + "epoch": 1.764665286404417, + "grad_norm": 0.4188650846481323, + "learning_rate": 4.318820495258396e-05, + "loss": 0.3794, + "step": 640 + }, + { + "epoch": 1.7922705314009661, + "grad_norm": 0.45200666785240173, + "learning_rate": 4.159921579290546e-05, + "loss": 0.3641, + "step": 650 + }, + { + "epoch": 1.8198757763975155, + "grad_norm": 0.42524534463882446, + "learning_rate": 4.0018912097252234e-05, + "loss": 0.3727, + "step": 660 + }, + { + "epoch": 1.847481021394065, + "grad_norm": 0.4238753318786621, + "learning_rate": 3.8448927721530967e-05, + "loss": 0.3666, + "step": 670 + }, + { + "epoch": 1.8750862663906143, + "grad_norm": 0.3949458599090576, + "learning_rate": 3.6890885852634635e-05, + "loss": 0.3707, + "step": 680 + }, + { + "epoch": 1.9026915113871636, + "grad_norm": 0.4040445387363434, + "learning_rate": 3.534639733025017e-05, + "loss": 0.3793, + "step": 690 + }, + { + "epoch": 1.9302967563837128, + "grad_norm": 0.42878955602645874, + "learning_rate": 3.3817058981431784e-05, + "loss": 0.3623, + "step": 700 + }, + { + "epoch": 1.9579020013802624, + "grad_norm": 0.42626291513442993, + "learning_rate": 3.230445196966181e-05, + "loss": 0.3564, + "step": 710 + }, + { + "epoch": 1.9855072463768115, + "grad_norm": 0.43052035570144653, + "learning_rate": 3.081014016010584e-05, + "loss": 0.3681, + "step": 720 + }, + { + "epoch": 2.0110420979986197, + "grad_norm": 0.4627828896045685, + "learning_rate": 2.9335668502752394e-05, + "loss": 0.359, + "step": 730 + }, + { + "epoch": 2.0386473429951693, + "grad_norm": 0.45345333218574524, + "learning_rate": 2.7882561435108824e-05, + "loss": 0.3189, + "step": 740 + }, + { + "epoch": 2.0662525879917184, + "grad_norm": 0.40497517585754395, + "learning_rate": 2.6452321306104634e-05, + "loss": 0.3409, + "step": 750 + }, + { + "epoch": 2.0938578329882676, + "grad_norm": 0.4666087329387665, + "learning_rate": 2.5046426822832175e-05, + "loss": 0.3354, + "step": 760 + }, + { + "epoch": 2.121463077984817, + "grad_norm": 0.38220757246017456, + "learning_rate": 2.3666331521730024e-05, + "loss": 0.3366, + "step": 770 + }, + { + "epoch": 2.1490683229813663, + "grad_norm": 0.4605223536491394, + "learning_rate": 2.2313462265790196e-05, + "loss": 0.3231, + "step": 780 + }, + { + "epoch": 2.176673567977916, + "grad_norm": 0.558403730392456, + "learning_rate": 2.098921776934269e-05, + "loss": 0.3333, + "step": 790 + }, + { + "epoch": 2.204278812974465, + "grad_norm": 0.45217105746269226, + "learning_rate": 1.96949671519424e-05, + "loss": 0.3401, + "step": 800 + }, + { + "epoch": 2.2318840579710146, + "grad_norm": 0.4413389563560486, + "learning_rate": 1.843204852285389e-05, + "loss": 0.3453, + "step": 810 + }, + { + "epoch": 2.259489302967564, + "grad_norm": 0.3977566063404083, + "learning_rate": 1.7201767597597196e-05, + "loss": 0.338, + "step": 820 + }, + { + "epoch": 2.287094547964113, + 
"grad_norm": 0.4817161560058594, + "learning_rate": 1.60053963479852e-05, + "loss": 0.3334, + "step": 830 + }, + { + "epoch": 2.3146997929606625, + "grad_norm": 0.4438902735710144, + "learning_rate": 1.4844171687048058e-05, + "loss": 0.3359, + "step": 840 + }, + { + "epoch": 2.3423050379572117, + "grad_norm": 0.45830076932907104, + "learning_rate": 1.371929419020459e-05, + "loss": 0.3534, + "step": 850 + }, + { + "epoch": 2.3699102829537613, + "grad_norm": 0.48253732919692993, + "learning_rate": 1.2631926854002574e-05, + "loss": 0.3247, + "step": 860 + }, + { + "epoch": 2.3975155279503104, + "grad_norm": 0.4572385549545288, + "learning_rate": 1.1583193893711475e-05, + "loss": 0.3309, + "step": 870 + }, + { + "epoch": 2.42512077294686, + "grad_norm": 0.4570174217224121, + "learning_rate": 1.0574179581010468e-05, + "loss": 0.3408, + "step": 880 + }, + { + "epoch": 2.452726017943409, + "grad_norm": 0.5289928913116455, + "learning_rate": 9.60592712297379e-06, + "loss": 0.3338, + "step": 890 + }, + { + "epoch": 2.4803312629399588, + "grad_norm": 0.49394240975379944, + "learning_rate": 8.679437583512168e-06, + "loss": 0.3398, + "step": 900 + }, + { + "epoch": 2.507936507936508, + "grad_norm": 0.412822425365448, + "learning_rate": 7.795668848385623e-06, + "loss": 0.333, + "step": 910 + }, + { + "epoch": 2.5355417529330575, + "grad_norm": 0.4305315911769867, + "learning_rate": 6.95553463485748e-06, + "loss": 0.342, + "step": 920 + }, + { + "epoch": 2.5631469979296067, + "grad_norm": 0.43158090114593506, + "learning_rate": 6.159903547013746e-06, + "loss": 0.3335, + "step": 930 + }, + { + "epoch": 2.590752242926156, + "grad_norm": 0.4319579005241394, + "learning_rate": 5.409598177724401e-06, + "loss": 0.3426, + "step": 940 + }, + { + "epoch": 2.6183574879227054, + "grad_norm": 0.4702156186103821, + "learning_rate": 4.7053942581750385e-06, + "loss": 0.3463, + "step": 950 + }, + { + "epoch": 2.6459627329192545, + "grad_norm": 0.38157370686531067, + "learning_rate": 4.048019855848273e-06, + "loss": 0.3331, + "step": 960 + }, + { + "epoch": 2.673567977915804, + "grad_norm": 0.4141283631324768, + "learning_rate": 3.438154621784029e-06, + "loss": 0.3422, + "step": 970 + }, + { + "epoch": 2.7011732229123533, + "grad_norm": 0.42628729343414307, + "learning_rate": 2.8764290878969756e-06, + "loss": 0.3262, + "step": 980 + }, + { + "epoch": 2.728778467908903, + "grad_norm": 0.4850899577140808, + "learning_rate": 2.3634240150775646e-06, + "loss": 0.3303, + "step": 990 + }, + { + "epoch": 2.756383712905452, + "grad_norm": 0.4277842938899994, + "learning_rate": 1.8996697927507468e-06, + "loss": 0.3446, + "step": 1000 + }, + { + "epoch": 2.783988957902001, + "grad_norm": 0.45691201090812683, + "learning_rate": 1.4856458905130822e-06, + "loss": 0.3309, + "step": 1010 + }, + { + "epoch": 2.8115942028985508, + "grad_norm": 0.4542577862739563, + "learning_rate": 1.1217803624152311e-06, + "loss": 0.326, + "step": 1020 + }, + { + "epoch": 2.8391994478951, + "grad_norm": 0.39988699555397034, + "learning_rate": 8.084494044022839e-07, + "loss": 0.3364, + "step": 1030 + }, + { + "epoch": 2.8668046928916495, + "grad_norm": 0.43636584281921387, + "learning_rate": 5.459769653695657e-07, + "loss": 0.3313, + "step": 1040 + }, + { + "epoch": 2.8944099378881987, + "grad_norm": 0.4335787892341614, + "learning_rate": 3.346344122360179e-07, + "loss": 0.328, + "step": 1050 + }, + { + "epoch": 2.9220151828847483, + "grad_norm": 0.4669038951396942, + "learning_rate": 1.746402493813415e-07, + "loss": 0.3426, + "step": 1060 + }, + { + 
"epoch": 2.9496204278812974, + "grad_norm": 0.43036729097366333, + "learning_rate": 6.615989273713874e-08, + "loss": 0.3378, + "step": 1070 + }, + { + "epoch": 2.9772256728778466, + "grad_norm": 0.4190558195114136, + "learning_rate": 9.305498765438404e-09, + "loss": 0.3358, + "step": 1080 + } + ], + "logging_steps": 10, + "max_steps": 1086, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.17252944434797e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-1086/training_args.bin b/checkpoint-1086/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd86c75750949f0ca2ee56bc27dadb57430a90de --- /dev/null +++ b/checkpoint-1086/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52a6484c213110d668e89b1ff8d77bac863e0460a3e92ff200a8df3f14879a5 +size 5688 diff --git a/checkpoint-500/README.md b/checkpoint-500/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d823a0d9f612b6fd128ea17ebbddb8df140520ef --- /dev/null +++ b/checkpoint-500/README.md @@ -0,0 +1,202 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. 
(2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.12.0 \ No newline at end of file diff --git a/checkpoint-500/adapter_config.json b/checkpoint-500/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..652c79f18ba64f20d9cd4fc1eff31c4b47afb1c6 --- /dev/null +++ b/checkpoint-500/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "pissa_niter_16", + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/checkpoint-500/adapter_model.safetensors b/checkpoint-500/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bb64c1678b0993d094dd0652422db9a60b40ece8 --- /dev/null +++ b/checkpoint-500/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43131df51ef87099c41c2eea064b6c7bed5f2ef3b3f3f89d0eeab4739d280a1 +size 268555264 diff --git a/checkpoint-500/optimizer.pt b/checkpoint-500/optimizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..74ddf5668a869bc1a80b8f6b3b7c9e6cc10d05ad --- /dev/null +++ b/checkpoint-500/optimizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b8e1f62ba644f7ff12a4315ffe2e414d517d1aefb5665ce137d8d7e46935864 +size 537626770 diff --git a/checkpoint-500/rng_state_0.pth b/checkpoint-500/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..37ac50652a3badbfb1bdeaccb8b1934575b584eb --- /dev/null +++ b/checkpoint-500/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe0d720c4c75a6a04213fa3b64bacbe794718a53e2b56ebb67a1a795014dfad +size 15024 diff --git a/checkpoint-500/rng_state_1.pth b/checkpoint-500/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..0bc3650851dae439677613c9e23a5528de47b679 --- /dev/null +++ b/checkpoint-500/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:72452d3138d0ca2ff89429e3294a834ae7a68e8596fc757735ca56ae52509d57 +size 15024 diff --git a/checkpoint-500/rng_state_2.pth b/checkpoint-500/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..0e00a6e8b4b743026f68d749a8cb3bdd4b746838 --- /dev/null +++ b/checkpoint-500/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36e306fb8ebcf53a167bfd6c9af74db410a269ada1e619e3e816f5269543b9d +size 15024 diff --git a/checkpoint-500/rng_state_3.pth b/checkpoint-500/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..5354141d42e077c356f9ca8c6b12bd7e5e41f2af --- /dev/null +++ b/checkpoint-500/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb47ce0c6f815a6f8302b0e3819b4c2315ca71dae3138d97fdceb765cdd0a039 +size 15024 diff --git a/checkpoint-500/scheduler.pt b/checkpoint-500/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..94197b147067ec43b36354ccdb9b2783606e6183 --- /dev/null +++ b/checkpoint-500/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b15c8fe8bd1f8470d946d672234e9bd15a1a3744a13876ee58a8a23e297c9d +size 1064 diff --git a/checkpoint-500/special_tokens_map.json b/checkpoint-500/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/checkpoint-500/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/checkpoint-500/tokenizer.json b/checkpoint-500/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c --- /dev/null +++ b/checkpoint-500/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/checkpoint-500/tokenizer_config.json b/checkpoint-500/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a414ab9b6f7fec711d4c1346f5847dd0d5bd0ff --- /dev/null +++ b/checkpoint-500/tokenizer_config.json @@ -0,0 +1,197 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + 
tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 2048, + "pad_token": "<|end▁of▁sentence|>", + "padding_side": "right", + "sp_model_kwargs": {}, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/checkpoint-500/trainer_state.json b/checkpoint-500/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5dfd5733280944330d7e84b420bbc35c487e3577 --- /dev/null +++ b/checkpoint-500/trainer_state.json @@ -0,0 +1,383 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.378191856452726, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.027605244996549344, + "grad_norm": 1.6335422992706299, + "learning_rate": 9.174311926605506e-06, + "loss": 0.8102, + "step": 10 + }, + { + "epoch": 0.05521048999309869, + "grad_norm": 0.8111785054206848, + "learning_rate": 1.834862385321101e-05, + "loss": 0.6999, + "step": 20 + }, + { + "epoch": 0.08281573498964803, + "grad_norm": 0.4619831144809723, + "learning_rate": 2.7522935779816515e-05, + "loss": 0.5682, + "step": 30 + }, + { + "epoch": 0.11042097998619738, + "grad_norm": 0.4434720575809479, + "learning_rate": 3.669724770642202e-05, + "loss": 0.5232, + "step": 40 + }, + { + "epoch": 0.13802622498274672, + "grad_norm": 0.44054797291755676, + "learning_rate": 4.587155963302753e-05, + "loss": 0.5084, + "step": 50 + }, + { + "epoch": 0.16563146997929606, + "grad_norm": 0.42256447672843933, + "learning_rate": 5.504587155963303e-05, + "loss": 0.477, + "step": 60 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.4349405765533447, + "learning_rate": 6.422018348623854e-05, + "loss": 0.4841, + "step": 70 + }, + { + "epoch": 0.22084195997239475, + "grad_norm": 0.4515930712223053, + "learning_rate": 7.339449541284404e-05, + "loss": 0.4704, + "step": 80 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.45412737131118774, + "learning_rate": 8.256880733944955e-05, + "loss": 0.4718, + "step": 90 + }, + { + "epoch": 0.27605244996549344, + "grad_norm": 0.49010995030403137, + "learning_rate": 9.174311926605506e-05, + "loss": 0.4496, + "step": 100 + }, + { + "epoch": 0.3036576949620428, + "grad_norm": 0.4931396245956421, + 
"learning_rate": 9.999974150612772e-05, + "loss": 0.4524, + "step": 110 + }, + { + "epoch": 0.33126293995859213, + "grad_norm": 1.1270735263824463, + "learning_rate": 9.996872547536591e-05, + "loss": 0.4503, + "step": 120 + }, + { + "epoch": 0.3588681849551415, + "grad_norm": 0.48991507291793823, + "learning_rate": 9.988604741439287e-05, + "loss": 0.4399, + "step": 130 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.45801088213920593, + "learning_rate": 9.975179280300506e-05, + "loss": 0.4524, + "step": 140 + }, + { + "epoch": 0.4140786749482402, + "grad_norm": 0.420897901058197, + "learning_rate": 9.956610044533896e-05, + "loss": 0.4281, + "step": 150 + }, + { + "epoch": 0.4416839199447895, + "grad_norm": 0.4336962103843689, + "learning_rate": 9.932916232636318e-05, + "loss": 0.4305, + "step": 160 + }, + { + "epoch": 0.4692891649413389, + "grad_norm": 0.44120800495147705, + "learning_rate": 9.904122341338765e-05, + "loss": 0.4208, + "step": 170 + }, + { + "epoch": 0.4968944099378882, + "grad_norm": 0.9154078364372253, + "learning_rate": 9.870258140279503e-05, + "loss": 0.4436, + "step": 180 + }, + { + "epoch": 0.5244996549344375, + "grad_norm": 0.4551916718482971, + "learning_rate": 9.831358641225624e-05, + "loss": 0.4288, + "step": 190 + }, + { + "epoch": 0.5521048999309869, + "grad_norm": 0.4513665437698364, + "learning_rate": 9.787464061874825e-05, + "loss": 0.4384, + "step": 200 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.43779632449150085, + "learning_rate": 9.738619784274833e-05, + "loss": 0.4178, + "step": 210 + }, + { + "epoch": 0.6073153899240856, + "grad_norm": 0.4170076847076416, + "learning_rate": 9.684876307903494e-05, + "loss": 0.42, + "step": 220 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.4370488226413727, + "learning_rate": 9.626289197457994e-05, + "loss": 0.4296, + "step": 230 + }, + { + "epoch": 0.6625258799171843, + "grad_norm": 0.42547333240509033, + "learning_rate": 9.562919025407236e-05, + "loss": 0.4264, + "step": 240 + }, + { + "epoch": 0.6901311249137336, + "grad_norm": 0.4317057430744171, + "learning_rate": 9.494831309366723e-05, + "loss": 0.4052, + "step": 250 + }, + { + "epoch": 0.717736369910283, + "grad_norm": 0.40589675307273865, + "learning_rate": 9.422096444360735e-05, + "loss": 0.41, + "step": 260 + }, + { + "epoch": 0.7453416149068323, + "grad_norm": 0.44671744108200073, + "learning_rate": 9.34478963004181e-05, + "loss": 0.4162, + "step": 270 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.41162508726119995, + "learning_rate": 9.262990792942768e-05, + "loss": 0.4183, + "step": 280 + }, + { + "epoch": 0.800552104899931, + "grad_norm": 0.483149915933609, + "learning_rate": 9.176784503841697e-05, + "loss": 0.4174, + "step": 290 + }, + { + "epoch": 0.8281573498964804, + "grad_norm": 0.4605332612991333, + "learning_rate": 9.086259890325297e-05, + "loss": 0.4191, + "step": 300 + }, + { + "epoch": 0.8557625948930296, + "grad_norm": 0.4153307378292084, + "learning_rate": 8.991510544640991e-05, + "loss": 0.4253, + "step": 310 + }, + { + "epoch": 0.883367839889579, + "grad_norm": 0.43806084990501404, + "learning_rate": 8.892634426933106e-05, + "loss": 0.4265, + "step": 320 + }, + { + "epoch": 0.9109730848861284, + "grad_norm": 0.45412200689315796, + "learning_rate": 8.78973376396311e-05, + "loss": 0.4365, + "step": 330 + }, + { + "epoch": 0.9385783298826778, + "grad_norm": 0.3769752085208893, + "learning_rate": 8.682914943418676e-05, + "loss": 0.4058, + "step": 340 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 
0.4275883436203003, + "learning_rate": 8.572288403920792e-05, + "loss": 0.4078, + "step": 350 + }, + { + "epoch": 0.9937888198757764, + "grad_norm": 0.43371307849884033, + "learning_rate": 8.45796852084268e-05, + "loss": 0.4063, + "step": 360 + }, + { + "epoch": 1.0193236714975846, + "grad_norm": 0.4527032673358917, + "learning_rate": 8.340073488058552e-05, + "loss": 0.3742, + "step": 370 + }, + { + "epoch": 1.0469289164941338, + "grad_norm": 0.5205631256103516, + "learning_rate": 8.218725195744463e-05, + "loss": 0.3809, + "step": 380 + }, + { + "epoch": 1.0745341614906831, + "grad_norm": 0.4031950533390045, + "learning_rate": 8.094049104357609e-05, + "loss": 0.3823, + "step": 390 + }, + { + "epoch": 1.1021394064872325, + "grad_norm": 0.41949087381362915, + "learning_rate": 7.966174114924351e-05, + "loss": 0.3765, + "step": 400 + }, + { + "epoch": 1.129744651483782, + "grad_norm": 0.43814027309417725, + "learning_rate": 7.83523243577109e-05, + "loss": 0.3751, + "step": 410 + }, + { + "epoch": 1.1573498964803313, + "grad_norm": 0.4457204341888428, + "learning_rate": 7.70135944583575e-05, + "loss": 0.3869, + "step": 420 + }, + { + "epoch": 1.1849551414768806, + "grad_norm": 0.41421836614608765, + "learning_rate": 7.56469355470122e-05, + "loss": 0.3634, + "step": 430 + }, + { + "epoch": 1.21256038647343, + "grad_norm": 0.4416670799255371, + "learning_rate": 7.425376059495442e-05, + "loss": 0.3768, + "step": 440 + }, + { + "epoch": 1.2401656314699794, + "grad_norm": 0.44710710644721985, + "learning_rate": 7.283550998806108e-05, + "loss": 0.3669, + "step": 450 + }, + { + "epoch": 1.2677708764665288, + "grad_norm": 0.39852890372276306, + "learning_rate": 7.139365003760999e-05, + "loss": 0.3824, + "step": 460 + }, + { + "epoch": 1.295376121463078, + "grad_norm": 0.4412725269794464, + "learning_rate": 6.992967146427913e-05, + "loss": 0.3646, + "step": 470 + }, + { + "epoch": 1.3229813664596273, + "grad_norm": 0.41978228092193604, + "learning_rate": 6.844508785690964e-05, + "loss": 0.3755, + "step": 480 + }, + { + "epoch": 1.3505866114561766, + "grad_norm": 0.4214731752872467, + "learning_rate": 6.694143410762542e-05, + "loss": 0.3841, + "step": 490 + }, + { + "epoch": 1.378191856452726, + "grad_norm": 0.4128514230251312, + "learning_rate": 6.54202648249278e-05, + "loss": 0.3839, + "step": 500 + } + ], + "logging_steps": 10, + "max_steps": 1086, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.837431317598044e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-500/training_args.bin b/checkpoint-500/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd86c75750949f0ca2ee56bc27dadb57430a90de --- /dev/null +++ b/checkpoint-500/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52a6484c213110d668e89b1ff8d77bac863e0460a3e92ff200a8df3f14879a5 +size 5688 diff --git a/pissa_backup/README.md b/pissa_backup/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d823a0d9f612b6fd128ea17ebbddb8df140520ef --- /dev/null +++ b/pissa_backup/README.md @@ -0,0 +1,202 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model 
Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
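As a rough, self-contained illustration of that estimate, a short Python sketch follows. Everything in it is an assumption for illustration only: the per-GPU power draw, PUE, and grid carbon intensity are placeholders, and only the roughly 1.9-hour runtime recorded in train_results.json and the four training ranks suggested by the rng_state_0–3 files come from this repository.

```python
# Back-of-the-envelope CO2 estimate in the spirit of the ML Impact calculator:
#   energy (kWh)        = per-GPU power (kW) * GPU count * hours * datacenter PUE
#   emissions (kg CO2e) = energy (kWh) * grid carbon intensity (kg CO2e / kWh)

def estimate_co2_kg(gpu_power_kw: float, num_gpus: int, hours: float,
                    pue: float = 1.1, kg_co2_per_kwh: float = 0.4) -> float:
    """Approximate training emissions; every input here is an assumed placeholder."""
    energy_kwh = gpu_power_kw * num_gpus * hours * pue
    return energy_kwh * kg_co2_per_kwh

# Placeholder figures (0.7 kW per GPU is assumed, not measured for this run).
print(f"{estimate_co2_kg(gpu_power_kw=0.7, num_gpus=4, hours=1.9):.2f} kg CO2e")
```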
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.12.0 \ No newline at end of file diff --git a/pissa_backup/adapter_config.json b/pissa_backup/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..df89fe583f3943b9bfeca2c52b5233866c00a110 --- /dev/null +++ b/pissa_backup/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/pissa_backup/adapter_model.safetensors b/pissa_backup/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..476a84d19c4420d1d508ed6ebb6e7ce24ac1217b --- /dev/null +++ b/pissa_backup/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d2b60858f8196e9c42a81145ebed482335b25f140656f334689c5cd28feb329 +size 268555264 diff --git a/pissa_converted/README.md b/pissa_converted/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d823a0d9f612b6fd128ea17ebbddb8df140520ef --- /dev/null +++ b/pissa_converted/README.md @@ -0,0 +1,202 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + 
+ +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.12.0 \ No newline at end of file diff --git a/pissa_converted/adapter_config.json b/pissa_converted/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d04cab88dd3f79b639159ddee01d4fe060aa22c2 --- /dev/null +++ b/pissa_converted/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 32, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 16, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/pissa_converted/adapter_model.safetensors b/pissa_converted/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3e5c02f3fe68ab34acad6fa189a2c85e7f786d71 --- /dev/null +++ b/pissa_converted/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dbfc40b8ec9b2f06cd550e2a9d01cf1c316504db097c80f10ae188c4bc579292 +size 536991984 diff --git a/pissa_init/README.md b/pissa_init/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d823a0d9f612b6fd128ea17ebbddb8df140520ef --- /dev/null +++ b/pissa_init/README.md @@ -0,0 +1,202 @@ +--- +base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B +library_name: peft +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). 
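The pissa_init/, pissa_backup/, and pissa_converted/ folders in this repository come from the PiSSA workflow selected by `init_lora_weights: "pissa_niter_16"` in the adapter configs. As a hedged sketch of the idea only (this is not code from this repository and not the PEFT implementation verbatim), PiSSA seeds the LoRA factors of each targeted linear layer from its top-r singular directions and trains against the residual weight; the `niter_16` suffix corresponds to the number of subspace iterations of the randomized SVD.

```python
import torch

def pissa_init(weight: torch.Tensor, r: int = 16, niter: int = 16):
    """Sketch of PiSSA-style LoRA initialization for one linear layer (assumed formulation).

    weight: frozen base weight of shape (out_features, in_features).
    Returns the residual weight to keep frozen plus the initialized LoRA factors.
    """
    # Randomized truncated SVD; `niter` mirrors the "pissa_niter_16" setting.
    U, S, V = torch.svd_lowrank(weight.float(), q=r, niter=niter)
    sqrt_S = torch.diag(S.sqrt())
    lora_B = U @ sqrt_S        # (out_features, r)
    lora_A = sqrt_S @ V.T      # (r, in_features)
    residual = weight.float() - lora_B @ lora_A  # replaces the frozen weight during training
    return residual, lora_A, lora_B
```

Converting such an adapter so that it applies on top of the unmodified base weights typically doubles the stored rank (final factors minus initial factors), which would be consistent with the rank-16 / alpha-32 settings in pissa_converted/adapter_config.json and its adapter_model.safetensors being roughly twice the size of the rank-8 pissa_init and pissa_backup adapters.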
+ +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.12.0 \ No newline at end of file diff --git a/pissa_init/adapter_config.json b/pissa_init/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c76b6552676cd51a98750e2f26c3d85b375090d5 --- /dev/null +++ b/pissa_init/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "deepseek-ai/DeepSeek-R1-Distill-Qwen-32B", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": false, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 16, + "lora_dropout": 0.0, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 8, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "down_proj", + "up_proj", + "gate_proj", + "o_proj", + "q_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/pissa_init/adapter_model.safetensors b/pissa_init/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..215bd57118598136cf03c9630dd90dfa6b35d525 --- /dev/null +++ b/pissa_init/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:304735ac6297f3e0dfc0131f190887142e8d1c539967263d1154c4e62620a739 +size 268555264 diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..1d385d62cf08bca35254547902b792c243656ec1 --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,23 @@ +{ + "bos_token": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000000000000000000000000000000000000..1a2db243e47cbc113f6b2ddcc388aeeb8fe1a94c --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e20ddafc659ba90242154b55275402edeca0715e5dbb30f56815a4ce081f4893 +size 11422778 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0a414ab9b6f7fec711d4c1346f5847dd0d5bd0ff --- /dev/null +++ 
b/tokenizer_config.json @@ -0,0 +1,197 @@ +{ + "add_bos_token": true, + "add_eos_token": false, + "add_prefix_space": null, + "added_tokens_decoder": { + "151643": { + "content": "<|end▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151644": { + "content": "<|User|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151645": { + "content": "<|Assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151646": { + "content": "<|begin▁of▁sentence|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151647": { + "content": "<|EOT|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151648": { + "content": "<think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151649": { + "content": "</think>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151650": { + "content": "<|quad_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151651": { + "content": "<|quad_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151652": { + "content": "<|vision_start|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151653": { + "content": "<|vision_end|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151654": { + "content": "<|vision_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151655": { + "content": "<|image_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151656": { + "content": "<|video_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "151657": { + "content": "<tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151658": { + "content": "</tool_call>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151659": { + "content": "<|fim_prefix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151660": { + "content": "<|fim_middle|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151661": { + "content": "<|fim_suffix|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151662": { + "content": "<|fim_pad|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151663": { + "content": "<|repo_name|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + }, + "151664": { + "content": "<|file_sep|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": false + } + }, + "bos_token": "<|begin▁of▁sentence|>", + "chat_template": "{% if not
add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|><think>\\n'}}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|end▁of▁sentence|>", + "extra_special_tokens": {}, + "legacy": true, + "model_max_length": 2048, + "pad_token": "<|end▁of▁sentence|>", + "padding_side": "right", + "sp_model_kwargs": {}, + "split_special_tokens": false, + "tokenizer_class": "LlamaTokenizerFast", + "unk_token": null, + "use_default_system_prompt": false +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6f40b85634398802ef051edb9b192fe596a15ba6 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "epoch": 2.9937888198757765, + "total_flos": 6.17252944434797e+18, + "train_loss": 0.3891050570797086, + "train_runtime": 6764.7829, + "train_samples_per_second": 5.14, + "train_steps_per_second": 0.161 +} \ No newline at end of file diff --git a/trainer_log.jsonl b/trainer_log.jsonl new file mode 100644 index 0000000000000000000000000000000000000000..c7490982d06fd4b759317c4abbbd8741eda6efa9 --- /dev/null +++ b/trainer_log.jsonl @@ -0,0 +1,109 @@ +{"current_steps": 10, "total_steps": 1086, "loss": 0.8102, "lr": 9.174311926605506e-06, "epoch": 0.027605244996549344, "percentage": 0.92, "elapsed_time": "0:01:04", "remaining_time": "1:56:24"} +{"current_steps": 20, "total_steps": 1086, "loss": 0.6999, "lr": 1.834862385321101e-05, "epoch": 0.05521048999309869, "percentage": 1.84, "elapsed_time": "0:02:07", "remaining_time": "1:53:04"} +{"current_steps": 30, "total_steps":
1086, "loss": 0.5682, "lr": 2.7522935779816515e-05, "epoch": 0.08281573498964803, "percentage": 2.76, "elapsed_time": "0:03:09", "remaining_time": "1:50:53"} +{"current_steps": 40, "total_steps": 1086, "loss": 0.5232, "lr": 3.669724770642202e-05, "epoch": 0.11042097998619738, "percentage": 3.68, "elapsed_time": "0:04:09", "remaining_time": "1:48:39"} +{"current_steps": 50, "total_steps": 1086, "loss": 0.5084, "lr": 4.587155963302753e-05, "epoch": 0.13802622498274672, "percentage": 4.6, "elapsed_time": "0:05:10", "remaining_time": "1:47:12"} +{"current_steps": 60, "total_steps": 1086, "loss": 0.477, "lr": 5.504587155963303e-05, "epoch": 0.16563146997929606, "percentage": 5.52, "elapsed_time": "0:06:11", "remaining_time": "1:45:48"} +{"current_steps": 70, "total_steps": 1086, "loss": 0.4841, "lr": 6.422018348623854e-05, "epoch": 0.1932367149758454, "percentage": 6.45, "elapsed_time": "0:07:14", "remaining_time": "1:45:06"} +{"current_steps": 80, "total_steps": 1086, "loss": 0.4704, "lr": 7.339449541284404e-05, "epoch": 0.22084195997239475, "percentage": 7.37, "elapsed_time": "0:08:16", "remaining_time": "1:44:06"} +{"current_steps": 90, "total_steps": 1086, "loss": 0.4718, "lr": 8.256880733944955e-05, "epoch": 0.2484472049689441, "percentage": 8.29, "elapsed_time": "0:09:17", "remaining_time": "1:42:54"} +{"current_steps": 100, "total_steps": 1086, "loss": 0.4496, "lr": 9.174311926605506e-05, "epoch": 0.27605244996549344, "percentage": 9.21, "elapsed_time": "0:10:21", "remaining_time": "1:42:05"} +{"current_steps": 110, "total_steps": 1086, "loss": 0.4524, "lr": 9.999974150612772e-05, "epoch": 0.3036576949620428, "percentage": 10.13, "elapsed_time": "0:11:24", "remaining_time": "1:41:15"} +{"current_steps": 120, "total_steps": 1086, "loss": 0.4503, "lr": 9.996872547536591e-05, "epoch": 0.33126293995859213, "percentage": 11.05, "elapsed_time": "0:12:26", "remaining_time": "1:40:13"} +{"current_steps": 130, "total_steps": 1086, "loss": 0.4399, "lr": 9.988604741439287e-05, "epoch": 0.3588681849551415, "percentage": 11.97, "elapsed_time": "0:13:30", "remaining_time": "1:39:17"} +{"current_steps": 140, "total_steps": 1086, "loss": 0.4524, "lr": 9.975179280300506e-05, "epoch": 0.3864734299516908, "percentage": 12.89, "elapsed_time": "0:14:32", "remaining_time": "1:38:12"} +{"current_steps": 150, "total_steps": 1086, "loss": 0.4281, "lr": 9.956610044533896e-05, "epoch": 0.4140786749482402, "percentage": 13.81, "elapsed_time": "0:15:36", "remaining_time": "1:37:25"} +{"current_steps": 160, "total_steps": 1086, "loss": 0.4305, "lr": 9.932916232636318e-05, "epoch": 0.4416839199447895, "percentage": 14.73, "elapsed_time": "0:16:39", "remaining_time": "1:36:23"} +{"current_steps": 170, "total_steps": 1086, "loss": 0.4208, "lr": 9.904122341338765e-05, "epoch": 0.4692891649413389, "percentage": 15.65, "elapsed_time": "0:17:42", "remaining_time": "1:35:25"} +{"current_steps": 180, "total_steps": 1086, "loss": 0.4436, "lr": 9.870258140279503e-05, "epoch": 0.4968944099378882, "percentage": 16.57, "elapsed_time": "0:18:43", "remaining_time": "1:34:14"} +{"current_steps": 190, "total_steps": 1086, "loss": 0.4288, "lr": 9.831358641225624e-05, "epoch": 0.5244996549344375, "percentage": 17.5, "elapsed_time": "0:19:45", "remaining_time": "1:33:09"} +{"current_steps": 200, "total_steps": 1086, "loss": 0.4384, "lr": 9.787464061874825e-05, "epoch": 0.5521048999309869, "percentage": 18.42, "elapsed_time": "0:20:48", "remaining_time": "1:32:11"} +{"current_steps": 210, "total_steps": 1086, "loss": 0.4178, "lr": 
9.738619784274833e-05, "epoch": 0.5797101449275363, "percentage": 19.34, "elapsed_time": "0:21:50", "remaining_time": "1:31:07"} +{"current_steps": 220, "total_steps": 1086, "loss": 0.42, "lr": 9.684876307903494e-05, "epoch": 0.6073153899240856, "percentage": 20.26, "elapsed_time": "0:22:50", "remaining_time": "1:29:56"} +{"current_steps": 230, "total_steps": 1086, "loss": 0.4296, "lr": 9.626289197457994e-05, "epoch": 0.6349206349206349, "percentage": 21.18, "elapsed_time": "0:23:52", "remaining_time": "1:28:53"} +{"current_steps": 240, "total_steps": 1086, "loss": 0.4264, "lr": 9.562919025407236e-05, "epoch": 0.6625258799171843, "percentage": 22.1, "elapsed_time": "0:24:56", "remaining_time": "1:27:55"} +{"current_steps": 250, "total_steps": 1086, "loss": 0.4052, "lr": 9.494831309366723e-05, "epoch": 0.6901311249137336, "percentage": 23.02, "elapsed_time": "0:26:00", "remaining_time": "1:26:58"} +{"current_steps": 260, "total_steps": 1086, "loss": 0.41, "lr": 9.422096444360735e-05, "epoch": 0.717736369910283, "percentage": 23.94, "elapsed_time": "0:27:02", "remaining_time": "1:25:54"} +{"current_steps": 270, "total_steps": 1086, "loss": 0.4162, "lr": 9.34478963004181e-05, "epoch": 0.7453416149068323, "percentage": 24.86, "elapsed_time": "0:28:04", "remaining_time": "1:24:51"} +{"current_steps": 280, "total_steps": 1086, "loss": 0.4183, "lr": 9.262990792942768e-05, "epoch": 0.7729468599033816, "percentage": 25.78, "elapsed_time": "0:29:09", "remaining_time": "1:23:54"} +{"current_steps": 290, "total_steps": 1086, "loss": 0.4174, "lr": 9.176784503841697e-05, "epoch": 0.800552104899931, "percentage": 26.7, "elapsed_time": "0:30:09", "remaining_time": "1:22:45"} +{"current_steps": 300, "total_steps": 1086, "loss": 0.4191, "lr": 9.086259890325297e-05, "epoch": 0.8281573498964804, "percentage": 27.62, "elapsed_time": "0:31:10", "remaining_time": "1:21:41"} +{"current_steps": 310, "total_steps": 1086, "loss": 0.4253, "lr": 8.991510544640991e-05, "epoch": 0.8557625948930296, "percentage": 28.55, "elapsed_time": "0:32:11", "remaining_time": "1:20:35"} +{"current_steps": 320, "total_steps": 1086, "loss": 0.4265, "lr": 8.892634426933106e-05, "epoch": 0.883367839889579, "percentage": 29.47, "elapsed_time": "0:33:13", "remaining_time": "1:19:32"} +{"current_steps": 330, "total_steps": 1086, "loss": 0.4365, "lr": 8.78973376396311e-05, "epoch": 0.9109730848861284, "percentage": 30.39, "elapsed_time": "0:34:15", "remaining_time": "1:18:29"} +{"current_steps": 340, "total_steps": 1086, "loss": 0.4058, "lr": 8.682914943418676e-05, "epoch": 0.9385783298826778, "percentage": 31.31, "elapsed_time": "0:35:17", "remaining_time": "1:17:24"} +{"current_steps": 350, "total_steps": 1086, "loss": 0.4078, "lr": 8.572288403920792e-05, "epoch": 0.966183574879227, "percentage": 32.23, "elapsed_time": "0:36:18", "remaining_time": "1:16:21"} +{"current_steps": 360, "total_steps": 1086, "loss": 0.4063, "lr": 8.45796852084268e-05, "epoch": 0.9937888198757764, "percentage": 33.15, "elapsed_time": "0:37:20", "remaining_time": "1:15:18"} +{"current_steps": 370, "total_steps": 1086, "loss": 0.3742, "lr": 8.340073488058552e-05, "epoch": 1.0193236714975846, "percentage": 34.07, "elapsed_time": "0:38:16", "remaining_time": "1:14:03"} +{"current_steps": 380, "total_steps": 1086, "loss": 0.3809, "lr": 8.218725195744463e-05, "epoch": 1.0469289164941338, "percentage": 34.99, "elapsed_time": "0:39:18", "remaining_time": "1:13:01"} +{"current_steps": 390, "total_steps": 1086, "loss": 0.3823, "lr": 8.094049104357609e-05, "epoch": 
1.0745341614906831, "percentage": 35.91, "elapsed_time": "0:40:20", "remaining_time": "1:11:59"} +{"current_steps": 400, "total_steps": 1086, "loss": 0.3765, "lr": 7.966174114924351e-05, "epoch": 1.1021394064872325, "percentage": 36.83, "elapsed_time": "0:41:24", "remaining_time": "1:11:01"} +{"current_steps": 410, "total_steps": 1086, "loss": 0.3751, "lr": 7.83523243577109e-05, "epoch": 1.129744651483782, "percentage": 37.75, "elapsed_time": "0:42:26", "remaining_time": "1:09:58"} +{"current_steps": 420, "total_steps": 1086, "loss": 0.3869, "lr": 7.70135944583575e-05, "epoch": 1.1573498964803313, "percentage": 38.67, "elapsed_time": "0:43:29", "remaining_time": "1:08:57"} +{"current_steps": 430, "total_steps": 1086, "loss": 0.3634, "lr": 7.56469355470122e-05, "epoch": 1.1849551414768806, "percentage": 39.59, "elapsed_time": "0:44:30", "remaining_time": "1:07:53"} +{"current_steps": 440, "total_steps": 1086, "loss": 0.3768, "lr": 7.425376059495442e-05, "epoch": 1.21256038647343, "percentage": 40.52, "elapsed_time": "0:45:32", "remaining_time": "1:06:51"} +{"current_steps": 450, "total_steps": 1086, "loss": 0.3669, "lr": 7.283550998806108e-05, "epoch": 1.2401656314699794, "percentage": 41.44, "elapsed_time": "0:46:33", "remaining_time": "1:05:47"} +{"current_steps": 460, "total_steps": 1086, "loss": 0.3824, "lr": 7.139365003760999e-05, "epoch": 1.2677708764665288, "percentage": 42.36, "elapsed_time": "0:47:34", "remaining_time": "1:04:44"} +{"current_steps": 470, "total_steps": 1086, "loss": 0.3646, "lr": 6.992967146427913e-05, "epoch": 1.295376121463078, "percentage": 43.28, "elapsed_time": "0:48:37", "remaining_time": "1:03:43"} +{"current_steps": 480, "total_steps": 1086, "loss": 0.3755, "lr": 6.844508785690964e-05, "epoch": 1.3229813664596273, "percentage": 44.2, "elapsed_time": "0:49:36", "remaining_time": "1:02:38"} +{"current_steps": 490, "total_steps": 1086, "loss": 0.3841, "lr": 6.694143410762542e-05, "epoch": 1.3505866114561766, "percentage": 45.12, "elapsed_time": "0:50:37", "remaining_time": "1:01:34"} +{"current_steps": 500, "total_steps": 1086, "loss": 0.3839, "lr": 6.54202648249278e-05, "epoch": 1.378191856452726, "percentage": 46.04, "elapsed_time": "0:51:41", "remaining_time": "1:00:34"} +{"current_steps": 510, "total_steps": 1086, "loss": 0.3726, "lr": 6.388315272640544e-05, "epoch": 1.4057971014492754, "percentage": 46.96, "elapsed_time": "0:52:45", "remaining_time": "0:59:34"} +{"current_steps": 520, "total_steps": 1086, "loss": 0.3722, "lr": 6.233168701272167e-05, "epoch": 1.4334023464458248, "percentage": 47.88, "elapsed_time": "0:53:45", "remaining_time": "0:58:31"} +{"current_steps": 530, "total_steps": 1086, "loss": 0.3623, "lr": 6.076747172456015e-05, "epoch": 1.4610075914423741, "percentage": 48.8, "elapsed_time": "0:54:49", "remaining_time": "0:57:30"} +{"current_steps": 540, "total_steps": 1086, "loss": 0.3684, "lr": 5.919212408422753e-05, "epoch": 1.4886128364389233, "percentage": 49.72, "elapsed_time": "0:55:49", "remaining_time": "0:56:27"} +{"current_steps": 550, "total_steps": 1086, "loss": 0.3674, "lr": 5.76072728236279e-05, "epoch": 1.5162180814354729, "percentage": 50.64, "elapsed_time": "0:56:54", "remaining_time": "0:55:27"} +{"current_steps": 560, "total_steps": 1086, "loss": 0.3602, "lr": 5.6014556500337534e-05, "epoch": 1.543823326432022, "percentage": 51.57, "elapsed_time": "0:57:55", "remaining_time": "0:54:24"} +{"current_steps": 570, "total_steps": 1086, "loss": 0.3872, "lr": 5.44156218035211e-05, "epoch": 1.5714285714285714, "percentage": 
52.49, "elapsed_time": "0:58:59", "remaining_time": "0:53:24"} +{"current_steps": 580, "total_steps": 1086, "loss": 0.3678, "lr": 5.28121218514406e-05, "epoch": 1.5990338164251208, "percentage": 53.41, "elapsed_time": "1:00:01", "remaining_time": "0:52:22"} +{"current_steps": 590, "total_steps": 1086, "loss": 0.3652, "lr": 5.1205714482317455e-05, "epoch": 1.6266390614216701, "percentage": 54.33, "elapsed_time": "1:01:05", "remaining_time": "0:51:21"} +{"current_steps": 600, "total_steps": 1086, "loss": 0.3786, "lr": 4.95980605403146e-05, "epoch": 1.6542443064182195, "percentage": 55.25, "elapsed_time": "1:02:10", "remaining_time": "0:50:21"} +{"current_steps": 610, "total_steps": 1086, "loss": 0.3715, "lr": 4.79908221584108e-05, "epoch": 1.6818495514147687, "percentage": 56.17, "elapsed_time": "1:03:13", "remaining_time": "0:49:19"} +{"current_steps": 620, "total_steps": 1086, "loss": 0.386, "lr": 4.638566103994258e-05, "epoch": 1.7094547964113183, "percentage": 57.09, "elapsed_time": "1:04:14", "remaining_time": "0:48:17"} +{"current_steps": 630, "total_steps": 1086, "loss": 0.3723, "lr": 4.478423674059015e-05, "epoch": 1.7370600414078674, "percentage": 58.01, "elapsed_time": "1:05:19", "remaining_time": "0:47:16"} +{"current_steps": 640, "total_steps": 1086, "loss": 0.3794, "lr": 4.318820495258396e-05, "epoch": 1.764665286404417, "percentage": 58.93, "elapsed_time": "1:06:21", "remaining_time": "0:46:14"} +{"current_steps": 650, "total_steps": 1086, "loss": 0.3641, "lr": 4.159921579290546e-05, "epoch": 1.7922705314009661, "percentage": 59.85, "elapsed_time": "1:07:24", "remaining_time": "0:45:13"} +{"current_steps": 660, "total_steps": 1086, "loss": 0.3727, "lr": 4.0018912097252234e-05, "epoch": 1.8198757763975155, "percentage": 60.77, "elapsed_time": "1:08:27", "remaining_time": "0:44:11"} +{"current_steps": 670, "total_steps": 1086, "loss": 0.3666, "lr": 3.8448927721530967e-05, "epoch": 1.847481021394065, "percentage": 61.69, "elapsed_time": "1:09:31", "remaining_time": "0:43:09"} +{"current_steps": 680, "total_steps": 1086, "loss": 0.3707, "lr": 3.6890885852634635e-05, "epoch": 1.8750862663906143, "percentage": 62.62, "elapsed_time": "1:10:33", "remaining_time": "0:42:07"} +{"current_steps": 690, "total_steps": 1086, "loss": 0.3793, "lr": 3.534639733025017e-05, "epoch": 1.9026915113871636, "percentage": 63.54, "elapsed_time": "1:11:36", "remaining_time": "0:41:05"} +{"current_steps": 700, "total_steps": 1086, "loss": 0.3623, "lr": 3.3817058981431784e-05, "epoch": 1.9302967563837128, "percentage": 64.46, "elapsed_time": "1:12:38", "remaining_time": "0:40:03"} +{"current_steps": 710, "total_steps": 1086, "loss": 0.3564, "lr": 3.230445196966181e-05, "epoch": 1.9579020013802624, "percentage": 65.38, "elapsed_time": "1:13:41", "remaining_time": "0:39:01"} +{"current_steps": 720, "total_steps": 1086, "loss": 0.3681, "lr": 3.081014016010584e-05, "epoch": 1.9855072463768115, "percentage": 66.3, "elapsed_time": "1:14:41", "remaining_time": "0:37:58"} +{"current_steps": 730, "total_steps": 1086, "loss": 0.359, "lr": 2.9335668502752394e-05, "epoch": 2.0110420979986197, "percentage": 67.22, "elapsed_time": "1:15:37", "remaining_time": "0:36:52"} +{"current_steps": 740, "total_steps": 1086, "loss": 0.3189, "lr": 2.7882561435108824e-05, "epoch": 2.0386473429951693, "percentage": 68.14, "elapsed_time": "1:16:40", "remaining_time": "0:35:50"} +{"current_steps": 750, "total_steps": 1086, "loss": 0.3409, "lr": 2.6452321306104634e-05, "epoch": 2.0662525879917184, "percentage": 69.06, "elapsed_time": 
"1:17:44", "remaining_time": "0:34:49"} +{"current_steps": 760, "total_steps": 1086, "loss": 0.3354, "lr": 2.5046426822832175e-05, "epoch": 2.0938578329882676, "percentage": 69.98, "elapsed_time": "1:18:46", "remaining_time": "0:33:47"} +{"current_steps": 770, "total_steps": 1086, "loss": 0.3366, "lr": 2.3666331521730024e-05, "epoch": 2.121463077984817, "percentage": 70.9, "elapsed_time": "1:19:48", "remaining_time": "0:32:45"} +{"current_steps": 780, "total_steps": 1086, "loss": 0.3231, "lr": 2.2313462265790196e-05, "epoch": 2.1490683229813663, "percentage": 71.82, "elapsed_time": "1:20:49", "remaining_time": "0:31:42"} +{"current_steps": 790, "total_steps": 1086, "loss": 0.3333, "lr": 2.098921776934269e-05, "epoch": 2.176673567977916, "percentage": 72.74, "elapsed_time": "1:21:52", "remaining_time": "0:30:40"} +{"current_steps": 800, "total_steps": 1086, "loss": 0.3401, "lr": 1.96949671519424e-05, "epoch": 2.204278812974465, "percentage": 73.66, "elapsed_time": "1:22:52", "remaining_time": "0:29:37"} +{"current_steps": 810, "total_steps": 1086, "loss": 0.3453, "lr": 1.843204852285389e-05, "epoch": 2.2318840579710146, "percentage": 74.59, "elapsed_time": "1:23:53", "remaining_time": "0:28:35"} +{"current_steps": 820, "total_steps": 1086, "loss": 0.338, "lr": 1.7201767597597196e-05, "epoch": 2.259489302967564, "percentage": 75.51, "elapsed_time": "1:24:57", "remaining_time": "0:27:33"} +{"current_steps": 830, "total_steps": 1086, "loss": 0.3334, "lr": 1.60053963479852e-05, "epoch": 2.287094547964113, "percentage": 76.43, "elapsed_time": "1:25:58", "remaining_time": "0:26:31"} +{"current_steps": 840, "total_steps": 1086, "loss": 0.3359, "lr": 1.4844171687048058e-05, "epoch": 2.3146997929606625, "percentage": 77.35, "elapsed_time": "1:27:02", "remaining_time": "0:25:29"} +{"current_steps": 850, "total_steps": 1086, "loss": 0.3534, "lr": 1.371929419020459e-05, "epoch": 2.3423050379572117, "percentage": 78.27, "elapsed_time": "1:28:03", "remaining_time": "0:24:27"} +{"current_steps": 860, "total_steps": 1086, "loss": 0.3247, "lr": 1.2631926854002574e-05, "epoch": 2.3699102829537613, "percentage": 79.19, "elapsed_time": "1:29:06", "remaining_time": "0:23:25"} +{"current_steps": 870, "total_steps": 1086, "loss": 0.3309, "lr": 1.1583193893711475e-05, "epoch": 2.3975155279503104, "percentage": 80.11, "elapsed_time": "1:30:09", "remaining_time": "0:22:23"} +{"current_steps": 880, "total_steps": 1086, "loss": 0.3408, "lr": 1.0574179581010468e-05, "epoch": 2.42512077294686, "percentage": 81.03, "elapsed_time": "1:31:11", "remaining_time": "0:21:20"} +{"current_steps": 890, "total_steps": 1086, "loss": 0.3338, "lr": 9.60592712297379e-06, "epoch": 2.452726017943409, "percentage": 81.95, "elapsed_time": "1:32:12", "remaining_time": "0:20:18"} +{"current_steps": 900, "total_steps": 1086, "loss": 0.3398, "lr": 8.679437583512168e-06, "epoch": 2.4803312629399588, "percentage": 82.87, "elapsed_time": "1:33:12", "remaining_time": "0:19:15"} +{"current_steps": 910, "total_steps": 1086, "loss": 0.333, "lr": 7.795668848385623e-06, "epoch": 2.507936507936508, "percentage": 83.79, "elapsed_time": "1:34:17", "remaining_time": "0:18:14"} +{"current_steps": 920, "total_steps": 1086, "loss": 0.342, "lr": 6.95553463485748e-06, "epoch": 2.5355417529330575, "percentage": 84.71, "elapsed_time": "1:35:17", "remaining_time": "0:17:11"} +{"current_steps": 930, "total_steps": 1086, "loss": 0.3335, "lr": 6.159903547013746e-06, "epoch": 2.5631469979296067, "percentage": 85.64, "elapsed_time": "1:36:21", "remaining_time": 
"0:16:09"} +{"current_steps": 940, "total_steps": 1086, "loss": 0.3426, "lr": 5.409598177724401e-06, "epoch": 2.590752242926156, "percentage": 86.56, "elapsed_time": "1:37:22", "remaining_time": "0:15:07"} +{"current_steps": 950, "total_steps": 1086, "loss": 0.3463, "lr": 4.7053942581750385e-06, "epoch": 2.6183574879227054, "percentage": 87.48, "elapsed_time": "1:38:22", "remaining_time": "0:14:05"} +{"current_steps": 960, "total_steps": 1086, "loss": 0.3331, "lr": 4.048019855848273e-06, "epoch": 2.6459627329192545, "percentage": 88.4, "elapsed_time": "1:39:25", "remaining_time": "0:13:03"} +{"current_steps": 970, "total_steps": 1086, "loss": 0.3422, "lr": 3.438154621784029e-06, "epoch": 2.673567977915804, "percentage": 89.32, "elapsed_time": "1:40:27", "remaining_time": "0:12:00"} +{"current_steps": 980, "total_steps": 1086, "loss": 0.3262, "lr": 2.8764290878969756e-06, "epoch": 2.7011732229123533, "percentage": 90.24, "elapsed_time": "1:41:27", "remaining_time": "0:10:58"} +{"current_steps": 990, "total_steps": 1086, "loss": 0.3303, "lr": 2.3634240150775646e-06, "epoch": 2.728778467908903, "percentage": 91.16, "elapsed_time": "1:42:32", "remaining_time": "0:09:56"} +{"current_steps": 1000, "total_steps": 1086, "loss": 0.3446, "lr": 1.8996697927507468e-06, "epoch": 2.756383712905452, "percentage": 92.08, "elapsed_time": "1:43:34", "remaining_time": "0:08:54"} +{"current_steps": 1010, "total_steps": 1086, "loss": 0.3309, "lr": 1.4856458905130822e-06, "epoch": 2.783988957902001, "percentage": 93.0, "elapsed_time": "1:44:40", "remaining_time": "0:07:52"} +{"current_steps": 1020, "total_steps": 1086, "loss": 0.326, "lr": 1.1217803624152311e-06, "epoch": 2.8115942028985508, "percentage": 93.92, "elapsed_time": "1:45:46", "remaining_time": "0:06:50"} +{"current_steps": 1030, "total_steps": 1086, "loss": 0.3364, "lr": 8.084494044022839e-07, "epoch": 2.8391994478951, "percentage": 94.84, "elapsed_time": "1:46:48", "remaining_time": "0:05:48"} +{"current_steps": 1040, "total_steps": 1086, "loss": 0.3313, "lr": 5.459769653695657e-07, "epoch": 2.8668046928916495, "percentage": 95.76, "elapsed_time": "1:47:50", "remaining_time": "0:04:46"} +{"current_steps": 1050, "total_steps": 1086, "loss": 0.328, "lr": 3.346344122360179e-07, "epoch": 2.8944099378881987, "percentage": 96.69, "elapsed_time": "1:48:51", "remaining_time": "0:03:43"} +{"current_steps": 1060, "total_steps": 1086, "loss": 0.3426, "lr": 1.746402493813415e-07, "epoch": 2.9220151828847483, "percentage": 97.61, "elapsed_time": "1:49:53", "remaining_time": "0:02:41"} +{"current_steps": 1070, "total_steps": 1086, "loss": 0.3378, "lr": 6.615989273713874e-08, "epoch": 2.9496204278812974, "percentage": 98.53, "elapsed_time": "1:50:59", "remaining_time": "0:01:39"} +{"current_steps": 1080, "total_steps": 1086, "loss": 0.3358, "lr": 9.305498765438404e-09, "epoch": 2.9772256728778466, "percentage": 99.45, "elapsed_time": "1:52:03", "remaining_time": "0:00:37"} +{"current_steps": 1086, "total_steps": 1086, "epoch": 2.9937888198757765, "percentage": 100.0, "elapsed_time": "1:52:43", "remaining_time": "0:00:00"} diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e2e64e3591e4dec2b26b7f77dbde6e53f919be07 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,798 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9937888198757765, + "eval_steps": 500, + "global_step": 1086, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": 
true, + "log_history": [ + { + "epoch": 0.027605244996549344, + "grad_norm": 1.6335422992706299, + "learning_rate": 9.174311926605506e-06, + "loss": 0.8102, + "step": 10 + }, + { + "epoch": 0.05521048999309869, + "grad_norm": 0.8111785054206848, + "learning_rate": 1.834862385321101e-05, + "loss": 0.6999, + "step": 20 + }, + { + "epoch": 0.08281573498964803, + "grad_norm": 0.4619831144809723, + "learning_rate": 2.7522935779816515e-05, + "loss": 0.5682, + "step": 30 + }, + { + "epoch": 0.11042097998619738, + "grad_norm": 0.4434720575809479, + "learning_rate": 3.669724770642202e-05, + "loss": 0.5232, + "step": 40 + }, + { + "epoch": 0.13802622498274672, + "grad_norm": 0.44054797291755676, + "learning_rate": 4.587155963302753e-05, + "loss": 0.5084, + "step": 50 + }, + { + "epoch": 0.16563146997929606, + "grad_norm": 0.42256447672843933, + "learning_rate": 5.504587155963303e-05, + "loss": 0.477, + "step": 60 + }, + { + "epoch": 0.1932367149758454, + "grad_norm": 0.4349405765533447, + "learning_rate": 6.422018348623854e-05, + "loss": 0.4841, + "step": 70 + }, + { + "epoch": 0.22084195997239475, + "grad_norm": 0.4515930712223053, + "learning_rate": 7.339449541284404e-05, + "loss": 0.4704, + "step": 80 + }, + { + "epoch": 0.2484472049689441, + "grad_norm": 0.45412737131118774, + "learning_rate": 8.256880733944955e-05, + "loss": 0.4718, + "step": 90 + }, + { + "epoch": 0.27605244996549344, + "grad_norm": 0.49010995030403137, + "learning_rate": 9.174311926605506e-05, + "loss": 0.4496, + "step": 100 + }, + { + "epoch": 0.3036576949620428, + "grad_norm": 0.4931396245956421, + "learning_rate": 9.999974150612772e-05, + "loss": 0.4524, + "step": 110 + }, + { + "epoch": 0.33126293995859213, + "grad_norm": 1.1270735263824463, + "learning_rate": 9.996872547536591e-05, + "loss": 0.4503, + "step": 120 + }, + { + "epoch": 0.3588681849551415, + "grad_norm": 0.48991507291793823, + "learning_rate": 9.988604741439287e-05, + "loss": 0.4399, + "step": 130 + }, + { + "epoch": 0.3864734299516908, + "grad_norm": 0.45801088213920593, + "learning_rate": 9.975179280300506e-05, + "loss": 0.4524, + "step": 140 + }, + { + "epoch": 0.4140786749482402, + "grad_norm": 0.420897901058197, + "learning_rate": 9.956610044533896e-05, + "loss": 0.4281, + "step": 150 + }, + { + "epoch": 0.4416839199447895, + "grad_norm": 0.4336962103843689, + "learning_rate": 9.932916232636318e-05, + "loss": 0.4305, + "step": 160 + }, + { + "epoch": 0.4692891649413389, + "grad_norm": 0.44120800495147705, + "learning_rate": 9.904122341338765e-05, + "loss": 0.4208, + "step": 170 + }, + { + "epoch": 0.4968944099378882, + "grad_norm": 0.9154078364372253, + "learning_rate": 9.870258140279503e-05, + "loss": 0.4436, + "step": 180 + }, + { + "epoch": 0.5244996549344375, + "grad_norm": 0.4551916718482971, + "learning_rate": 9.831358641225624e-05, + "loss": 0.4288, + "step": 190 + }, + { + "epoch": 0.5521048999309869, + "grad_norm": 0.4513665437698364, + "learning_rate": 9.787464061874825e-05, + "loss": 0.4384, + "step": 200 + }, + { + "epoch": 0.5797101449275363, + "grad_norm": 0.43779632449150085, + "learning_rate": 9.738619784274833e-05, + "loss": 0.4178, + "step": 210 + }, + { + "epoch": 0.6073153899240856, + "grad_norm": 0.4170076847076416, + "learning_rate": 9.684876307903494e-05, + "loss": 0.42, + "step": 220 + }, + { + "epoch": 0.6349206349206349, + "grad_norm": 0.4370488226413727, + "learning_rate": 9.626289197457994e-05, + "loss": 0.4296, + "step": 230 + }, + { + "epoch": 0.6625258799171843, + "grad_norm": 0.42547333240509033, + "learning_rate": 
9.562919025407236e-05, + "loss": 0.4264, + "step": 240 + }, + { + "epoch": 0.6901311249137336, + "grad_norm": 0.4317057430744171, + "learning_rate": 9.494831309366723e-05, + "loss": 0.4052, + "step": 250 + }, + { + "epoch": 0.717736369910283, + "grad_norm": 0.40589675307273865, + "learning_rate": 9.422096444360735e-05, + "loss": 0.41, + "step": 260 + }, + { + "epoch": 0.7453416149068323, + "grad_norm": 0.44671744108200073, + "learning_rate": 9.34478963004181e-05, + "loss": 0.4162, + "step": 270 + }, + { + "epoch": 0.7729468599033816, + "grad_norm": 0.41162508726119995, + "learning_rate": 9.262990792942768e-05, + "loss": 0.4183, + "step": 280 + }, + { + "epoch": 0.800552104899931, + "grad_norm": 0.483149915933609, + "learning_rate": 9.176784503841697e-05, + "loss": 0.4174, + "step": 290 + }, + { + "epoch": 0.8281573498964804, + "grad_norm": 0.4605332612991333, + "learning_rate": 9.086259890325297e-05, + "loss": 0.4191, + "step": 300 + }, + { + "epoch": 0.8557625948930296, + "grad_norm": 0.4153307378292084, + "learning_rate": 8.991510544640991e-05, + "loss": 0.4253, + "step": 310 + }, + { + "epoch": 0.883367839889579, + "grad_norm": 0.43806084990501404, + "learning_rate": 8.892634426933106e-05, + "loss": 0.4265, + "step": 320 + }, + { + "epoch": 0.9109730848861284, + "grad_norm": 0.45412200689315796, + "learning_rate": 8.78973376396311e-05, + "loss": 0.4365, + "step": 330 + }, + { + "epoch": 0.9385783298826778, + "grad_norm": 0.3769752085208893, + "learning_rate": 8.682914943418676e-05, + "loss": 0.4058, + "step": 340 + }, + { + "epoch": 0.966183574879227, + "grad_norm": 0.4275883436203003, + "learning_rate": 8.572288403920792e-05, + "loss": 0.4078, + "step": 350 + }, + { + "epoch": 0.9937888198757764, + "grad_norm": 0.43371307849884033, + "learning_rate": 8.45796852084268e-05, + "loss": 0.4063, + "step": 360 + }, + { + "epoch": 1.0193236714975846, + "grad_norm": 0.4527032673358917, + "learning_rate": 8.340073488058552e-05, + "loss": 0.3742, + "step": 370 + }, + { + "epoch": 1.0469289164941338, + "grad_norm": 0.5205631256103516, + "learning_rate": 8.218725195744463e-05, + "loss": 0.3809, + "step": 380 + }, + { + "epoch": 1.0745341614906831, + "grad_norm": 0.4031950533390045, + "learning_rate": 8.094049104357609e-05, + "loss": 0.3823, + "step": 390 + }, + { + "epoch": 1.1021394064872325, + "grad_norm": 0.41949087381362915, + "learning_rate": 7.966174114924351e-05, + "loss": 0.3765, + "step": 400 + }, + { + "epoch": 1.129744651483782, + "grad_norm": 0.43814027309417725, + "learning_rate": 7.83523243577109e-05, + "loss": 0.3751, + "step": 410 + }, + { + "epoch": 1.1573498964803313, + "grad_norm": 0.4457204341888428, + "learning_rate": 7.70135944583575e-05, + "loss": 0.3869, + "step": 420 + }, + { + "epoch": 1.1849551414768806, + "grad_norm": 0.41421836614608765, + "learning_rate": 7.56469355470122e-05, + "loss": 0.3634, + "step": 430 + }, + { + "epoch": 1.21256038647343, + "grad_norm": 0.4416670799255371, + "learning_rate": 7.425376059495442e-05, + "loss": 0.3768, + "step": 440 + }, + { + "epoch": 1.2401656314699794, + "grad_norm": 0.44710710644721985, + "learning_rate": 7.283550998806108e-05, + "loss": 0.3669, + "step": 450 + }, + { + "epoch": 1.2677708764665288, + "grad_norm": 0.39852890372276306, + "learning_rate": 7.139365003760999e-05, + "loss": 0.3824, + "step": 460 + }, + { + "epoch": 1.295376121463078, + "grad_norm": 0.4412725269794464, + "learning_rate": 6.992967146427913e-05, + "loss": 0.3646, + "step": 470 + }, + { + "epoch": 1.3229813664596273, + "grad_norm": 0.41978228092193604, + 
"learning_rate": 6.844508785690964e-05, + "loss": 0.3755, + "step": 480 + }, + { + "epoch": 1.3505866114561766, + "grad_norm": 0.4214731752872467, + "learning_rate": 6.694143410762542e-05, + "loss": 0.3841, + "step": 490 + }, + { + "epoch": 1.378191856452726, + "grad_norm": 0.4128514230251312, + "learning_rate": 6.54202648249278e-05, + "loss": 0.3839, + "step": 500 + }, + { + "epoch": 1.4057971014492754, + "grad_norm": 0.3899001181125641, + "learning_rate": 6.388315272640544e-05, + "loss": 0.3726, + "step": 510 + }, + { + "epoch": 1.4334023464458248, + "grad_norm": 0.4347754120826721, + "learning_rate": 6.233168701272167e-05, + "loss": 0.3722, + "step": 520 + }, + { + "epoch": 1.4610075914423741, + "grad_norm": 0.3798378109931946, + "learning_rate": 6.076747172456015e-05, + "loss": 0.3623, + "step": 530 + }, + { + "epoch": 1.4886128364389233, + "grad_norm": 0.3879692256450653, + "learning_rate": 5.919212408422753e-05, + "loss": 0.3684, + "step": 540 + }, + { + "epoch": 1.5162180814354729, + "grad_norm": 0.4210754930973053, + "learning_rate": 5.76072728236279e-05, + "loss": 0.3674, + "step": 550 + }, + { + "epoch": 1.543823326432022, + "grad_norm": 0.4184245467185974, + "learning_rate": 5.6014556500337534e-05, + "loss": 0.3602, + "step": 560 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.43027910590171814, + "learning_rate": 5.44156218035211e-05, + "loss": 0.3872, + "step": 570 + }, + { + "epoch": 1.5990338164251208, + "grad_norm": 0.38721945881843567, + "learning_rate": 5.28121218514406e-05, + "loss": 0.3678, + "step": 580 + }, + { + "epoch": 1.6266390614216701, + "grad_norm": 0.4199799597263336, + "learning_rate": 5.1205714482317455e-05, + "loss": 0.3652, + "step": 590 + }, + { + "epoch": 1.6542443064182195, + "grad_norm": 0.40728333592414856, + "learning_rate": 4.95980605403146e-05, + "loss": 0.3786, + "step": 600 + }, + { + "epoch": 1.6818495514147687, + "grad_norm": 0.41107377409935, + "learning_rate": 4.79908221584108e-05, + "loss": 0.3715, + "step": 610 + }, + { + "epoch": 1.7094547964113183, + "grad_norm": 0.45491889119148254, + "learning_rate": 4.638566103994258e-05, + "loss": 0.386, + "step": 620 + }, + { + "epoch": 1.7370600414078674, + "grad_norm": 0.4167945683002472, + "learning_rate": 4.478423674059015e-05, + "loss": 0.3723, + "step": 630 + }, + { + "epoch": 1.764665286404417, + "grad_norm": 0.4188650846481323, + "learning_rate": 4.318820495258396e-05, + "loss": 0.3794, + "step": 640 + }, + { + "epoch": 1.7922705314009661, + "grad_norm": 0.45200666785240173, + "learning_rate": 4.159921579290546e-05, + "loss": 0.3641, + "step": 650 + }, + { + "epoch": 1.8198757763975155, + "grad_norm": 0.42524534463882446, + "learning_rate": 4.0018912097252234e-05, + "loss": 0.3727, + "step": 660 + }, + { + "epoch": 1.847481021394065, + "grad_norm": 0.4238753318786621, + "learning_rate": 3.8448927721530967e-05, + "loss": 0.3666, + "step": 670 + }, + { + "epoch": 1.8750862663906143, + "grad_norm": 0.3949458599090576, + "learning_rate": 3.6890885852634635e-05, + "loss": 0.3707, + "step": 680 + }, + { + "epoch": 1.9026915113871636, + "grad_norm": 0.4040445387363434, + "learning_rate": 3.534639733025017e-05, + "loss": 0.3793, + "step": 690 + }, + { + "epoch": 1.9302967563837128, + "grad_norm": 0.42878955602645874, + "learning_rate": 3.3817058981431784e-05, + "loss": 0.3623, + "step": 700 + }, + { + "epoch": 1.9579020013802624, + "grad_norm": 0.42626291513442993, + "learning_rate": 3.230445196966181e-05, + "loss": 0.3564, + "step": 710 + }, + { + "epoch": 1.9855072463768115, + "grad_norm": 
0.43052035570144653, + "learning_rate": 3.081014016010584e-05, + "loss": 0.3681, + "step": 720 + }, + { + "epoch": 2.0110420979986197, + "grad_norm": 0.4627828896045685, + "learning_rate": 2.9335668502752394e-05, + "loss": 0.359, + "step": 730 + }, + { + "epoch": 2.0386473429951693, + "grad_norm": 0.45345333218574524, + "learning_rate": 2.7882561435108824e-05, + "loss": 0.3189, + "step": 740 + }, + { + "epoch": 2.0662525879917184, + "grad_norm": 0.40497517585754395, + "learning_rate": 2.6452321306104634e-05, + "loss": 0.3409, + "step": 750 + }, + { + "epoch": 2.0938578329882676, + "grad_norm": 0.4666087329387665, + "learning_rate": 2.5046426822832175e-05, + "loss": 0.3354, + "step": 760 + }, + { + "epoch": 2.121463077984817, + "grad_norm": 0.38220757246017456, + "learning_rate": 2.3666331521730024e-05, + "loss": 0.3366, + "step": 770 + }, + { + "epoch": 2.1490683229813663, + "grad_norm": 0.4605223536491394, + "learning_rate": 2.2313462265790196e-05, + "loss": 0.3231, + "step": 780 + }, + { + "epoch": 2.176673567977916, + "grad_norm": 0.558403730392456, + "learning_rate": 2.098921776934269e-05, + "loss": 0.3333, + "step": 790 + }, + { + "epoch": 2.204278812974465, + "grad_norm": 0.45217105746269226, + "learning_rate": 1.96949671519424e-05, + "loss": 0.3401, + "step": 800 + }, + { + "epoch": 2.2318840579710146, + "grad_norm": 0.4413389563560486, + "learning_rate": 1.843204852285389e-05, + "loss": 0.3453, + "step": 810 + }, + { + "epoch": 2.259489302967564, + "grad_norm": 0.3977566063404083, + "learning_rate": 1.7201767597597196e-05, + "loss": 0.338, + "step": 820 + }, + { + "epoch": 2.287094547964113, + "grad_norm": 0.4817161560058594, + "learning_rate": 1.60053963479852e-05, + "loss": 0.3334, + "step": 830 + }, + { + "epoch": 2.3146997929606625, + "grad_norm": 0.4438902735710144, + "learning_rate": 1.4844171687048058e-05, + "loss": 0.3359, + "step": 840 + }, + { + "epoch": 2.3423050379572117, + "grad_norm": 0.45830076932907104, + "learning_rate": 1.371929419020459e-05, + "loss": 0.3534, + "step": 850 + }, + { + "epoch": 2.3699102829537613, + "grad_norm": 0.48253732919692993, + "learning_rate": 1.2631926854002574e-05, + "loss": 0.3247, + "step": 860 + }, + { + "epoch": 2.3975155279503104, + "grad_norm": 0.4572385549545288, + "learning_rate": 1.1583193893711475e-05, + "loss": 0.3309, + "step": 870 + }, + { + "epoch": 2.42512077294686, + "grad_norm": 0.4570174217224121, + "learning_rate": 1.0574179581010468e-05, + "loss": 0.3408, + "step": 880 + }, + { + "epoch": 2.452726017943409, + "grad_norm": 0.5289928913116455, + "learning_rate": 9.60592712297379e-06, + "loss": 0.3338, + "step": 890 + }, + { + "epoch": 2.4803312629399588, + "grad_norm": 0.49394240975379944, + "learning_rate": 8.679437583512168e-06, + "loss": 0.3398, + "step": 900 + }, + { + "epoch": 2.507936507936508, + "grad_norm": 0.412822425365448, + "learning_rate": 7.795668848385623e-06, + "loss": 0.333, + "step": 910 + }, + { + "epoch": 2.5355417529330575, + "grad_norm": 0.4305315911769867, + "learning_rate": 6.95553463485748e-06, + "loss": 0.342, + "step": 920 + }, + { + "epoch": 2.5631469979296067, + "grad_norm": 0.43158090114593506, + "learning_rate": 6.159903547013746e-06, + "loss": 0.3335, + "step": 930 + }, + { + "epoch": 2.590752242926156, + "grad_norm": 0.4319579005241394, + "learning_rate": 5.409598177724401e-06, + "loss": 0.3426, + "step": 940 + }, + { + "epoch": 2.6183574879227054, + "grad_norm": 0.4702156186103821, + "learning_rate": 4.7053942581750385e-06, + "loss": 0.3463, + "step": 950 + }, + { + "epoch": 
2.6459627329192545, + "grad_norm": 0.38157370686531067, + "learning_rate": 4.048019855848273e-06, + "loss": 0.3331, + "step": 960 + }, + { + "epoch": 2.673567977915804, + "grad_norm": 0.4141283631324768, + "learning_rate": 3.438154621784029e-06, + "loss": 0.3422, + "step": 970 + }, + { + "epoch": 2.7011732229123533, + "grad_norm": 0.42628729343414307, + "learning_rate": 2.8764290878969756e-06, + "loss": 0.3262, + "step": 980 + }, + { + "epoch": 2.728778467908903, + "grad_norm": 0.4850899577140808, + "learning_rate": 2.3634240150775646e-06, + "loss": 0.3303, + "step": 990 + }, + { + "epoch": 2.756383712905452, + "grad_norm": 0.4277842938899994, + "learning_rate": 1.8996697927507468e-06, + "loss": 0.3446, + "step": 1000 + }, + { + "epoch": 2.783988957902001, + "grad_norm": 0.45691201090812683, + "learning_rate": 1.4856458905130822e-06, + "loss": 0.3309, + "step": 1010 + }, + { + "epoch": 2.8115942028985508, + "grad_norm": 0.4542577862739563, + "learning_rate": 1.1217803624152311e-06, + "loss": 0.326, + "step": 1020 + }, + { + "epoch": 2.8391994478951, + "grad_norm": 0.39988699555397034, + "learning_rate": 8.084494044022839e-07, + "loss": 0.3364, + "step": 1030 + }, + { + "epoch": 2.8668046928916495, + "grad_norm": 0.43636584281921387, + "learning_rate": 5.459769653695657e-07, + "loss": 0.3313, + "step": 1040 + }, + { + "epoch": 2.8944099378881987, + "grad_norm": 0.4335787892341614, + "learning_rate": 3.346344122360179e-07, + "loss": 0.328, + "step": 1050 + }, + { + "epoch": 2.9220151828847483, + "grad_norm": 0.4669038951396942, + "learning_rate": 1.746402493813415e-07, + "loss": 0.3426, + "step": 1060 + }, + { + "epoch": 2.9496204278812974, + "grad_norm": 0.43036729097366333, + "learning_rate": 6.615989273713874e-08, + "loss": 0.3378, + "step": 1070 + }, + { + "epoch": 2.9772256728778466, + "grad_norm": 0.4190558195114136, + "learning_rate": 9.305498765438404e-09, + "loss": 0.3358, + "step": 1080 + }, + { + "epoch": 2.9937888198757765, + "step": 1086, + "total_flos": 6.17252944434797e+18, + "train_loss": 0.3891050570797086, + "train_runtime": 6764.7829, + "train_samples_per_second": 5.14, + "train_steps_per_second": 0.161 + } + ], + "logging_steps": 10, + "max_steps": 1086, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.17252944434797e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd86c75750949f0ca2ee56bc27dadb57430a90de --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52a6484c213110d668e89b1ff8d77bac863e0460a3e92ff200a8df3f14879a5 +size 5688 diff --git a/training_loss.png b/training_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..3ec530bd4465dd893f0e87f525592ef900463ab1 Binary files /dev/null and b/training_loss.png differ
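
The diff above also adds `training_loss.png`, which is simply a rendering of the per-step `loss` values recorded in `trainer_log.jsonl`. As a minimal, hypothetical sketch (not part of the repository, and assuming `matplotlib` is installed), the curve can be reproduced directly from the log; the field names `current_steps` and `loss` are taken from the records shown in the diff:

```python
import json

import matplotlib.pyplot as plt

steps, losses = [], []
with open("trainer_log.jsonl") as f:
    for line in f:
        record = json.loads(line)
        # The final summary record omits "loss", so keep only logged training steps.
        if "loss" in record:
            steps.append(record["current_steps"])
            losses.append(record["loss"])

plt.plot(steps, losses)
plt.xlabel("step")
plt.ylabel("training loss")
plt.savefig("training_loss_replot.png")
```

According to the logged records, the loss falls from about 0.81 at step 10 to roughly 0.33–0.34 over the final steps, consistent with the averaged `train_loss` of about 0.389 reported in `train_results.json`.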