|
[WARNING|parser.py:272] 2024-07-24 15:04:58,287 >> We recommend enable `upcast_layernorm` in quantized training. |
|
|
|
[WARNING|parser.py:292] 2024-07-24 15:04:58,287 >> `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. |
|
|
|
[INFO|parser.py:344] 2024-07-24 15:04:58,288 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 |
|
|
|
07/24/2024 15:04:58 - WARNING - llamafactory.hparams.parser - We recommend enable `upcast_layernorm` in quantized training. |
|
|
|
07/24/2024 15:04:58 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training. |
|
|
|
07/24/2024 15:04:58 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16 |
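For context on the two warnings above: this is a 2-GPU DDP run (ranks 0 and 1) computing in bfloat16, with LoRA adapters on a quantized base model. A minimal sketch, assuming plain Hugging Face TrainingArguments rather than the exact LLaMA-Factory invocation behind this log, of how the DDP flag from the warning is usually pinned; `upcast_layernorm` itself is a LLaMA-Factory model argument and is only noted in a comment:

    # Sketch only; the field names are real transformers TrainingArguments options,
    # but the values are inferred from this log, not taken from the actual run config.
    from transformers import TrainingArguments

    training_args = TrainingArguments(
        output_dir="saves/Custom/lora/train_2024-07-24-15-00-21",
        bf16=True,                          # matches "compute dtype: torch.bfloat16"
        ddp_find_unused_parameters=False,   # what the LoRA-in-DDP warning asks for
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,
    )
    # `upcast_layernorm: true` would additionally be set in the LLaMA-Factory config,
    # as the first warning recommends for quantized training.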
|
|
|
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,642 >> loading file tokenizer.model from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/tokenizer.model |
|
|
|
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file tokenizer.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/tokenizer.json |
|
|
|
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file added_tokens.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/added_tokens.json |
|
|
|
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file special_tokens_map.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/special_tokens_map.json |
|
|
|
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file tokenizer_config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/tokenizer_config.json |
|
|
|
[INFO|tokenization_utils_base.py:2533] 2024-07-24 15:05:00,693 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. |
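The tokenizer warning above means Phi-3 ships extra special tokens on top of its base SentencePiece vocabulary; in a LoRA run their embedding rows stay frozen unless the embedding layers are explicitly made trainable. A quick standalone check (plain transformers API, not code from LLaMA-Factory itself):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")
    print(tok.all_special_tokens)   # the special tokens the warning refers to
    print(len(tok))                 # tokenizer size; the config reserves vocab_size = 32064 rows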
|
|
|
[INFO|loader.py:52] 2024-07-24 15:05:00,694 >> Loading dataset dataset_alpaca_IT_train_and_eval_25K.json... |
|
|
|
07/24/2024 15:05:08 - INFO - llamafactory.data.loader - Loading dataset dataset_alpaca_IT_train_and_eval_25K.json... |
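The dataset name suggests an alpaca-format instruction-tuning file with roughly 25K records. For illustration only, a record in that schema looks like the following (the field names are the standard alpaca keys; the text is invented, since the file's contents are not shown in this log):

    example_record = {
        "instruction": "Summarize the ticket below in one sentence.",   # invented example text
        "input": "The VPN client disconnects every 30 minutes ...",
        "output": "The user's VPN drops on a 30-minute cycle, likely a timeout setting.",
    }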
|
|
|
[INFO|configuration_utils.py:733] 2024-07-24 15:05:11,124 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json |
|
|
|
[INFO|configuration_utils.py:733] 2024-07-24 15:05:11,485 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json |
|
|
|
[INFO|configuration_utils.py:800] 2024-07-24 15:05:11,488 >> Model config Phi3Config {
  "_name_or_path": "microsoft/Phi-3-medium-128k-instruct",
  "architectures": ["Phi3ForCausalLM"],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-medium-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-medium-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 17920,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "num_key_value_heads": 10,
  "original_max_position_embeddings": 4096,
  "pad_token_id": null,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.25, 1.25, 1.5,
                    2.0, 2.75, 5.75, 5.75, 6.5, 9.25, 11.0, 13.25, 19.25, 19.75, 19.75, 21.25, 21.5, 26.5, 30.0, 33.75,
                    35.25, 38.5, 42.0, 42.25, 46.0, 47.0, 50.0, 50.5, 51.0, 52.0, 52.75, 53.75, 54.75, 57.0, 57.25, 58.5,
                    59.25, 59.5, 62.0, 62.5, 62.75, 63.25, 63.25, 63.25, 63.75, 64.0, 64.0, 64.25, 64.5, 64.5, 65.0, 65.0],
    "short_factor": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.01, 1.02, 1.02, 1.04, 1.04, 1.07, 1.07, 1.1,
                     1.3000000000000003, 1.3000000000000003, 1.5000000000000004, 1.5700000000000005, 1.9000000000000008, 2.3100000000000014,
                     2.759999999999992, 3.3899999999999784, 3.9399999999999666, 4.009999999999965, 4.289999999999959, 4.349999999999958,
                     5.349999999999937, 6.659999999999909, 7.029999999999901, 7.51999999999989, 8.00999999999988, 8.249999999999876,
                     8.279999999999875, 9.629999999999846, 9.89999999999984, 10.589999999999826, 11.049999999999816, 11.7899999999998,
                     12.189999999999792, 12.889999999999777, 13.129999999999772, 13.16999999999977, 13.20999999999977, 13.479999999999764,
                     13.539999999999763, 13.779999999999758, 13.929999999999755, 14.429999999999744, 14.759999999999737, 15.149999999999729,
                     15.419999999999723, 15.53999999999972, 15.659999999999718, 15.749999999999716, 15.759999999999716, 15.799999999999715,
                     16.05999999999971, 16.079999999999714, 16.11999999999972, 16.11999999999972, 16.18999999999973, 16.31999999999975,
                     16.539999999999786, 16.799999999999827],
    "type": "su"
  },
  "rope_theta": 10000.0,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.1",
  "use_cache": true,
  "vocab_size": 32064
}
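A few numbers implied by the config above: each attention head is 5120 / 40 = 128-dimensional, and with 10 KV heads the model uses grouped-query attention in which 4 query heads share each KV head. The `rope_scaling` block (`"type": "su"`) holds the per-dimension factors that stretch the original 4,096-token context (`original_max_position_embeddings`) to the advertised 131,072. The arithmetic, spelled out:

    # Derived directly from the printed Phi3Config values; no external assumptions.
    hidden_size, num_attention_heads, num_key_value_heads = 5120, 40, 10

    head_dim = hidden_size // num_attention_heads            # 128
    gqa_group = num_attention_heads // num_key_value_heads   # 4 query heads per KV head
    kv_dim = num_key_value_heads * head_dim                  # 1280 output dims per K and per V
    print(head_dim, gqa_group, kv_dim)                       # 128 4 1280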
|
|
|
|
|
[INFO|quantization.py:182] 2024-07-24 15:05:11,496 >> Quantizing model to 4 bit with bitsandbytes. |
|
|
|
[INFO|modeling_utils.py:3621] 2024-07-24 15:05:12,104 >> loading weights file model.safetensors from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/model.safetensors.index.json |
|
|
|
07/24/2024 15:05:12 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes. |
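The two quantization lines correspond to a QLoRA-style load: the bf16 checkpoint is quantized to 4-bit with bitsandbytes as it is loaded. A sketch of the typical setup (assumption: NF4 with double quantization, which is the usual QLoRA recipe; the log only states "4 bit with bitsandbytes"):

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",              # assumed; not stated in the log
        bnb_4bit_use_double_quant=True,         # assumed; not stated in the log
        bnb_4bit_compute_dtype=torch.bfloat16,  # matches the logged compute dtype
    )
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3-medium-128k-instruct",
        quantization_config=bnb_config,
        torch_dtype=torch.bfloat16,
    )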
|
|
|
[INFO|modeling_utils.py:1569] 2024-07-24 15:10:52,981 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16. |
|
|
|
[INFO|configuration_utils.py:1038] 2024-07-24 15:10:52,989 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 32000
}
|
|
|
|
|
[INFO|modeling_utils.py:4450] 2024-07-24 15:11:18,809 >> All model checkpoint weights were used when initializing Phi3ForCausalLM. |
|
|
|
|
|
[INFO|modeling_utils.py:4458] 2024-07-24 15:11:18,810 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-medium-128k-instruct. |
|
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training. |
|
|
|
[INFO|configuration_utils.py:993] 2024-07-24 15:11:18,895 >> loading configuration file generation_config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/generation_config.json |
|
|
|
[INFO|configuration_utils.py:1038] 2024-07-24 15:11:18,896 >> Generate config GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": [32000, 32001, 32007],
  "pad_token_id": 32000
}
|
|
|
|
|
07/24/2024 15:12:39 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled. |
|
|
|
07/24/2024 15:12:39 - INFO - llamafactory.model.model_utils.attention - Using FlashAttention-2 for faster training and inference. |
|
|
|
07/24/2024 15:12:39 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32. |
|
|
|
07/24/2024 15:12:39 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA |
|
|
|
07/24/2024 15:12:39 - INFO - llamafactory.model.model_utils.misc - Found linear modules: o_proj,gate_up_proj,qkv_proj,down_proj |
|
|
|
07/24/2024 15:12:39 - INFO - llamafactory.model.loader - trainable params: 27,852,800 || all params: 13,988,090,880 || trainable%: 0.1991 |
|
|
|
[INFO|checkpointing.py:103] 2024-07-24 15:12:42,061 >> Gradient checkpointing enabled. |
|
|
|
[INFO|attention.py:82] 2024-07-24 15:12:42,061 >> Using FlashAttention-2 for faster training and inference. |
|
|
|
[INFO|adapter.py:302] 2024-07-24 15:12:42,061 >> Upcasting trainable params to float32. |
|
|
|
[INFO|adapter.py:158] 2024-07-24 15:12:42,061 >> Fine-tuning method: LoRA |
|
|
|
[INFO|misc.py:51] 2024-07-24 15:12:42,062 >> Found linear modules: o_proj,qkv_proj,down_proj,gate_up_proj |
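The four module names above are Phi-3's fused attention and MLP projections, so LoRA is attached to every linear layer in each transformer block. A peft sketch matching that list (assumption: rank 8 and alpha 16, LLaMA-Factory's defaults; neither value is printed in this log):

    from peft import LoraConfig, get_peft_model
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-medium-128k-instruct")  # or the 4-bit load sketched above
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.0,
        target_modules=["qkv_proj", "o_proj", "gate_up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()   # should report roughly 27.85M trainable parameters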
|
|
|
[INFO|loader.py:196] 2024-07-24 15:12:42,467 >> trainable params: 27,852,800 || all params: 13,988,090,880 || trainable%: 0.1991 |
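The 27,852,800 trainable parameters (0.1991% of 13,988,090,880 total) are consistent with LoRA rank 8 on those four projections across all 40 layers. A back-of-the-envelope check using the dimensions from the Phi3Config above (the rank itself is an inference, not something the log states):

    hidden, inter, layers, head_dim, kv_heads = 5120, 17920, 40, 128, 10
    qkv_out = hidden + 2 * kv_heads * head_dim   # fused qkv_proj output width: 7680
    gate_up_out = 2 * inter                      # fused gate_up_proj output width: 35840

    def lora_params(d_in, d_out, r=8):
        return r * (d_in + d_out)                # LoRA A is r x d_in, B is d_out x r

    per_layer = (lora_params(hidden, qkv_out)        # qkv_proj
                 + lora_params(hidden, hidden)       # o_proj
                 + lora_params(hidden, gate_up_out)  # gate_up_proj
                 + lora_params(inter, hidden))       # down_proj
    print(per_layer * layers)                        # 27852800, matching the logged count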
|
|
|
[INFO|trainer.py:648] 2024-07-24 15:12:42,473 >> Using auto half precision backend |
|
|
|
[INFO|deepspeed.py:329] 2024-07-24 15:12:42,673 >> Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB) |
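This message appears because the run combines a DeepSpeed ZeRO config that offloads optimizer state to CPU with the trainer's own (non-DeepSpeed) optimizer. The exact JSON used for this run is not shown in the log; a hypothetical config of the shape DeepSpeed expects would look like:

    # Illustrative only: the stage and values here are assumptions, not read from the actual run.
    ds_config = {
        "train_micro_batch_size_per_gpu": 2,
        "gradient_accumulation_steps": 8,
        "bf16": {"enabled": True},
        "zero_optimization": {
            "stage": 2,
            "offload_optimizer": {"device": "cpu", "pin_memory": True},
        },
    }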
|
|
|
[INFO|trainer.py:2134] 2024-07-24 15:13:06,954 >> ***** Running training ***** |
|
|
|
[INFO|trainer.py:2135] 2024-07-24 15:13:06,954 >> Num examples = 4,944 |
|
|
|
[INFO|trainer.py:2136] 2024-07-24 15:13:06,954 >> Num Epochs = 3 |
|
|
|
[INFO|trainer.py:2137] 2024-07-24 15:13:06,954 >> Instantaneous batch size per device = 2 |
|
|
|
[INFO|trainer.py:2140] 2024-07-24 15:13:06,954 >> Total train batch size (w. parallel, distributed & accumulation) = 32 |
|
|
|
[INFO|trainer.py:2141] 2024-07-24 15:13:06,954 >> Gradient Accumulation steps = 8 |
|
|
|
[INFO|trainer.py:2142] 2024-07-24 15:13:06,954 >> Total optimization steps = 462 |
|
|
|
[INFO|trainer.py:2143] 2024-07-24 15:13:06,958 >> Number of trainable parameters = 27,852,800 |
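The header is internally consistent: 2 examples per device × 8 accumulation steps × 2 GPUs gives the total batch of 32, and 4,944 examples at that batch size yield 154 optimizer steps per epoch, i.e. 462 steps over 3 epochs. Spelled out:

    num_examples, epochs = 4944, 3
    per_device_bs, grad_accum, gpus = 2, 8, 2

    total_batch = per_device_bs * grad_accum * gpus   # 32, as logged
    steps_per_epoch = num_examples // total_batch     # 4944 // 32 -> 154
    print(steps_per_epoch * epochs)                   # 462 optimization steps, as logged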
|
|
|
[INFO|callbacks.py:310] 2024-07-24 15:21:31,280 >> {'loss': 0.5099, 'learning_rate': 1.0000e-05, 'epoch': 0.06, 'throughput': 2571.26} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 15:30:13,855 >> {'loss': 0.5115, 'learning_rate': 2.0000e-05, 'epoch': 0.13, 'throughput': 2534.92} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 15:38:39,051 >> {'loss': 0.4846, 'learning_rate': 3.0000e-05, 'epoch': 0.19, 'throughput': 2535.35} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 15:47:32,541 >> {'loss': 0.4076, 'learning_rate': 4.0000e-05, 'epoch': 0.26, 'throughput': 2553.97} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 15:56:37,450 >> {'loss': 0.3073, 'learning_rate': 5.0000e-05, 'epoch': 0.32, 'throughput': 2556.06} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:04:50,564 >> {'loss': 0.2516, 'learning_rate': 4.9927e-05, 'epoch': 0.39, 'throughput': 2559.12} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:13:54,185 >> {'loss': 0.2256, 'learning_rate': 4.9710e-05, 'epoch': 0.45, 'throughput': 2549.95} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:22:56,812 >> {'loss': 0.2146, 'learning_rate': 4.9349e-05, 'epoch': 0.52, 'throughput': 2547.79} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:31:48,796 >> {'loss': 0.2018, 'learning_rate': 4.8846e-05, 'epoch': 0.58, 'throughput': 2552.59} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:40:22,764 >> {'loss': 0.1958, 'learning_rate': 4.8205e-05, 'epoch': 0.65, 'throughput': 2556.94} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:48:46,503 >> {'loss': 0.1912, 'learning_rate': 4.7429e-05, 'epoch': 0.71, 'throughput': 2557.17} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 16:57:16,752 >> {'loss': 0.1876, 'learning_rate': 4.6522e-05, 'epoch': 0.78, 'throughput': 2558.65} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:05:34,669 >> {'loss': 0.1802, 'learning_rate': 4.5491e-05, 'epoch': 0.84, 'throughput': 2561.14} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:14:13,797 >> {'loss': 0.1793, 'learning_rate': 4.4340e-05, 'epoch': 0.91, 'throughput': 2560.01} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:22:00,855 >> {'loss': 0.1759, 'learning_rate': 4.3077e-05, 'epoch': 0.97, 'throughput': 2565.87} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:30:17,877 >> {'loss': 0.1746, 'learning_rate': 4.1709e-05, 'epoch': 1.04, 'throughput': 2564.86} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:38:59,853 >> {'loss': 0.1699, 'learning_rate': 4.0244e-05, 'epoch': 1.10, 'throughput': 2564.04} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:48:09,599 >> {'loss': 0.1680, 'learning_rate': 3.8690e-05, 'epoch': 1.17, 'throughput': 2560.03} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 17:56:04,957 >> {'loss': 0.1646, 'learning_rate': 3.7057e-05, 'epoch': 1.23, 'throughput': 2563.38} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:04:29,792 >> {'loss': 0.1667, 'learning_rate': 3.5354e-05, 'epoch': 1.29, 'throughput': 2565.53} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:13:16,644 >> {'loss': 0.1664, 'learning_rate': 3.3590e-05, 'epoch': 1.36, 'throughput': 2565.84} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:21:27,030 >> {'loss': 0.1622, 'learning_rate': 3.1777e-05, 'epoch': 1.42, 'throughput': 2565.58} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:30:03,234 >> {'loss': 0.1623, 'learning_rate': 2.9924e-05, 'epoch': 1.49, 'throughput': 2565.43} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:38:50,374 >> {'loss': 0.1616, 'learning_rate': 2.8043e-05, 'epoch': 1.55, 'throughput': 2565.51} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:47:03,606 >> {'loss': 0.1590, 'learning_rate': 2.6143e-05, 'epoch': 1.62, 'throughput': 2566.31} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 18:55:49,268 >> {'loss': 0.1619, 'learning_rate': 2.4238e-05, 'epoch': 1.68, 'throughput': 2564.17} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:04:24,775 >> {'loss': 0.1616, 'learning_rate': 2.2336e-05, 'epoch': 1.75, 'throughput': 2565.53} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:12:40,325 >> {'loss': 0.1604, 'learning_rate': 2.0450e-05, 'epoch': 1.81, 'throughput': 2566.61} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:21:50,945 >> {'loss': 0.1563, 'learning_rate': 1.8591e-05, 'epoch': 1.88, 'throughput': 2564.70} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:30:42,384 >> {'loss': 0.1548, 'learning_rate': 1.6769e-05, 'epoch': 1.94, 'throughput': 2565.36} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:39:26,495 >> {'loss': 0.1555, 'learning_rate': 1.4994e-05, 'epoch': 2.01, 'throughput': 2565.22} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:48:16,049 >> {'loss': 0.1526, 'learning_rate': 1.3278e-05, 'epoch': 2.07, 'throughput': 2564.82} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 19:56:55,676 >> {'loss': 0.1526, 'learning_rate': 1.1630e-05, 'epoch': 2.14, 'throughput': 2563.60} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:05:48,055 >> {'loss': 0.1516, 'learning_rate': 1.0060e-05, 'epoch': 2.20, 'throughput': 2564.75} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:14:05,975 >> {'loss': 0.1524, 'learning_rate': 8.5762e-06, 'epoch': 2.27, 'throughput': 2565.56} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:23:04,597 >> {'loss': 0.1502, 'learning_rate': 7.1880e-06, 'epoch': 2.33, 'throughput': 2564.08} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:31:47,755 >> {'loss': 0.1506, 'learning_rate': 5.9035e-06, 'epoch': 2.39, 'throughput': 2563.27} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:40:43,735 >> {'loss': 0.1479, 'learning_rate': 4.7298e-06, 'epoch': 2.46, 'throughput': 2560.31} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:49:04,924 >> {'loss': 0.1501, 'learning_rate': 3.6740e-06, 'epoch': 2.52, 'throughput': 2560.79} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 20:57:37,960 >> {'loss': 0.1504, 'learning_rate': 2.7422e-06, 'epoch': 2.59, 'throughput': 2561.70} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 21:05:54,158 >> {'loss': 0.1524, 'learning_rate': 1.9397e-06, 'epoch': 2.65, 'throughput': 2561.78} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 21:14:51,826 >> {'loss': 0.1494, 'learning_rate': 1.2712e-06, 'epoch': 2.72, 'throughput': 2559.88} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 21:23:35,877 >> {'loss': 0.1515, 'learning_rate': 7.4056e-07, 'epoch': 2.78, 'throughput': 2559.75} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 21:32:00,699 >> {'loss': 0.1504, 'learning_rate': 3.5095e-07, 'epoch': 2.85, 'throughput': 2559.97} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 21:40:43,444 >> {'loss': 0.1498, 'learning_rate': 1.0459e-07, 'epoch': 2.91, 'throughput': 2559.45} |
|
|
|
[INFO|callbacks.py:310] 2024-07-24 21:48:52,834 >> {'loss': 0.1469, 'learning_rate': 2.9071e-09, 'epoch': 2.98, 'throughput': 2559.91} |
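The learning-rate trace above is consistent with a linear warmup over the first ~50 steps to the 5e-5 peak, followed by cosine decay to near zero. A reconstruction that reproduces the logged values (the warmup length and schedule type are inferred from the numbers, not stated in the log):

    import math

    max_steps, warmup, peak_lr = 462, 50, 5.0e-5

    def lr_at(step):
        if step < warmup:
            return peak_lr * step / warmup
        progress = (step - warmup) / (max_steps - warmup)
        return peak_lr * 0.5 * (1.0 + math.cos(math.pi * progress))

    print(f"{lr_at(10):.4e}")   # 1.0000e-05, the first logged value
    print(f"{lr_at(60):.4e}")   # 4.9927e-05, the first post-warmup value
    print(f"{lr_at(460):.4e}")  # ~2.9e-09, the final logged value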
|
|
|
[INFO|trainer.py:3503] 2024-07-24 21:50:38,895 >> Saving model checkpoint to saves/Custom/lora/train_2024-07-24-15-00-21/checkpoint-462 |
|
|
|
[INFO|configuration_utils.py:733] 2024-07-24 21:50:39,083 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json |
|
|
|
[INFO|configuration_utils.py:800] 2024-07-24 21:50:39,084 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-medium-128k-instruct",
  "architectures": ["Phi3ForCausalLM"],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-medium-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-medium-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 17920,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "num_key_value_heads": 10,
  "original_max_position_embeddings": 4096,
  "pad_token_id": null,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.25, 1.25, 1.5,
                    2.0, 2.75, 5.75, 5.75, 6.5, 9.25, 11.0, 13.25, 19.25, 19.75, 19.75, 21.25, 21.5, 26.5, 30.0, 33.75,
                    35.25, 38.5, 42.0, 42.25, 46.0, 47.0, 50.0, 50.5, 51.0, 52.0, 52.75, 53.75, 54.75, 57.0, 57.25, 58.5,
                    59.25, 59.5, 62.0, 62.5, 62.75, 63.25, 63.25, 63.25, 63.75, 64.0, 64.0, 64.25, 64.5, 64.5, 65.0, 65.0],
    "short_factor": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.01, 1.02, 1.02, 1.04, 1.04, 1.07, 1.07, 1.1,
                     1.3000000000000003, 1.3000000000000003, 1.5000000000000004, 1.5700000000000005, 1.9000000000000008, 2.3100000000000014,
                     2.759999999999992, 3.3899999999999784, 3.9399999999999666, 4.009999999999965, 4.289999999999959, 4.349999999999958,
                     5.349999999999937, 6.659999999999909, 7.029999999999901, 7.51999999999989, 8.00999999999988, 8.249999999999876,
                     8.279999999999875, 9.629999999999846, 9.89999999999984, 10.589999999999826, 11.049999999999816, 11.7899999999998,
                     12.189999999999792, 12.889999999999777, 13.129999999999772, 13.16999999999977, 13.20999999999977, 13.479999999999764,
                     13.539999999999763, 13.779999999999758, 13.929999999999755, 14.429999999999744, 14.759999999999737, 15.149999999999729,
                     15.419999999999723, 15.53999999999972, 15.659999999999718, 15.749999999999716, 15.759999999999716, 15.799999999999715,
                     16.05999999999971, 16.079999999999714, 16.11999999999972, 16.11999999999972, 16.18999999999973, 16.31999999999975,
                     16.539999999999786, 16.799999999999827],
    "type": "su"
  },
  "rope_theta": 10000.0,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.1",
  "use_cache": true,
  "vocab_size": 32064
}
|
|
|
|
|
[INFO|tokenization_utils_base.py:2702] 2024-07-24 21:50:39,135 >> tokenizer config file saved in saves/Custom/lora/train_2024-07-24-15-00-21/checkpoint-462/tokenizer_config.json |
|
|
|
[INFO|tokenization_utils_base.py:2711] 2024-07-24 21:50:39,135 >> Special tokens file saved in saves/Custom/lora/train_2024-07-24-15-00-21/checkpoint-462/special_tokens_map.json |
|
|
|
[INFO|trainer.py:2394] 2024-07-24 21:50:39,852 >> |
|
|
|
Training completed. Do not forget to share your model on huggingface.co/models =) |
|
|
|
|
|
|
|
[INFO|trainer.py:3503] 2024-07-24 21:50:42,012 >> Saving model checkpoint to saves/Custom/lora/train_2024-07-24-15-00-21 |
|
|
|
[INFO|configuration_utils.py:733] 2024-07-24 21:50:42,204 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json |
|
|
|
[INFO|configuration_utils.py:800] 2024-07-24 21:50:42,205 >> Model config Phi3Config {
  "_name_or_path": "Phi-3-medium-128k-instruct",
  "architectures": ["Phi3ForCausalLM"],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "microsoft/Phi-3-medium-128k-instruct--configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "microsoft/Phi-3-medium-128k-instruct--modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 17920,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 40,
  "num_hidden_layers": 40,
  "num_key_value_heads": 10,
  "original_max_position_embeddings": 4096,
  "pad_token_id": null,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.25, 1.25, 1.5,
                    2.0, 2.75, 5.75, 5.75, 6.5, 9.25, 11.0, 13.25, 19.25, 19.75, 19.75, 21.25, 21.5, 26.5, 30.0, 33.75,
                    35.25, 38.5, 42.0, 42.25, 46.0, 47.0, 50.0, 50.5, 51.0, 52.0, 52.75, 53.75, 54.75, 57.0, 57.25, 58.5,
                    59.25, 59.5, 62.0, 62.5, 62.75, 63.25, 63.25, 63.25, 63.75, 64.0, 64.0, 64.25, 64.5, 64.5, 65.0, 65.0],
    "short_factor": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.01, 1.02, 1.02, 1.04, 1.04, 1.07, 1.07, 1.1,
                     1.3000000000000003, 1.3000000000000003, 1.5000000000000004, 1.5700000000000005, 1.9000000000000008, 2.3100000000000014,
                     2.759999999999992, 3.3899999999999784, 3.9399999999999666, 4.009999999999965, 4.289999999999959, 4.349999999999958,
                     5.349999999999937, 6.659999999999909, 7.029999999999901, 7.51999999999989, 8.00999999999988, 8.249999999999876,
                     8.279999999999875, 9.629999999999846, 9.89999999999984, 10.589999999999826, 11.049999999999816, 11.7899999999998,
                     12.189999999999792, 12.889999999999777, 13.129999999999772, 13.16999999999977, 13.20999999999977, 13.479999999999764,
                     13.539999999999763, 13.779999999999758, 13.929999999999755, 14.429999999999744, 14.759999999999737, 15.149999999999729,
                     15.419999999999723, 15.53999999999972, 15.659999999999718, 15.749999999999716, 15.759999999999716, 15.799999999999715,
                     16.05999999999971, 16.079999999999714, 16.11999999999972, 16.11999999999972, 16.18999999999973, 16.31999999999975,
                     16.539999999999786, 16.799999999999827],
    "type": "su"
  },
  "rope_theta": 10000.0,
  "sliding_window": 131072,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.1",
  "use_cache": true,
  "vocab_size": 32064
}
|
|
|
|
|
[INFO|tokenization_utils_base.py:2702] 2024-07-24 21:50:42,263 >> tokenizer config file saved in saves/Custom/lora/train_2024-07-24-15-00-21/tokenizer_config.json |
|
|
|
[INFO|tokenization_utils_base.py:2711] 2024-07-24 21:50:42,264 >> Special tokens file saved in saves/Custom/lora/train_2024-07-24-15-00-21/special_tokens_map.json |
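At this point only the LoRA adapter weights (plus tokenizer files) have been written to saves/Custom/lora/train_2024-07-24-15-00-21; the base model is untouched. A sketch of reloading the adapter for inference, assuming it was saved in standard PEFT format as LLaMA-Factory does:

    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM, AutoTokenizer

    adapter_dir = "saves/Custom/lora/train_2024-07-24-15-00-21"
    base = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-3-medium-128k-instruct", torch_dtype=torch.bfloat16, device_map="auto"
    )
    model = PeftModel.from_pretrained(base, adapter_dir)
    tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
    # model = model.merge_and_unload()   # optional: fold the adapter into the base weights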
|
|
|
[WARNING|ploting.py:89] 2024-07-24 21:50:42,678 >> No metric eval_loss to plot. |
|
|
|
[WARNING|ploting.py:89] 2024-07-24 21:50:42,678 >> No metric eval_accuracy to plot. |
|
|
|
[INFO|modelcard.py:449] 2024-07-24 21:50:42,679 >> Dropping the following result as it does not have all the necessary fields: |
|
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}} |
|
|
|
|