|
[WARNING|2024-12-11 15:38:05] logging.py:162 >> `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.

[INFO|2024-12-11 15:38:05] parser.py:355 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
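The warning above refers to a Hugging Face TrainingArguments flag: when only the LoRA adapters receive gradients, DDP's unused-parameter detection must be disabled. A minimal sketch of setting it explicitly, assuming a standard transformers setup (the actual launch script is not shown in this log):

    # Sketch: DDP + LoRA training arguments. Only LoRA params produce grads,
    # so DDP's find_unused_parameters must be False.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="saves/Llama-3.2-1B-Instruct/lora/lora_v14",  # path from this log
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        bf16=True,
        ddp_find_unused_parameters=False,  # the flag this warning is about
    )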
|
|
|
[INFO|2024-12-11 15:38:05] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json

[INFO|2024-12-11 15:38:05] configuration_utils.py:800 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.4",
  "use_cache": true,
  "vocab_size": 128256
}
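For reference, the same dump can be reproduced outside the trainer; a minimal sketch using the model id from this log:

    # Sketch: load and inspect the same LlamaConfig from the Hub cache.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    print(config.num_hidden_layers, config.hidden_size, config.rope_scaling)
    # 16 2048 {'factor': 32.0, ..., 'rope_type': 'llama3'}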
|
|
|
|
|
[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file tokenizer.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file added_tokens.json from cache at None

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file special_tokens_map.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file tokenizer_config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
|
|
[INFO|2024-12-11 15:38:05] configuration_utils.py:733 >> loading configuration file config.json from cache (second load; LlamaConfig dump identical to the one above)

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file tokenizer.json / added_tokens.json / special_tokens_map.json / tokenizer_config.json from cache (same paths as above)

[INFO|2024-12-11 15:38:06] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
|
|
[INFO|2024-12-11 15:38:06] logging.py:157 >> Replace eos token: <|eot_id|>

[INFO|2024-12-11 15:38:06] logging.py:157 >> Add pad token: <|eot_id|>
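These two lines show LLaMA-Factory reusing <|eot_id|> (token id 128009) as both the effective EOS token and the padding token. A sketch of the equivalent tokenizer setup, assuming the stock transformers API:

    # Sketch: make <|eot_id|> serve as both eos and pad, as the log reports.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    tok.eos_token = "<|eot_id|>"
    tok.pad_token = "<|eot_id|>"
    print(tok.pad_token_id)  # 128009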
|
|
|
[INFO|2024-12-11 15:38:06] logging.py:157 >> Loading dataset radiology_sft_instruct.json... |
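The log names the dataset file but not its schema. Assuming the common LLaMA-Factory alpaca-style layout (an assumption; the actual fields are not shown here), a record would look roughly like:

    # Hypothetical record shape for radiology_sft_instruct.json
    # (alpaca-style fields assumed; field contents are illustrative).
    example = {
        "instruction": "Summarize the findings of this radiology report.",
        "input": "<report text>",
        "output": "<summary>",
    }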
|
|
|
[INFO|2024-12-11 15:38:07] configuration_utils.py:733 >> loading configuration file config.json from cache (third load; LlamaConfig dump identical to the one above)
|
|
|
|
|
[INFO|2024-12-11 15:38:07] modeling_utils.py:3644 >> loading weights file model.safetensors from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/model.safetensors

[INFO|2024-12-11 15:38:07] modeling_utils.py:1572 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.

[INFO|2024-12-11 15:38:07] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}
|
|
|
|
|
[INFO|2024-12-11 15:38:09] modeling_utils.py:4473 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|2024-12-11 15:38:09] modeling_utils.py:4481 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

[INFO|2024-12-11 15:38:09] configuration_utils.py:993 >> loading configuration file generation_config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json

[INFO|2024-12-11 15:38:09] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}
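This GenerationConfig enables nucleus sampling by default. A minimal inference sketch with those exact settings, assuming the stock transformers API (prompt text is illustrative):

    # Sketch: sample with the defaults this log shows
    # (do_sample=True, temperature=0.6, top_p=0.9).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", torch_dtype=torch.bfloat16
    )
    inputs = tok("The chest X-ray shows", return_tensors="pt")
    out = model.generate(**inputs, do_sample=True, temperature=0.6,
                         top_p=0.9, max_new_tokens=64)
    print(tok.decode(out[0], skip_special_tokens=True))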
|
|
|
|
|
[INFO|2024-12-11 15:38:09] logging.py:157 >> Gradient checkpointing enabled.

[INFO|2024-12-11 15:38:09] logging.py:157 >> Using torch SDPA for faster training and inference.

[INFO|2024-12-11 15:38:09] logging.py:157 >> Upcasting trainable params to float32.

[INFO|2024-12-11 15:38:09] logging.py:157 >> Fine-tuning method: LoRA

[INFO|2024-12-11 15:38:09] logging.py:157 >> Found linear modules: k_proj,q_proj,v_proj,o_proj,up_proj,down_proj,gate_proj

[INFO|2024-12-11 15:38:09] logging.py:157 >> trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039
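The trainable count pins down the LoRA rank. With all seven projections targeted in each of the 16 layers, LoRA adds r × (d_in + d_out) parameters per module; for this architecture the per-layer sum of (d_in + d_out) is 44,032, so the total is 704,512 × r, and 11,272,192 / 704,512 = 16, i.e. rank 16. A PEFT sketch consistent with these numbers; lora_alpha and dropout are not recoverable from the log, so those values are placeholders:

    # Sketch: a LoraConfig consistent with the logged parameter counts.
    from peft import LoraConfig

    lora = LoraConfig(
        r=16,               # implied by 11,272,192 trainable params (see above)
        lora_alpha=32,      # placeholder: not recoverable from the log
        lora_dropout=0.0,   # placeholder: not recoverable from the log
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )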
|
|
|
[INFO|2024-12-11 15:38:09] trainer.py:648 >> Using auto half precision backend

[INFO|2024-12-11 15:38:10] trainer.py:2134 >> ***** Running training *****
[INFO|2024-12-11 15:38:10] trainer.py:2135 >> Num examples = 2,831
[INFO|2024-12-11 15:38:10] trainer.py:2136 >> Num Epochs = 3
[INFO|2024-12-11 15:38:10] trainer.py:2137 >> Instantaneous batch size per device = 8
[INFO|2024-12-11 15:38:10] trainer.py:2140 >> Total train batch size (w. parallel, distributed & accumulation) = 64
[INFO|2024-12-11 15:38:10] trainer.py:2141 >> Gradient Accumulation steps = 4
[INFO|2024-12-11 15:38:10] trainer.py:2142 >> Total optimization steps = 132
[INFO|2024-12-11 15:38:10] trainer.py:2143 >> Number of trainable parameters = 11,272,192
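These numbers cross-check: 8 per device × 4 accumulation × world size = 64 implies two DDP processes (each rank reports n_gpu: 1), and 2,831 examples × 3 epochs / 64 ≈ 132.7, which the trainer floors to the 132 optimization steps reported. A quick check:

    # Consistency check on the logged run parameters (values from this log).
    per_device_batch, grad_accum, total_batch = 8, 4, 64
    world_size = total_batch // (per_device_batch * grad_accum)
    print(world_size)              # 2 -> two DDP processes, one GPU each
    print(2831 * 3 / total_batch)  # ~132.7 -> floored to 132 steps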
|
|
|
[INFO|2024-12-11 15:38:17] logging.py:157 >> {'loss': 4.1191, 'learning_rate': 2.5000e-05, 'epoch': 0.11, 'throughput': 7115.35}
[INFO|2024-12-11 15:38:23] logging.py:157 >> {'loss': 3.6537, 'learning_rate': 5.0000e-05, 'epoch': 0.23, 'throughput': 7448.92}
[INFO|2024-12-11 15:38:29] logging.py:157 >> {'loss': 2.3123, 'learning_rate': 4.9793e-05, 'epoch': 0.34, 'throughput': 7576.23}
[INFO|2024-12-11 15:38:35] logging.py:157 >> {'loss': 1.7347, 'learning_rate': 4.9176e-05, 'epoch': 0.45, 'throughput': 7574.85}
[INFO|2024-12-11 15:38:41] logging.py:157 >> {'loss': 1.6167, 'learning_rate': 4.8158e-05, 'epoch': 0.56, 'throughput': 7613.08}
[INFO|2024-12-11 15:38:47] logging.py:157 >> {'loss': 1.6705, 'learning_rate': 4.6757e-05, 'epoch': 0.68, 'throughput': 7640.05}
[INFO|2024-12-11 15:38:54] logging.py:157 >> {'loss': 1.5204, 'learning_rate': 4.4996e-05, 'epoch': 0.79, 'throughput': 7649.60}
[INFO|2024-12-11 15:39:00] logging.py:157 >> {'loss': 1.4231, 'learning_rate': 4.2904e-05, 'epoch': 0.90, 'throughput': 7663.72}
[INFO|2024-12-11 15:39:06] logging.py:157 >> {'loss': 1.4200, 'learning_rate': 4.0515e-05, 'epoch': 1.02, 'throughput': 7665.22}
[INFO|2024-12-11 15:39:12] logging.py:157 >> {'loss': 1.4510, 'learning_rate': 3.7870e-05, 'epoch': 1.13, 'throughput': 7661.85}
[INFO|2024-12-11 15:39:19] logging.py:157 >> {'loss': 1.2046, 'learning_rate': 3.5011e-05, 'epoch': 1.24, 'throughput': 7631.45}
[INFO|2024-12-11 15:39:25] logging.py:157 >> {'loss': 1.3315, 'learning_rate': 3.1987e-05, 'epoch': 1.36, 'throughput': 7643.39}
[INFO|2024-12-11 15:39:30] logging.py:157 >> {'loss': 1.2132, 'learning_rate': 2.8847e-05, 'epoch': 1.47, 'throughput': 7636.84}
[INFO|2024-12-11 15:39:36] logging.py:157 >> {'loss': 1.1863, 'learning_rate': 2.5644e-05, 'epoch': 1.58, 'throughput': 7634.44}
[INFO|2024-12-11 15:39:43] logging.py:157 >> {'loss': 1.2067, 'learning_rate': 2.2429e-05, 'epoch': 1.69, 'throughput': 7631.72}
[INFO|2024-12-11 15:39:49] logging.py:157 >> {'loss': 1.1691, 'learning_rate': 1.9258e-05, 'epoch': 1.81, 'throughput': 7650.53}
[INFO|2024-12-11 15:39:55] logging.py:157 >> {'loss': 1.2593, 'learning_rate': 1.6181e-05, 'epoch': 1.92, 'throughput': 7642.82}
[INFO|2024-12-11 15:40:02] logging.py:157 >> {'loss': 1.2249, 'learning_rate': 1.3251e-05, 'epoch': 2.03, 'throughput': 7641.04}
[INFO|2024-12-11 15:40:08] logging.py:157 >> {'loss': 1.0921, 'learning_rate': 1.0514e-05, 'epoch': 2.15, 'throughput': 7645.62}
[INFO|2024-12-11 15:40:14] logging.py:157 >> {'loss': 1.1254, 'learning_rate': 8.0182e-06, 'epoch': 2.26, 'throughput': 7634.60}
[INFO|2024-12-11 15:40:20] logging.py:157 >> {'loss': 1.1332, 'learning_rate': 5.8030e-06, 'epoch': 2.37, 'throughput': 7647.37}
[INFO|2024-12-11 15:40:26] logging.py:157 >> {'loss': 1.0718, 'learning_rate': 3.9056e-06, 'epoch': 2.49, 'throughput': 7646.83}
[INFO|2024-12-11 15:40:32] logging.py:157 >> {'loss': 1.2027, 'learning_rate': 2.3574e-06, 'epoch': 2.60, 'throughput': 7660.23}
[INFO|2024-12-11 15:40:38] logging.py:157 >> {'loss': 1.1742, 'learning_rate': 1.1841e-06, 'epoch': 2.71, 'throughput': 7663.81}
[INFO|2024-12-11 15:40:44] logging.py:157 >> {'loss': 1.1628, 'learning_rate': 4.0505e-07, 'epoch': 2.82, 'throughput': 7669.28}
[INFO|2024-12-11 15:40:50] logging.py:157 >> {'loss': 1.0574, 'learning_rate': 3.3148e-08, 'epoch': 2.94, 'throughput': 7675.42}
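The learning-rate column matches linear warmup to the 5e-5 peak over the first 10 steps, then cosine decay to zero over the remaining 122 (entries are logged every 5 steps). A check under that assumption:

    # Sketch: reproduce the logged LR values under the assumed schedule
    # (10 warmup steps, cosine decay over steps 10..132, peak lr 5e-5).
    import math

    def lr(step, peak=5e-5, warmup=10, total=132):
        if step < warmup:
            return peak * step / warmup
        progress = (step - warmup) / (total - warmup)
        return 0.5 * peak * (1 + math.cos(math.pi * progress))

    print(lr(15))   # ~4.9793e-05, as logged at epoch 0.34
    print(lr(130))  # ~3.3e-08, matching the final logged value to rounding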
|
|
|
[INFO|2024-12-11 15:40:53] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/lora_v14/checkpoint-132

[INFO|2024-12-11 15:40:53] configuration_utils.py:733 >> loading configuration file config.json from cache (LlamaConfig dump identical to the one above, minus "_name_or_path")
|
|
|
|
|
[INFO|2024-12-11 15:40:53] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/checkpoint-132/tokenizer_config.json

[INFO|2024-12-11 15:40:53] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/checkpoint-132/special_tokens_map.json

[INFO|2024-12-11 15:40:55] trainer.py:2394 >>

Training completed. Do not forget to share your model on huggingface.co/models =)
|
|
|
|
|
|
|
[INFO|2024-12-11 15:40:55] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/lora_v14

[INFO|2024-12-11 15:40:55] configuration_utils.py:733 >> loading configuration file config.json from cache (LlamaConfig dump identical to the one above)

[INFO|2024-12-11 15:40:55] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/tokenizer_config.json

[INFO|2024-12-11 15:40:55] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/special_tokens_map.json
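The run saved a LoRA adapter, not merged full weights, so inference needs the base model plus the adapter. A sketch assuming the PEFT API and the output path from this log:

    # Sketch: attach the saved LoRA adapter to the base model.
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    base = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", torch_dtype=torch.bfloat16
    )
    model = PeftModel.from_pretrained(base, "saves/Llama-3.2-1B-Instruct/lora/lora_v14")
    model = model.merge_and_unload()  # optional: fold LoRA weights into the base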
|
|
|
[WARNING|2024-12-11 15:40:56] logging.py:162 >> No metric eval_loss to plot.

[WARNING|2024-12-11 15:40:56] logging.py:162 >> No metric eval_accuracy to plot.

[INFO|2024-12-11 15:40:56] trainer.py:3819 >>
***** Running Evaluation *****

[INFO|2024-12-11 15:40:56] trainer.py:3821 >> Num examples = 500

[INFO|2024-12-11 15:40:56] trainer.py:3824 >> Batch size = 8

[INFO|2024-12-11 15:40:59] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
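The two warnings fire because evaluation ran only once, after training finished, so there is no eval_loss or eval_accuracy curve to plot. A sketch of the TrainingArguments that would log eval metrics during training (values illustrative):

    # Sketch: evaluate periodically so eval_loss gets logged and plotted.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="saves/Llama-3.2-1B-Instruct/lora/lora_v14",
        eval_strategy="steps",         # run eval every eval_steps optimizer steps
        eval_steps=20,                 # illustrative value
        per_device_eval_batch_size=8,  # matches the batch size logged above
    )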
|
|
|
|