|
[WARNING|2024-12-11 15:38:05] logging.py:162 >> `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.

[INFO|2024-12-11 15:38:05] parser.py:355 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
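The warning above refers to a Hugging Face TrainingArguments flag: when only the LoRA adapters receive gradients, DDP's unused-parameter detection must be disabled. A minimal sketch of setting it explicitly, assuming a standard transformers setup (the actual launch script is not shown in this log):

    # Sketch: DDP + LoRA training arguments. Only LoRA params produce grads,
    # so DDP's find_unused_parameters must be False.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="saves/Llama-3.2-1B-Instruct/lora/lora_v14",  # path from this log
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        bf16=True,
        ddp_find_unused_parameters=False,  # the flag this warning is about
    )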
|
|
|
[INFO|2024-12-11 15:38:05] configuration_utils.py:733 >> loading configuration file config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/config.json

[INFO|2024-12-11 15:38:05] configuration_utils.py:800 >> Model config LlamaConfig {
  "_name_or_path": "meta-llama/Llama-3.2-1B-Instruct",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "head_dim": 64,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 16,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "factor": 32.0,
    "high_freq_factor": 4.0,
    "low_freq_factor": 1.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.43.4",
  "use_cache": true,
  "vocab_size": 128256
}
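For reference, the same dump can be reproduced outside the trainer; a minimal sketch using the model id from this log:

    # Sketch: load and inspect the same LlamaConfig from the Hub cache.
    from transformers import AutoConfig

    config = AutoConfig.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    print(config.num_hidden_layers, config.hidden_size, config.rope_scaling)
    # 16 2048 {'factor': 32.0, ..., 'rope_type': 'llama3'}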
|
|
|
|
|
[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file tokenizer.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer.json

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file added_tokens.json from cache at None

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file special_tokens_map.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/special_tokens_map.json

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file tokenizer_config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/tokenizer_config.json

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
|
|
[INFO|2024-12-11 15:38:05] configuration_utils.py:733 >> loading configuration file config.json from cache (second load; LlamaConfig dump identical to the one above)

[INFO|2024-12-11 15:38:05] tokenization_utils_base.py:2289 >> loading file tokenizer.json / added_tokens.json / special_tokens_map.json / tokenizer_config.json from cache (same paths as above)

[INFO|2024-12-11 15:38:06] tokenization_utils_base.py:2533 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
|
|
|
[INFO|2024-12-11 15:38:06] logging.py:157 >> Replace eos token: <|eot_id|>

[INFO|2024-12-11 15:38:06] logging.py:157 >> Add pad token: <|eot_id|>
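These two lines show LLaMA-Factory reusing <|eot_id|> (token id 128009) as both the effective EOS token and the padding token. A sketch of the equivalent tokenizer setup, assuming the stock transformers API:

    # Sketch: make <|eot_id|> serve as both eos and pad, as the log reports.
    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    tok.eos_token = "<|eot_id|>"
    tok.pad_token = "<|eot_id|>"
    print(tok.pad_token_id)  # 128009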
|
|
|
[INFO|2024-12-11 15:38:06] logging.py:157 >> Loading dataset radiology_sft_instruct.json... |
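The log names the dataset file but not its schema. Assuming the common LLaMA-Factory alpaca-style layout (an assumption; the actual fields are not shown here), a record would look roughly like:

    # Hypothetical record shape for radiology_sft_instruct.json
    # (alpaca-style fields assumed; field contents are illustrative).
    example = {
        "instruction": "Summarize the findings of this radiology report.",
        "input": "<report text>",
        "output": "<summary>",
    }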
|
|
|
[INFO|2024-12-11 15:38:07] configuration_utils.py:733 >> loading configuration file config.json from cache (third load; LlamaConfig dump identical to the one above)
|
|
|
|
|
[INFO|2024-12-11 15:38:07] modeling_utils.py:3644 >> loading weights file model.safetensors from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/model.safetensors

[INFO|2024-12-11 15:38:07] modeling_utils.py:1572 >> Instantiating LlamaForCausalLM model under default dtype torch.bfloat16.

[INFO|2024-12-11 15:38:07] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ]
}
|
|
|
|
|
[INFO|2024-12-11 15:38:09] modeling_utils.py:4473 >> All model checkpoint weights were used when initializing LlamaForCausalLM.

[INFO|2024-12-11 15:38:09] modeling_utils.py:4481 >> All the weights of LlamaForCausalLM were initialized from the model checkpoint at meta-llama/Llama-3.2-1B-Instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use LlamaForCausalLM for predictions without further training.

[INFO|2024-12-11 15:38:09] configuration_utils.py:993 >> loading configuration file generation_config.json from cache at /home/dj475/.cache/huggingface/hub/models--meta-llama--Llama-3.2-1B-Instruct/snapshots/9213176726f574b556790deb65791e0c5aa438b6/generation_config.json

[INFO|2024-12-11 15:38:09] configuration_utils.py:1038 >> Generate config GenerationConfig {
  "bos_token_id": 128000,
  "do_sample": true,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "temperature": 0.6,
  "top_p": 0.9
}
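This GenerationConfig enables nucleus sampling by default. A minimal inference sketch with those exact settings, assuming the stock transformers API (prompt text is illustrative):

    # Sketch: sample with the defaults this log shows
    # (do_sample=True, temperature=0.6, top_p=0.9).
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
    model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", torch_dtype=torch.bfloat16
    )
    inputs = tok("The chest X-ray shows", return_tensors="pt")
    out = model.generate(**inputs, do_sample=True, temperature=0.6,
                         top_p=0.9, max_new_tokens=64)
    print(tok.decode(out[0], skip_special_tokens=True))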
|
|
|
|
|
[INFO|2024-12-11 15:38:09] logging.py:157 >> Gradient checkpointing enabled.

[INFO|2024-12-11 15:38:09] logging.py:157 >> Using torch SDPA for faster training and inference.

[INFO|2024-12-11 15:38:09] logging.py:157 >> Upcasting trainable params to float32.

[INFO|2024-12-11 15:38:09] logging.py:157 >> Fine-tuning method: LoRA

[INFO|2024-12-11 15:38:09] logging.py:157 >> Found linear modules: k_proj,q_proj,v_proj,o_proj,up_proj,down_proj,gate_proj

[INFO|2024-12-11 15:38:09] logging.py:157 >> trainable params: 11,272,192 || all params: 1,247,086,592 || trainable%: 0.9039
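The trainable count pins down the LoRA rank. With all seven projections targeted in each of the 16 layers, LoRA adds r × (d_in + d_out) parameters per module; for this architecture the per-layer sum of (d_in + d_out) is 44,032, so the total is 704,512 × r, and 11,272,192 / 704,512 = 16, i.e. rank 16. A PEFT sketch consistent with these numbers; lora_alpha and dropout are not recoverable from the log, so those values are placeholders:

    # Sketch: a LoraConfig consistent with the logged parameter counts.
    from peft import LoraConfig

    lora = LoraConfig(
        r=16,               # implied by 11,272,192 trainable params (see above)
        lora_alpha=32,      # placeholder: not recoverable from the log
        lora_dropout=0.0,   # placeholder: not recoverable from the log
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )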
|
|
|
[INFO|2024-12-11 15:38:09] trainer.py:648 >> Using auto half precision backend

[INFO|2024-12-11 15:38:10] trainer.py:2134 >> ***** Running training *****
[INFO|2024-12-11 15:38:10] trainer.py:2135 >> Num examples = 2,831
[INFO|2024-12-11 15:38:10] trainer.py:2136 >> Num Epochs = 3
[INFO|2024-12-11 15:38:10] trainer.py:2137 >> Instantaneous batch size per device = 8
[INFO|2024-12-11 15:38:10] trainer.py:2140 >> Total train batch size (w. parallel, distributed & accumulation) = 64
[INFO|2024-12-11 15:38:10] trainer.py:2141 >> Gradient Accumulation steps = 4
[INFO|2024-12-11 15:38:10] trainer.py:2142 >> Total optimization steps = 132
[INFO|2024-12-11 15:38:10] trainer.py:2143 >> Number of trainable parameters = 11,272,192
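These numbers cross-check: 8 per device × 4 accumulation × world size = 64 implies two DDP processes (each rank reports n_gpu: 1), and 2,831 examples × 3 epochs / 64 ≈ 132.7, which the trainer floors to the 132 optimization steps reported. A quick check:

    # Consistency check on the logged run parameters (values from this log).
    per_device_batch, grad_accum, total_batch = 8, 4, 64
    world_size = total_batch // (per_device_batch * grad_accum)
    print(world_size)              # 2 -> two DDP processes, one GPU each
    print(2831 * 3 / total_batch)  # ~132.7 -> floored to 132 steps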
|
|
|
[INFO|2024-12-11 15:38:17] logging.py:157 >> {'loss': 4.1191, 'learning_rate': 2.5000e-05, 'epoch': 0.11, 'throughput': 7115.35}
[INFO|2024-12-11 15:38:23] logging.py:157 >> {'loss': 3.6537, 'learning_rate': 5.0000e-05, 'epoch': 0.23, 'throughput': 7448.92}
[INFO|2024-12-11 15:38:29] logging.py:157 >> {'loss': 2.3123, 'learning_rate': 4.9793e-05, 'epoch': 0.34, 'throughput': 7576.23}
[INFO|2024-12-11 15:38:35] logging.py:157 >> {'loss': 1.7347, 'learning_rate': 4.9176e-05, 'epoch': 0.45, 'throughput': 7574.85}
[INFO|2024-12-11 15:38:41] logging.py:157 >> {'loss': 1.6167, 'learning_rate': 4.8158e-05, 'epoch': 0.56, 'throughput': 7613.08}
[INFO|2024-12-11 15:38:47] logging.py:157 >> {'loss': 1.6705, 'learning_rate': 4.6757e-05, 'epoch': 0.68, 'throughput': 7640.05}
[INFO|2024-12-11 15:38:54] logging.py:157 >> {'loss': 1.5204, 'learning_rate': 4.4996e-05, 'epoch': 0.79, 'throughput': 7649.60}
[INFO|2024-12-11 15:39:00] logging.py:157 >> {'loss': 1.4231, 'learning_rate': 4.2904e-05, 'epoch': 0.90, 'throughput': 7663.72}
[INFO|2024-12-11 15:39:06] logging.py:157 >> {'loss': 1.4200, 'learning_rate': 4.0515e-05, 'epoch': 1.02, 'throughput': 7665.22}
[INFO|2024-12-11 15:39:12] logging.py:157 >> {'loss': 1.4510, 'learning_rate': 3.7870e-05, 'epoch': 1.13, 'throughput': 7661.85}
[INFO|2024-12-11 15:39:19] logging.py:157 >> {'loss': 1.2046, 'learning_rate': 3.5011e-05, 'epoch': 1.24, 'throughput': 7631.45}
[INFO|2024-12-11 15:39:25] logging.py:157 >> {'loss': 1.3315, 'learning_rate': 3.1987e-05, 'epoch': 1.36, 'throughput': 7643.39}
[INFO|2024-12-11 15:39:30] logging.py:157 >> {'loss': 1.2132, 'learning_rate': 2.8847e-05, 'epoch': 1.47, 'throughput': 7636.84}
[INFO|2024-12-11 15:39:36] logging.py:157 >> {'loss': 1.1863, 'learning_rate': 2.5644e-05, 'epoch': 1.58, 'throughput': 7634.44}
[INFO|2024-12-11 15:39:43] logging.py:157 >> {'loss': 1.2067, 'learning_rate': 2.2429e-05, 'epoch': 1.69, 'throughput': 7631.72}
[INFO|2024-12-11 15:39:49] logging.py:157 >> {'loss': 1.1691, 'learning_rate': 1.9258e-05, 'epoch': 1.81, 'throughput': 7650.53}
[INFO|2024-12-11 15:39:55] logging.py:157 >> {'loss': 1.2593, 'learning_rate': 1.6181e-05, 'epoch': 1.92, 'throughput': 7642.82}
[INFO|2024-12-11 15:40:02] logging.py:157 >> {'loss': 1.2249, 'learning_rate': 1.3251e-05, 'epoch': 2.03, 'throughput': 7641.04}
[INFO|2024-12-11 15:40:08] logging.py:157 >> {'loss': 1.0921, 'learning_rate': 1.0514e-05, 'epoch': 2.15, 'throughput': 7645.62}
[INFO|2024-12-11 15:40:14] logging.py:157 >> {'loss': 1.1254, 'learning_rate': 8.0182e-06, 'epoch': 2.26, 'throughput': 7634.60}
[INFO|2024-12-11 15:40:20] logging.py:157 >> {'loss': 1.1332, 'learning_rate': 5.8030e-06, 'epoch': 2.37, 'throughput': 7647.37}
[INFO|2024-12-11 15:40:26] logging.py:157 >> {'loss': 1.0718, 'learning_rate': 3.9056e-06, 'epoch': 2.49, 'throughput': 7646.83}
[INFO|2024-12-11 15:40:32] logging.py:157 >> {'loss': 1.2027, 'learning_rate': 2.3574e-06, 'epoch': 2.60, 'throughput': 7660.23}
[INFO|2024-12-11 15:40:38] logging.py:157 >> {'loss': 1.1742, 'learning_rate': 1.1841e-06, 'epoch': 2.71, 'throughput': 7663.81}
[INFO|2024-12-11 15:40:44] logging.py:157 >> {'loss': 1.1628, 'learning_rate': 4.0505e-07, 'epoch': 2.82, 'throughput': 7669.28}
[INFO|2024-12-11 15:40:50] logging.py:157 >> {'loss': 1.0574, 'learning_rate': 3.3148e-08, 'epoch': 2.94, 'throughput': 7675.42}
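The learning-rate column matches linear warmup to the 5e-5 peak over the first 10 steps, then cosine decay to zero over the remaining 122 (entries are logged every 5 steps). A check under that assumption:

    # Sketch: reproduce the logged LR values under the assumed schedule
    # (10 warmup steps, cosine decay over steps 10..132, peak lr 5e-5).
    import math

    def lr(step, peak=5e-5, warmup=10, total=132):
        if step < warmup:
            return peak * step / warmup
        progress = (step - warmup) / (total - warmup)
        return 0.5 * peak * (1 + math.cos(math.pi * progress))

    print(lr(15))   # ~4.9793e-05, as logged at epoch 0.34
    print(lr(130))  # ~3.3e-08, matching the final logged value to rounding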
|
|
|
[INFO|2024-12-11 15:40:53] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/lora_v14/checkpoint-132

[INFO|2024-12-11 15:40:53] configuration_utils.py:733 >> loading configuration file config.json from cache (LlamaConfig dump identical to the one above, minus "_name_or_path")
|
|
|
|
|
[INFO|2024-12-11 15:40:53] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/checkpoint-132/tokenizer_config.json

[INFO|2024-12-11 15:40:53] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/checkpoint-132/special_tokens_map.json

[INFO|2024-12-11 15:40:55] trainer.py:2394 >>

Training completed. Do not forget to share your model on huggingface.co/models =)
|
|
|
|
|
|
|
[INFO|2024-12-11 15:40:55] trainer.py:3503 >> Saving model checkpoint to saves/Llama-3.2-1B-Instruct/lora/lora_v14

[INFO|2024-12-11 15:40:55] configuration_utils.py:733 >> loading configuration file config.json from cache (LlamaConfig dump identical to the one above)

[INFO|2024-12-11 15:40:55] tokenization_utils_base.py:2702 >> tokenizer config file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/tokenizer_config.json

[INFO|2024-12-11 15:40:55] tokenization_utils_base.py:2711 >> Special tokens file saved in saves/Llama-3.2-1B-Instruct/lora/lora_v14/special_tokens_map.json
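The run saved a LoRA adapter, not merged full weights, so inference needs the base model plus the adapter. A sketch assuming the PEFT API and the output path from this log:

    # Sketch: attach the saved LoRA adapter to the base model.
    import torch
    from peft import PeftModel
    from transformers import AutoModelForCausalLM

    base = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B-Instruct", torch_dtype=torch.bfloat16
    )
    model = PeftModel.from_pretrained(base, "saves/Llama-3.2-1B-Instruct/lora/lora_v14")
    model = model.merge_and_unload()  # optional: fold LoRA weights into the base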
|
|
|
[WARNING|2024-12-11 15:40:56] logging.py:162 >> No metric eval_loss to plot.

[WARNING|2024-12-11 15:40:56] logging.py:162 >> No metric eval_accuracy to plot.

[INFO|2024-12-11 15:40:56] trainer.py:3819 >>
***** Running Evaluation *****

[INFO|2024-12-11 15:40:56] trainer.py:3821 >> Num examples = 500

[INFO|2024-12-11 15:40:56] trainer.py:3824 >> Batch size = 8

[INFO|2024-12-11 15:40:59] modelcard.py:449 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
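The two warnings fire because evaluation ran only once, after training finished, so there is no eval_loss or eval_accuracy curve to plot. A sketch of the TrainingArguments that would log eval metrics during training (values illustrative):

    # Sketch: evaluate periodically so eval_loss gets logged and plotted.
    from transformers import TrainingArguments

    args = TrainingArguments(
        output_dir="saves/Llama-3.2-1B-Instruct/lora/lora_v14",
        eval_strategy="steps",         # run eval every eval_steps optimizer steps
        eval_steps=20,                 # illustrative value
        per_device_eval_batch_size=8,  # matches the batch size logged above
    )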
|
|
|
|