Phi-3-medium-128K-LoRA / running_log.txt
[WARNING|parser.py:272] 2024-07-24 15:04:58,287 >> We recommend enable `upcast_layernorm` in quantized training.
[WARNING|parser.py:292] 2024-07-24 15:04:58,287 >> `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
[INFO|parser.py:344] 2024-07-24 15:04:58,288 >> Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
07/24/2024 15:04:58 - WARNING - llamafactory.hparams.parser - We recommend enable `upcast_layernorm` in quantized training.
07/24/2024 15:04:58 - WARNING - llamafactory.hparams.parser - `ddp_find_unused_parameters` needs to be set as False for LoRA in DDP training.
07/24/2024 15:04:58 - INFO - llamafactory.hparams.parser - Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16
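Both warnings refer to LLaMA-Factory hyperparameters rather than to anything in this run's output. A minimal sketch of the two settings they point at, written as a plain Python dict (only these two keys are named in the log; treating them as overrides in a larger training config is an assumption, since the actual config file is not part of this log):

# Hypothetical excerpt of the training settings the two warnings above refer to.
recommended_overrides = {
    "upcast_layernorm": True,             # upcast LayerNorm weights to fp32 during quantized (QLoRA) training
    "ddp_find_unused_parameters": False,  # required for LoRA under DistributedDataParallel
}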
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,642 >> loading file tokenizer.model from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/tokenizer.model
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file tokenizer.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/tokenizer.json
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file added_tokens.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/added_tokens.json
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file special_tokens_map.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/special_tokens_map.json
[INFO|tokenization_utils_base.py:2289] 2024-07-24 15:05:00,643 >> loading file tokenizer_config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/tokenizer_config.json
[INFO|tokenization_utils_base.py:2533] 2024-07-24 15:05:00,693 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[INFO|loader.py:52] 2024-07-24 15:05:00,694 >> Loading dataset dataset_alpaca_IT_train_and_eval_25K.json...
07/24/2024 15:05:08 - INFO - llamafactory.data.loader - Loading dataset dataset_alpaca_IT_train_and_eval_25K.json...
[INFO|configuration_utils.py:733] 2024-07-24 15:05:11,124 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json
[INFO|configuration_utils.py:733] 2024-07-24 15:05:11,485 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json
[INFO|configuration_utils.py:800] 2024-07-24 15:05:11,488 >> Model config Phi3Config {
"_name_or_path": "microsoft/Phi-3-medium-128k-instruct",
"architectures": [
"Phi3ForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "microsoft/Phi-3-medium-128k-instruct--configuration_phi3.Phi3Config",
"AutoModelForCausalLM": "microsoft/Phi-3-medium-128k-instruct--modeling_phi3.Phi3ForCausalLM"
},
"bos_token_id": 1,
"embd_pdrop": 0.0,
"eos_token_id": 32000,
"hidden_act": "silu",
"hidden_size": 5120,
"initializer_range": 0.02,
"intermediate_size": 17920,
"max_position_embeddings": 131072,
"model_type": "phi3",
"num_attention_heads": 40,
"num_hidden_layers": 40,
"num_key_value_heads": 10,
"original_max_position_embeddings": 4096,
"pad_token_id": null,
"resid_pdrop": 0.0,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"long_factor": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.25,
1.25,
1.5,
2.0,
2.75,
5.75,
5.75,
6.5,
9.25,
11.0,
13.25,
19.25,
19.75,
19.75,
21.25,
21.5,
26.5,
30.0,
33.75,
35.25,
38.5,
42.0,
42.25,
46.0,
47.0,
50.0,
50.5,
51.0,
52.0,
52.75,
53.75,
54.75,
57.0,
57.25,
58.5,
59.25,
59.5,
62.0,
62.5,
62.75,
63.25,
63.25,
63.25,
63.75,
64.0,
64.0,
64.25,
64.5,
64.5,
65.0,
65.0
],
"short_factor": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.01,
1.02,
1.02,
1.04,
1.04,
1.07,
1.07,
1.1,
1.3000000000000003,
1.3000000000000003,
1.5000000000000004,
1.5700000000000005,
1.9000000000000008,
2.3100000000000014,
2.759999999999992,
3.3899999999999784,
3.9399999999999666,
4.009999999999965,
4.289999999999959,
4.349999999999958,
5.349999999999937,
6.659999999999909,
7.029999999999901,
7.51999999999989,
8.00999999999988,
8.249999999999876,
8.279999999999875,
9.629999999999846,
9.89999999999984,
10.589999999999826,
11.049999999999816,
11.7899999999998,
12.189999999999792,
12.889999999999777,
13.129999999999772,
13.16999999999977,
13.20999999999977,
13.479999999999764,
13.539999999999763,
13.779999999999758,
13.929999999999755,
14.429999999999744,
14.759999999999737,
15.149999999999729,
15.419999999999723,
15.53999999999972,
15.659999999999718,
15.749999999999716,
15.759999999999716,
15.799999999999715,
16.05999999999971,
16.079999999999714,
16.11999999999972,
16.11999999999972,
16.18999999999973,
16.31999999999975,
16.539999999999786,
16.799999999999827
],
"type": "su"
},
"rope_theta": 10000.0,
"sliding_window": 131072,
"tie_word_embeddings": false,
"torch_dtype": "bfloat16",
"transformers_version": "4.43.1",
"use_cache": true,
"vocab_size": 32064
}
[INFO|quantization.py:182] 2024-07-24 15:05:11,496 >> Quantizing model to 4 bit with bitsandbytes.
[INFO|modeling_utils.py:3621] 2024-07-24 15:05:12,104 >> loading weights file model.safetensors from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/model.safetensors.index.json
07/24/2024 15:05:12 - INFO - llamafactory.model.model_utils.quantization - Quantizing model to 4 bit with bitsandbytes.
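The log only states that the model is quantized to 4 bit with bitsandbytes; a minimal sketch of an equivalent Transformers call is below. The NF4 quant type and double quantization are assumptions (common QLoRA defaults), not read from the log; the compute dtype matches the bfloat16 reported above.

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit bitsandbytes quantization, as reported in the log; nf4 / double-quant are assumed defaults
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-medium-128k-instruct",
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # the config's auto_map points at remote Phi-3 code
)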
[INFO|modeling_utils.py:1569] 2024-07-24 15:10:52,981 >> Instantiating Phi3ForCausalLM model under default dtype torch.bfloat16.
[INFO|configuration_utils.py:1038] 2024-07-24 15:10:52,989 >> Generate config GenerationConfig {
"bos_token_id": 1,
"eos_token_id": 32000
}
[INFO|modeling_utils.py:4450] 2024-07-24 15:11:18,809 >> All model checkpoint weights were used when initializing Phi3ForCausalLM.
[INFO|modeling_utils.py:4458] 2024-07-24 15:11:18,810 >> All the weights of Phi3ForCausalLM were initialized from the model checkpoint at microsoft/Phi-3-medium-128k-instruct.
If your task is similar to the task the model of the checkpoint was trained on, you can already use Phi3ForCausalLM for predictions without further training.
[INFO|configuration_utils.py:993] 2024-07-24 15:11:18,895 >> loading configuration file generation_config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/generation_config.json
[INFO|configuration_utils.py:1038] 2024-07-24 15:11:18,896 >> Generate config GenerationConfig {
"bos_token_id": 1,
"eos_token_id": [
32000,
32001,
32007
],
"pad_token_id": 32000
}
07/24/2024 15:12:39 - INFO - llamafactory.model.model_utils.checkpointing - Gradient checkpointing enabled.
07/24/2024 15:12:39 - INFO - llamafactory.model.model_utils.attention - Using FlashAttention-2 for faster training and inference.
07/24/2024 15:12:39 - INFO - llamafactory.model.adapter - Upcasting trainable params to float32.
07/24/2024 15:12:39 - INFO - llamafactory.model.adapter - Fine-tuning method: LoRA
07/24/2024 15:12:39 - INFO - llamafactory.model.model_utils.misc - Found linear modules: o_proj,gate_up_proj,qkv_proj,down_proj
07/24/2024 15:12:39 - INFO - llamafactory.model.loader - trainable params: 27,852,800 || all params: 13,988,090,880 || trainable%: 0.1991
[INFO|checkpointing.py:103] 2024-07-24 15:12:42,061 >> Gradient checkpointing enabled.
[INFO|attention.py:82] 2024-07-24 15:12:42,061 >> Using FlashAttention-2 for faster training and inference.
[INFO|adapter.py:302] 2024-07-24 15:12:42,061 >> Upcasting trainable params to float32.
[INFO|adapter.py:158] 2024-07-24 15:12:42,061 >> Fine-tuning method: LoRA
[INFO|misc.py:51] 2024-07-24 15:12:42,062 >> Found linear modules: o_proj,qkv_proj,down_proj,gate_up_proj
[INFO|loader.py:196] 2024-07-24 15:12:42,467 >> trainable params: 27,852,800 || all params: 13,988,090,880 || trainable%: 0.1991
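The LoRA rank is not printed in this log, but the trainable-parameter count pins it down: with the four listed target modules and the dimensions from the Phi3Config dump above, only rank 8 yields exactly 27,852,800 trainable parameters. A minimal arithmetic check (the rank itself is inferred, not stated in the log):

# Dimensions taken from the Phi3Config dump above
hidden, inter, layers = 5120, 17920, 40
heads, kv_heads = 40, 10
head_dim = hidden // heads                      # 128
qkv_out = hidden + 2 * kv_heads * head_dim      # 7680 (fused q, k, v projection)

# (in_features, out_features) of the four LoRA target modules found in the log
targets = {
    "qkv_proj":     (hidden, qkv_out),
    "o_proj":       (hidden, hidden),
    "gate_up_proj": (hidden, 2 * inter),
    "down_proj":    (inter, hidden),
}

rank = 8  # inferred from the totals below, not stated in the log
lora_params = layers * sum(rank * (i + o) for i, o in targets.values())
assert lora_params == 27_852_800                # matches "trainable params" above

# Sanity check on the total: base weights derived from the config plus the LoRA adapters
embed = 32064 * hidden                                              # vocab_size x hidden_size
per_layer = sum(i * o for i, o in targets.values()) + 2 * hidden    # linear weights + 2 RMSNorms
base = 2 * embed + layers * per_layer + hidden                      # untied embeddings + lm_head, final norm
assert base + lora_params == 13_988_090_880     # matches "all params" above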
[INFO|trainer.py:648] 2024-07-24 15:12:42,473 >> Using auto half precision backend
[INFO|deepspeed.py:329] 2024-07-24 15:12:42,673 >> Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)
[INFO|trainer.py:2134] 2024-07-24 15:13:06,954 >> ***** Running training *****
[INFO|trainer.py:2135] 2024-07-24 15:13:06,954 >> Num examples = 4,944
[INFO|trainer.py:2136] 2024-07-24 15:13:06,954 >> Num Epochs = 3
[INFO|trainer.py:2137] 2024-07-24 15:13:06,954 >> Instantaneous batch size per device = 2
[INFO|trainer.py:2140] 2024-07-24 15:13:06,954 >> Total train batch size (w. parallel, distributed & accumulation) = 32
[INFO|trainer.py:2141] 2024-07-24 15:13:06,954 >> Gradient Accumulation steps = 8
[INFO|trainer.py:2142] 2024-07-24 15:13:06,954 >> Total optimization steps = 462
[INFO|trainer.py:2143] 2024-07-24 15:13:06,958 >> Number of trainable parameters = 27,852,800
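The batch and step figures are internally consistent: 2 sequences per device x 2 GPUs (ranks 0 and 1 above) x 8 accumulation steps gives the total train batch size of 32, and 4,944 examples over 3 epochs yields the 462 optimization steps. A quick check:

per_device, gpus, grad_accum = 2, 2, 8
total_batch = per_device * gpus * grad_accum   # 32, as logged
steps_per_epoch = 4944 // total_batch          # 154; the trailing incomplete accumulation step is not counted
assert total_batch == 32 and steps_per_epoch * 3 == 462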
[INFO|callbacks.py:310] 2024-07-24 15:21:31,280 >> {'loss': 0.5099, 'learning_rate': 1.0000e-05, 'epoch': 0.06, 'throughput': 2571.26}
[INFO|callbacks.py:310] 2024-07-24 15:30:13,855 >> {'loss': 0.5115, 'learning_rate': 2.0000e-05, 'epoch': 0.13, 'throughput': 2534.92}
[INFO|callbacks.py:310] 2024-07-24 15:38:39,051 >> {'loss': 0.4846, 'learning_rate': 3.0000e-05, 'epoch': 0.19, 'throughput': 2535.35}
[INFO|callbacks.py:310] 2024-07-24 15:47:32,541 >> {'loss': 0.4076, 'learning_rate': 4.0000e-05, 'epoch': 0.26, 'throughput': 2553.97}
[INFO|callbacks.py:310] 2024-07-24 15:56:37,450 >> {'loss': 0.3073, 'learning_rate': 5.0000e-05, 'epoch': 0.32, 'throughput': 2556.06}
[INFO|callbacks.py:310] 2024-07-24 16:04:50,564 >> {'loss': 0.2516, 'learning_rate': 4.9927e-05, 'epoch': 0.39, 'throughput': 2559.12}
[INFO|callbacks.py:310] 2024-07-24 16:13:54,185 >> {'loss': 0.2256, 'learning_rate': 4.9710e-05, 'epoch': 0.45, 'throughput': 2549.95}
[INFO|callbacks.py:310] 2024-07-24 16:22:56,812 >> {'loss': 0.2146, 'learning_rate': 4.9349e-05, 'epoch': 0.52, 'throughput': 2547.79}
[INFO|callbacks.py:310] 2024-07-24 16:31:48,796 >> {'loss': 0.2018, 'learning_rate': 4.8846e-05, 'epoch': 0.58, 'throughput': 2552.59}
[INFO|callbacks.py:310] 2024-07-24 16:40:22,764 >> {'loss': 0.1958, 'learning_rate': 4.8205e-05, 'epoch': 0.65, 'throughput': 2556.94}
[INFO|callbacks.py:310] 2024-07-24 16:48:46,503 >> {'loss': 0.1912, 'learning_rate': 4.7429e-05, 'epoch': 0.71, 'throughput': 2557.17}
[INFO|callbacks.py:310] 2024-07-24 16:57:16,752 >> {'loss': 0.1876, 'learning_rate': 4.6522e-05, 'epoch': 0.78, 'throughput': 2558.65}
[INFO|callbacks.py:310] 2024-07-24 17:05:34,669 >> {'loss': 0.1802, 'learning_rate': 4.5491e-05, 'epoch': 0.84, 'throughput': 2561.14}
[INFO|callbacks.py:310] 2024-07-24 17:14:13,797 >> {'loss': 0.1793, 'learning_rate': 4.4340e-05, 'epoch': 0.91, 'throughput': 2560.01}
[INFO|callbacks.py:310] 2024-07-24 17:22:00,855 >> {'loss': 0.1759, 'learning_rate': 4.3077e-05, 'epoch': 0.97, 'throughput': 2565.87}
[INFO|callbacks.py:310] 2024-07-24 17:30:17,877 >> {'loss': 0.1746, 'learning_rate': 4.1709e-05, 'epoch': 1.04, 'throughput': 2564.86}
[INFO|callbacks.py:310] 2024-07-24 17:38:59,853 >> {'loss': 0.1699, 'learning_rate': 4.0244e-05, 'epoch': 1.10, 'throughput': 2564.04}
[INFO|callbacks.py:310] 2024-07-24 17:48:09,599 >> {'loss': 0.1680, 'learning_rate': 3.8690e-05, 'epoch': 1.17, 'throughput': 2560.03}
[INFO|callbacks.py:310] 2024-07-24 17:56:04,957 >> {'loss': 0.1646, 'learning_rate': 3.7057e-05, 'epoch': 1.23, 'throughput': 2563.38}
[INFO|callbacks.py:310] 2024-07-24 18:04:29,792 >> {'loss': 0.1667, 'learning_rate': 3.5354e-05, 'epoch': 1.29, 'throughput': 2565.53}
[INFO|callbacks.py:310] 2024-07-24 18:13:16,644 >> {'loss': 0.1664, 'learning_rate': 3.3590e-05, 'epoch': 1.36, 'throughput': 2565.84}
[INFO|callbacks.py:310] 2024-07-24 18:21:27,030 >> {'loss': 0.1622, 'learning_rate': 3.1777e-05, 'epoch': 1.42, 'throughput': 2565.58}
[INFO|callbacks.py:310] 2024-07-24 18:30:03,234 >> {'loss': 0.1623, 'learning_rate': 2.9924e-05, 'epoch': 1.49, 'throughput': 2565.43}
[INFO|callbacks.py:310] 2024-07-24 18:38:50,374 >> {'loss': 0.1616, 'learning_rate': 2.8043e-05, 'epoch': 1.55, 'throughput': 2565.51}
[INFO|callbacks.py:310] 2024-07-24 18:47:03,606 >> {'loss': 0.1590, 'learning_rate': 2.6143e-05, 'epoch': 1.62, 'throughput': 2566.31}
[INFO|callbacks.py:310] 2024-07-24 18:55:49,268 >> {'loss': 0.1619, 'learning_rate': 2.4238e-05, 'epoch': 1.68, 'throughput': 2564.17}
[INFO|callbacks.py:310] 2024-07-24 19:04:24,775 >> {'loss': 0.1616, 'learning_rate': 2.2336e-05, 'epoch': 1.75, 'throughput': 2565.53}
[INFO|callbacks.py:310] 2024-07-24 19:12:40,325 >> {'loss': 0.1604, 'learning_rate': 2.0450e-05, 'epoch': 1.81, 'throughput': 2566.61}
[INFO|callbacks.py:310] 2024-07-24 19:21:50,945 >> {'loss': 0.1563, 'learning_rate': 1.8591e-05, 'epoch': 1.88, 'throughput': 2564.70}
[INFO|callbacks.py:310] 2024-07-24 19:30:42,384 >> {'loss': 0.1548, 'learning_rate': 1.6769e-05, 'epoch': 1.94, 'throughput': 2565.36}
[INFO|callbacks.py:310] 2024-07-24 19:39:26,495 >> {'loss': 0.1555, 'learning_rate': 1.4994e-05, 'epoch': 2.01, 'throughput': 2565.22}
[INFO|callbacks.py:310] 2024-07-24 19:48:16,049 >> {'loss': 0.1526, 'learning_rate': 1.3278e-05, 'epoch': 2.07, 'throughput': 2564.82}
[INFO|callbacks.py:310] 2024-07-24 19:56:55,676 >> {'loss': 0.1526, 'learning_rate': 1.1630e-05, 'epoch': 2.14, 'throughput': 2563.60}
[INFO|callbacks.py:310] 2024-07-24 20:05:48,055 >> {'loss': 0.1516, 'learning_rate': 1.0060e-05, 'epoch': 2.20, 'throughput': 2564.75}
[INFO|callbacks.py:310] 2024-07-24 20:14:05,975 >> {'loss': 0.1524, 'learning_rate': 8.5762e-06, 'epoch': 2.27, 'throughput': 2565.56}
[INFO|callbacks.py:310] 2024-07-24 20:23:04,597 >> {'loss': 0.1502, 'learning_rate': 7.1880e-06, 'epoch': 2.33, 'throughput': 2564.08}
[INFO|callbacks.py:310] 2024-07-24 20:31:47,755 >> {'loss': 0.1506, 'learning_rate': 5.9035e-06, 'epoch': 2.39, 'throughput': 2563.27}
[INFO|callbacks.py:310] 2024-07-24 20:40:43,735 >> {'loss': 0.1479, 'learning_rate': 4.7298e-06, 'epoch': 2.46, 'throughput': 2560.31}
[INFO|callbacks.py:310] 2024-07-24 20:49:04,924 >> {'loss': 0.1501, 'learning_rate': 3.6740e-06, 'epoch': 2.52, 'throughput': 2560.79}
[INFO|callbacks.py:310] 2024-07-24 20:57:37,960 >> {'loss': 0.1504, 'learning_rate': 2.7422e-06, 'epoch': 2.59, 'throughput': 2561.70}
[INFO|callbacks.py:310] 2024-07-24 21:05:54,158 >> {'loss': 0.1524, 'learning_rate': 1.9397e-06, 'epoch': 2.65, 'throughput': 2561.78}
[INFO|callbacks.py:310] 2024-07-24 21:14:51,826 >> {'loss': 0.1494, 'learning_rate': 1.2712e-06, 'epoch': 2.72, 'throughput': 2559.88}
[INFO|callbacks.py:310] 2024-07-24 21:23:35,877 >> {'loss': 0.1515, 'learning_rate': 7.4056e-07, 'epoch': 2.78, 'throughput': 2559.75}
[INFO|callbacks.py:310] 2024-07-24 21:32:00,699 >> {'loss': 0.1504, 'learning_rate': 3.5095e-07, 'epoch': 2.85, 'throughput': 2559.97}
[INFO|callbacks.py:310] 2024-07-24 21:40:43,444 >> {'loss': 0.1498, 'learning_rate': 1.0459e-07, 'epoch': 2.91, 'throughput': 2559.45}
[INFO|callbacks.py:310] 2024-07-24 21:48:52,834 >> {'loss': 0.1469, 'learning_rate': 2.9071e-09, 'epoch': 2.98, 'throughput': 2559.91}
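The per-step metrics above (loss falling from ~0.51 to ~0.15, learning rate warming up to 5e-5 and then decaying toward zero) can be pulled straight out of this file for plotting. A minimal sketch, assuming the log is saved locally as running_log.txt:

import ast, re
import matplotlib.pyplot as plt

# Each callbacks.py line carries a Python-dict literal with loss / learning_rate / epoch / throughput
pattern = re.compile(r"callbacks\.py.*>> (\{.*\})")
records = []
with open("running_log.txt", encoding="utf-8") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            records.append(ast.literal_eval(m.group(1)))

epochs = [r["epoch"] for r in records]
plt.plot(epochs, [r["loss"] for r in records], label="train loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")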
[INFO|trainer.py:3503] 2024-07-24 21:50:38,895 >> Saving model checkpoint to saves/Custom/lora/train_2024-07-24-15-00-21/checkpoint-462
[INFO|configuration_utils.py:733] 2024-07-24 21:50:39,083 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json
[INFO|configuration_utils.py:800] 2024-07-24 21:50:39,084 >> Model config Phi3Config { ... } (config dump identical to the one above, apart from "_name_or_path": "Phi-3-medium-128k-instruct")
[INFO|tokenization_utils_base.py:2702] 2024-07-24 21:50:39,135 >> tokenizer config file saved in saves/Custom/lora/train_2024-07-24-15-00-21/checkpoint-462/tokenizer_config.json
[INFO|tokenization_utils_base.py:2711] 2024-07-24 21:50:39,135 >> Special tokens file saved in saves/Custom/lora/train_2024-07-24-15-00-21/checkpoint-462/special_tokens_map.json
[INFO|trainer.py:2394] 2024-07-24 21:50:39,852 >>
Training completed. Do not forget to share your model on huggingface.co/models =)
[INFO|trainer.py:3503] 2024-07-24 21:50:42,012 >> Saving model checkpoint to saves/Custom/lora/train_2024-07-24-15-00-21
[INFO|configuration_utils.py:733] 2024-07-24 21:50:42,204 >> loading configuration file config.json from cache at /workspace/data/huggingface-cache/hub/models--microsoft--Phi-3-medium-128k-instruct/snapshots/cae1d42b5577398fd1be9f0746052562ae552886/config.json
[INFO|configuration_utils.py:800] 2024-07-24 21:50:42,205 >> Model config Phi3Config { ... } (config dump identical to the one above, apart from "_name_or_path": "Phi-3-medium-128k-instruct")
[INFO|tokenization_utils_base.py:2702] 2024-07-24 21:50:42,263 >> tokenizer config file saved in saves/Custom/lora/train_2024-07-24-15-00-21/tokenizer_config.json
[INFO|tokenization_utils_base.py:2711] 2024-07-24 21:50:42,264 >> Special tokens file saved in saves/Custom/lora/train_2024-07-24-15-00-21/special_tokens_map.json
[WARNING|ploting.py:89] 2024-07-24 21:50:42,678 >> No metric eval_loss to plot.
[WARNING|ploting.py:89] 2024-07-24 21:50:42,678 >> No metric eval_accuracy to plot.
[INFO|modelcard.py:449] 2024-07-24 21:50:42,679 >> Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}}
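The run leaves a LoRA adapter (not a merged model) in saves/Custom/lora/train_2024-07-24-15-00-21, with the tokenizer files saved alongside it. A minimal sketch of loading it for inference with PEFT; the base model is loaded here in bfloat16 rather than 4-bit for simplicity, and the prompt and generation settings are illustrative only:

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

adapter_dir = "saves/Custom/lora/train_2024-07-24-15-00-21"  # final save path from the log

base = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-medium-128k-instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base, adapter_dir)    # attach the trained LoRA weights
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)  # tokenizer was saved next to the adapter

inputs = tokenizer("<|user|>\nHello<|end|>\n<|assistant|>\n", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))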