# Configurable parameters for fine-tuning
import os
# *** Dataset ***
# Base directory where the script is running
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PARENT = os.path.dirname(BASE_DIR)
# Path to the folder containing the data files, relative to the configuration file
DATA_FOLDER = 'fine_tuning_data'
# Full path to the data folder
DATA_FOLDER_PATH = os.path.join(PARENT, 'fine_tuner', DATA_FOLDER)
# Path to the dataset file (CSV format)
DATASET_FILE = os.path.join(DATA_FOLDER_PATH, 'fine_tuning_data_yolov5.csv')  # or 'fine_tuning_data_detic.csv'
# *** Fine-tuned Adapter ***
TRAINED_ADAPTER_NAME = 'fine_tuned_adapter'  # name of the fine-tuned adapter
FINE_TUNED_ADAPTER_FOLDER = 'fine_tuned_model'
FINE_TUNED_ADAPTER_PATH = os.path.join(BASE_DIR, FINE_TUNED_ADAPTER_FOLDER)
ADAPTER_SAVE_NAME = os.path.join(FINE_TUNED_ADAPTER_PATH, TRAINED_ADAPTER_NAME)
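
# Illustrative sketch (not part of the original configuration): the adapter paths above are
# typically used after training to write the PEFT adapter weights to disk. The `trainer`
# argument is a hypothetical trl/transformers trainer wrapping a PEFT model.
def save_adapter(trainer):
    os.makedirs(FINE_TUNED_ADAPTER_PATH, exist_ok=True)
    trainer.model.save_pretrained(ADAPTER_SAVE_NAME)
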
# Proportion of the dataset to include in the test split (e.g., 0.1 for 10%)
TEST_SIZE = 0.1
# Seed for random operations to ensure reproducibility
SEED = 123
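
# Illustrative sketch (assumption, not the project's actual data-loading code): the CSV
# dataset can be loaded with the Hugging Face `datasets` library and split into train/test
# portions using TEST_SIZE and SEED defined above.
def load_and_split_dataset():
    from datasets import load_dataset  # imported lazily so this config stays dependency-free
    dataset = load_dataset("csv", data_files=DATASET_FILE, split="train")
    # Returns a DatasetDict with "train" and "test" splits
    return dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)
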
# *** QLoRA Configuration Parameters ***
# LoRA attention dimension: the rank of the low-rank update matrices (higher values add more trainable parameters)
LORA_R = 64
# Alpha parameter for LoRA scaling: the LoRA update is scaled by LORA_ALPHA / LORA_R
LORA_ALPHA = 32
# Dropout probability applied to the LoRA layers
LORA_DROPOUT = 0.05
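
# Illustrative sketch (assumption): how the three values above map onto a `peft.LoraConfig`.
# `target_modules` is omitted here because it depends on the base model's layer names.
def build_lora_config():
    from peft import LoraConfig  # imported lazily so this config stays dependency-free
    return LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
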
# *** TrainingArguments Configuration Parameters for the Transformers library ***
# Output directory to save model predictions and checkpoints
OUTPUT_DIR = "./TUNED_MODEL_LLAMA"
# Number of epochs to train the model
NUM_TRAIN_EPOCHS = 1
# Enable mixed-precision training using fp16 (set to True for faster training)
FP16 = True
# Enable mixed-precision training using bf16 (set to True on Ampere or newer GPUs such as the A100; do not enable together with FP16)
BF16 = False
# Batch size per GPU/device for training
PER_DEVICE_TRAIN_BATCH_SIZE = 16
# Batch size per GPU/device for evaluation
PER_DEVICE_EVAL_BATCH_SIZE = 8
# Number of update steps to accumulate gradients before performing a backward/update pass
GRADIENT_ACCUMULATION_STEPS = 1
# Enable gradient checkpointing to reduce memory usage at the cost of a slight slowdown
GRADIENT_CHECKPOINTING = True
# Maximum gradient norm for gradient clipping to prevent exploding gradients
MAX_GRAD_NORM = 0.3
# Initial learning rate for the AdamW optimizer
LEARNING_RATE = 2e-4
# Weight decay coefficient for regularization (applied to all layers except bias/LayerNorm weights)
WEIGHT_DECAY = 0.01
# Optimizer type, here using 'paged_adamw_8bit' for memory-efficient training
OPTIM = "paged_adamw_8bit"
# Learning rate scheduler type (e.g., 'linear', 'cosine', etc.)
LR_SCHEDULER_TYPE = "linear"
# Maximum number of training steps, overrides 'num_train_epochs' if set to a positive number
# Setting MAX_STEPS = -1 in the training arguments for SFTTrainer means that the number of steps is
# determined by the number of epochs, the dataset size, the effective batch size, and the number of GPUs.
# This is the default behavior when MAX_STEPS is not specified or is set to a negative value.
MAX_STEPS = -1
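# For illustration (hypothetical numbers): with 10,000 training examples, a per-device batch size
# of 16, GRADIENT_ACCUMULATION_STEPS = 1 and a single GPU, one epoch corresponds to
# ceil(10000 / (16 * 1 * 1)) = 625 optimizer update steps.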
# Ratio of the total number of training steps used for linear warmup
WARMUP_RATIO = 0.03
# Whether to group sequences of similar length into the same batch to minimize padding, saving memory and increasing speed
GROUP_BY_LENGTH = False
# Save a model checkpoint every X update steps
SAVE_STEPS = 50
# Log training information every X update steps
LOGGING_STEPS = 25
# Whether to pack multiple short examples into a single input sequence for training efficiency (SFTTrainer option)
PACKING = False
# Evaluation strategy during training ("steps", "epoch", "no")
EVALUATION_STRATEGY = "steps"
# Number of update steps between two evaluations if `evaluation_strategy="steps"`.
# Defaults to the same value as `logging_steps` if not set.
EVALUATION_STEPS = 5
# Maximum number of tokens per sample in the dataset
MAX_TOKEN_COUNT = 1024
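
# Illustrative sketch (assumption, not the project's training script): how the values above would
# typically be passed to `transformers.TrainingArguments`. Exact keyword names vary across library
# versions (e.g. `evaluation_strategy` vs `eval_strategy`); PACKING and MAX_TOKEN_COUNT are usually
# forwarded to trl's SFTTrainer/SFTConfig as `packing` and `max_seq_length`.
def build_training_arguments():
    from transformers import TrainingArguments  # imported lazily so this config stays dependency-free
    return TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        fp16=FP16,
        bf16=BF16,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        gradient_checkpointing=GRADIENT_CHECKPOINTING,
        max_grad_norm=MAX_GRAD_NORM,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        optim=OPTIM,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        max_steps=MAX_STEPS,
        warmup_ratio=WARMUP_RATIO,
        group_by_length=GROUP_BY_LENGTH,
        save_steps=SAVE_STEPS,
        logging_steps=LOGGING_STEPS,
        evaluation_strategy=EVALUATION_STRATEGY,
        eval_steps=EVALUATION_STEPS,
        seed=SEED,
    )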