# Configurable parameters for fine-tuning
import os
# *** Dataset ***
# Base directory where the script is running
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PARENT = os.path.dirname(BASE_DIR)
# Path to the folder containing the data files, relative to the configuration file
DATA_FOLDER = 'fine_tuning_data'
# Full path to the data folder
DATA_FOLDER_PATH = os.path.join(PARENT, 'fine_tuner', DATA_FOLDER)
# Path to the dataset file (CSV format)
DATASET_FILE = os.path.join(DATA_FOLDER_PATH, 'fine_tuning_data_yolov5.csv')  # or 'fine_tuning_data_detic.csv'
# *** Fine-tuned Adapter ***
TRAINED_ADAPTER_NAME = 'fine_tuned_adapter'  # name of the fine-tuned adapter
FINE_TUNED_ADAPTER_FOLDER = 'fine_tuned_model'
FINE_TUNED_ADAPTER_PATH = os.path.join(BASE_DIR, FINE_TUNED_ADAPTER_FOLDER)
ADAPTER_SAVE_NAME = os.path.join(FINE_TUNED_ADAPTER_PATH, TRAINED_ADAPTER_NAME)
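
# Illustrative sketch (not part of the original configuration): the adapter paths above are
# typically used after training to write the PEFT adapter weights to disk. The `trainer`
# argument is a hypothetical trl/transformers trainer wrapping a PEFT model.
def save_adapter(trainer):
    os.makedirs(FINE_TUNED_ADAPTER_PATH, exist_ok=True)
    trainer.model.save_pretrained(ADAPTER_SAVE_NAME)
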
# Proportion of the dataset to include in the test split (e.g., 0.1 for 10%)
TEST_SIZE = 0.1
# Seed for random operations to ensure reproducibility
SEED = 123
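
# Illustrative sketch (assumption, not the project's actual data-loading code): the CSV
# dataset can be loaded with the Hugging Face `datasets` library and split into train/test
# portions using TEST_SIZE and SEED defined above.
def load_and_split_dataset():
    from datasets import load_dataset  # imported lazily so this config stays dependency-free
    dataset = load_dataset("csv", data_files=DATASET_FILE, split="train")
    # Returns a DatasetDict with "train" and "test" splits
    return dataset.train_test_split(test_size=TEST_SIZE, seed=SEED)
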
# *** QLoRA Configuration Parameters ***
# LoRA attention dimension: the rank of the low-rank update matrices (higher values add more trainable parameters)
LORA_R = 64
# Alpha parameter for LoRA scaling: the LoRA update is scaled by LORA_ALPHA / LORA_R
LORA_ALPHA = 32
# Dropout probability applied to the LoRA layers
LORA_DROPOUT = 0.05
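
# Illustrative sketch (assumption): how the three values above map onto a `peft.LoraConfig`.
# `target_modules` is omitted here because it depends on the base model's layer names.
def build_lora_config():
    from peft import LoraConfig  # imported lazily so this config stays dependency-free
    return LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
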
# *** TrainingArguments Configuration Parameters for the Transformers library ***
# Output directory to save model predictions and checkpoints
OUTPUT_DIR = "./TUNED_MODEL_LLAMA"
# Number of epochs to train the model
NUM_TRAIN_EPOCHS = 1
# Enable mixed-precision training using fp16 (set to True for faster training)
FP16 = True
# Enable mixed-precision training using bf16 (set to True on Ampere or newer GPUs such as the A100; do not enable together with FP16)
BF16 = False
# Batch size per GPU/device for training
PER_DEVICE_TRAIN_BATCH_SIZE = 16
# Batch size per GPU/device for evaluation
PER_DEVICE_EVAL_BATCH_SIZE = 8
# Number of update steps to accumulate gradients before performing a backward/update pass
GRADIENT_ACCUMULATION_STEPS = 1
# Enable gradient checkpointing to reduce memory usage at the cost of a slight slowdown
GRADIENT_CHECKPOINTING = True
# Maximum gradient norm for gradient clipping to prevent exploding gradients
MAX_GRAD_NORM = 0.3
# Initial learning rate for the AdamW optimizer
LEARNING_RATE = 2e-4
# Weight decay coefficient for regularization (applied to all layers except bias/LayerNorm weights)
WEIGHT_DECAY = 0.01
# Optimizer type, here using 'paged_adamw_8bit' for memory-efficient training
OPTIM = "paged_adamw_8bit"
# Learning rate scheduler type (e.g., 'linear', 'cosine', etc.)
LR_SCHEDULER_TYPE = "linear"
# Maximum number of training steps, overrides 'num_train_epochs' if set to a positive number
# Setting MAX_STEPS = -1 in the training arguments for SFTTrainer means that the number of steps is
# determined by the number of epochs, the dataset size, the effective batch size, and the number of GPUs.
# This is the default behavior when MAX_STEPS is not specified or is set to a negative value.
MAX_STEPS = -1
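# For illustration (hypothetical numbers): with 10,000 training examples, a per-device batch size
# of 16, GRADIENT_ACCUMULATION_STEPS = 1 and a single GPU, one epoch corresponds to
# ceil(10000 / (16 * 1 * 1)) = 625 optimizer update steps.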
# Ratio of the total number of training steps used for linear warmup
WARMUP_RATIO = 0.03
# Whether to group sequences of similar length into the same batch to minimize padding, saving memory and increasing speed
GROUP_BY_LENGTH = False
# Save a model checkpoint every X update steps
SAVE_STEPS = 50
# Log training information every X update steps
LOGGING_STEPS = 25
# Whether to pack multiple short examples into a single input sequence for training efficiency (SFTTrainer option)
PACKING = False
# Evaluation strategy during training ("steps", "epoch", "no")
EVALUATION_STRATEGY = "steps"
# Number of update steps between two evaluations if `evaluation_strategy="steps"`.
# Defaults to the same value as `logging_steps` if not set.
EVALUATION_STEPS = 5
# Maximum number of tokens per sample in the dataset
MAX_TOKEN_COUNT = 1024
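
# Illustrative sketch (assumption, not the project's training script): how the values above would
# typically be passed to `transformers.TrainingArguments`. Exact keyword names vary across library
# versions (e.g. `evaluation_strategy` vs `eval_strategy`); PACKING and MAX_TOKEN_COUNT are usually
# forwarded to trl's SFTTrainer/SFTConfig as `packing` and `max_seq_length`.
def build_training_arguments():
    from transformers import TrainingArguments  # imported lazily so this config stays dependency-free
    return TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        fp16=FP16,
        bf16=BF16,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        gradient_checkpointing=GRADIENT_CHECKPOINTING,
        max_grad_norm=MAX_GRAD_NORM,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        optim=OPTIM,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        max_steps=MAX_STEPS,
        warmup_ratio=WARMUP_RATIO,
        group_by_length=GROUP_BY_LENGTH,
        save_steps=SAVE_STEPS,
        logging_steps=LOGGING_STEPS,
        evaluation_strategy=EVALUATION_STRATEGY,
        eval_steps=EVALUATION_STEPS,
        seed=SEED,
    )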