# Configurable parameters for fine-tuning
import os

# *** Dataset ***
# Base directory where the script is running
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
PARENT = os.path.dirname(BASE_DIR)

# Path to the folder containing the data files, relative to the configuration file
DATA_FOLDER = 'fine_tuning_data'

# Full path to the data folder
DATA_FOLDER_PATH = os.path.join(PARENT, 'fine_tuner', DATA_FOLDER)

# Path to the dataset file (CSV format)
DATASET_FILE = os.path.join(DATA_FOLDER_PATH, 'fine_tuning_data_yolov5.csv')  # or 'fine_tuning_data_detic.csv'

# *** Fine-tuned Adapter ***
TRAINED_ADAPTER_NAME = 'fine_tuned_adapter'  # Name of the fine-tuned adapter
FINE_TUNED_ADAPTER_FOLDER = 'fine_tuned_model'
FINE_TUNED_ADAPTER_PATH = os.path.join(BASE_DIR, FINE_TUNED_ADAPTER_FOLDER)
ADAPTER_SAVE_NAME = os.path.join(FINE_TUNED_ADAPTER_PATH, TRAINED_ADAPTER_NAME)

# Proportion of the dataset to include in the test split (e.g., 0.1 for 10%)
TEST_SIZE = 0.1

# Seed for random operations to ensure reproducibility
SEED = 123

# *** QLoRA Configuration Parameters ***
# LoRA attention dimension: rank of the low-rank update matrices added to each adapted layer
LORA_R = 64

# Alpha parameter for LoRA scaling: controls the scaling of the LoRA weights
LORA_ALPHA = 32

# Dropout probability for LoRA layers
LORA_DROPOUT = 0.05

# *** TrainingArguments Configuration Parameters for the Transformers library ***
# Output directory to save model predictions and checkpoints
OUTPUT_DIR = "./TUNED_MODEL_LLAMA"

# Number of epochs to train the model
NUM_TRAIN_EPOCHS = 1

# Enable mixed-precision training using fp16 (set to True for faster training)
FP16 = True

# Enable mixed-precision training using bf16 (set to True if using an A100 GPU)
BF16 = False

# Batch size per GPU/device for training
PER_DEVICE_TRAIN_BATCH_SIZE = 16

# Batch size per GPU/device for evaluation
PER_DEVICE_EVAL_BATCH_SIZE = 8

# Number of update steps to accumulate gradients before performing a backward/update pass
GRADIENT_ACCUMULATION_STEPS = 1

# Enable gradient checkpointing to reduce memory usage at the cost of a slight slowdown
GRADIENT_CHECKPOINTING = True

# Maximum gradient norm for gradient clipping to prevent exploding gradients
MAX_GRAD_NORM = 0.3

# Initial learning rate for the AdamW optimizer
LEARNING_RATE = 2e-4

# Weight decay coefficient for regularization (applied to all layers except bias/LayerNorm weights)
WEIGHT_DECAY = 0.01

# Optimizer type, here 'paged_adamw_8bit' for memory-efficient training
OPTIM = "paged_adamw_8bit"

# Learning rate scheduler type (e.g., 'linear', 'cosine', etc.)
LR_SCHEDULER_TYPE = "linear"

# Maximum number of training steps; overrides 'num_train_epochs' if set to a positive number.
# Setting MAX_STEPS = -1 in the training arguments for SFTTrainer means the number of steps is
# determined by the number of epochs, the size of the dataset, the batch size, and the number
# of GPUs. This is the default behavior when MAX_STEPS is not specified or is set to a
# negative value.
MAX_STEPS = -1

# Ratio of the total number of training steps used for linear warmup
WARMUP_RATIO = 0.03

# Whether to group sequences of similar length into the same batch to save memory and increase speed
GROUP_BY_LENGTH = False

# Save a model checkpoint every X update steps
SAVE_STEPS = 50

# Log training information every X update steps
LOGGING_STEPS = 25

# Whether to pack multiple short examples into the same input sequence (SFTTrainer option)
PACKING = False

# Evaluation strategy during training ("steps", "epoch", "no")
EVALUATION_STRATEGY = "steps"

# Number of update steps between two evaluations if `evaluation_strategy="steps"`.
# Will default to the same value as `logging_steps` if not set.
EVALUATION_STEPS = 5

# Maximum number of tokens per sample in the dataset
MAX_TOKEN_COUNT = 1024
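
# --- Illustrative sketch (not referenced elsewhere in the project) ---
# A minimal example of how the dataset settings above might be consumed, assuming the CSV can
# be read directly with pandas and that scikit-learn is available. The function name
# `example_load_split` and the column handling are assumptions for illustration only, not part
# of the actual fine-tuning pipeline.
def example_load_split():
    import pandas as pd
    from sklearn.model_selection import train_test_split

    # Load the full dataset and create a reproducible train/test split
    # controlled by TEST_SIZE and SEED defined above.
    df = pd.read_csv(DATASET_FILE)
    train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=SEED)
    return train_df, test_df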
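
# --- Illustrative sketch (not referenced elsewhere in the project) ---
# A minimal example of how the QLoRA and TrainingArguments constants above would typically be
# passed to `peft.LoraConfig` and `transformers.TrainingArguments`. It assumes `peft`,
# `transformers`, and `bitsandbytes` (for the paged 8-bit optimizer) are installed; the
# `task_type` and `bias` values are assumptions for a causal-LM setup, and newer transformers
# releases rename `evaluation_strategy` to `eval_strategy`.
def example_build_training_configs():
    from peft import LoraConfig
    from transformers import TrainingArguments

    # LoRA adapter configuration built from the QLoRA parameters above.
    lora_config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",            # assumption: no bias parameters are trained
        task_type="CAUSAL_LM",  # assumption: causal language modeling
    )

    # Trainer configuration built from the TrainingArguments parameters above.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=NUM_TRAIN_EPOCHS,
        fp16=FP16,
        bf16=BF16,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        gradient_checkpointing=GRADIENT_CHECKPOINTING,
        max_grad_norm=MAX_GRAD_NORM,
        learning_rate=LEARNING_RATE,
        weight_decay=WEIGHT_DECAY,
        optim=OPTIM,
        lr_scheduler_type=LR_SCHEDULER_TYPE,
        max_steps=MAX_STEPS,
        warmup_ratio=WARMUP_RATIO,
        group_by_length=GROUP_BY_LENGTH,
        save_steps=SAVE_STEPS,
        logging_steps=LOGGING_STEPS,
        evaluation_strategy=EVALUATION_STRATEGY,
        eval_steps=EVALUATION_STEPS,
        seed=SEED,
    )
    return lora_config, training_args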