# KB-VQA-E / my_model/config/fine_tuning_config.py
# Last change: "Update my_model/config/fine_tuning_config.py" by m7mdal7aj (commit 964dd5c, verified).
# Configurable parameters for fine-tuning
import os
# *** Dataset ***
# Absolute directory that contains this configuration file. It is derived from
# __file__, so it does not depend on the current working directory.
BASE_DIR: str = os.path.dirname(os.path.abspath(__file__))
# Directory one level above BASE_DIR.
PARENT: str = os.path.dirname(BASE_DIR)
# Name of the folder that holds the fine-tuning data files.
DATA_FOLDER: str = "fine_tuning_data"
# Full path to the data folder: <PARENT>/fine_tuner/<DATA_FOLDER>.
DATA_FOLDER_PATH: str = os.path.join(PARENT, "fine_tuner", DATA_FOLDER)
# Full path to the dataset file (CSV format).
# 'fine_tuning_data_detic.csv' is the alternative file name.
DATASET_FILE: str = os.path.join(
    PARENT, "fine_tuner", DATA_FOLDER, "fine_tuning_data_yolov5.csv"
)

# *** Fine-tuned Adapter ***
# Name under which the fine-tuned adapter is stored.
TRAINED_ADAPTER_NAME: str = "fine_tuned_adapter"
# Folder (inside BASE_DIR) that holds the fine-tuned adapter.
FINE_TUNED_ADAPTER_FOLDER: str = "fine_tuned_model"
# Full path to the adapter folder: <BASE_DIR>/<FINE_TUNED_ADAPTER_FOLDER>.
FINE_TUNED_ADAPTER_PATH: str = os.path.join(BASE_DIR, FINE_TUNED_ADAPTER_FOLDER)
# Full path of the adapter itself: <FINE_TUNED_ADAPTER_PATH>/<TRAINED_ADAPTER_NAME>.
ADAPTER_SAVE_NAME: str = os.path.join(
    BASE_DIR, FINE_TUNED_ADAPTER_FOLDER, TRAINED_ADAPTER_NAME
)
# Fraction of the dataset held out for the test split (0.1 means 10%).
TEST_SIZE: float = 0.1
# Fixed seed for random operations, so that runs are reproducible.
SEED: int = 123
# *** QLoRA Configuration Parameters ***
# LoRA attention dimension, i.e. the rank `r` of the low-rank update matrices.
LORA_R: int = 64
# LoRA alpha: the scaling parameter applied to the LoRA weights.
LORA_ALPHA: int = 32
# Dropout probability used inside the LoRA layers.
LORA_DROPOUT: float = 0.05
# *** TrainingArguments Configuration Parameters for the Transformers library ***
# Directory where model predictions and checkpoints are written.
# NOTE: this is a relative path, so it resolves against the current working directory.
OUTPUT_DIR: str = "./TUNED_MODEL_LLAMA"
# Number of training epochs.
NUM_TRAIN_EPOCHS: int = 1
# Mixed-precision training with fp16 (True gives faster training).
FP16: bool = True
# Mixed-precision training with bf16 (set to True when using an A100 GPU).
BF16: bool = False
# Training batch size per GPU/device.
PER_DEVICE_TRAIN_BATCH_SIZE: int = 16
# Evaluation batch size per GPU/device.
PER_DEVICE_EVAL_BATCH_SIZE: int = 8
# Number of update steps over which gradients are accumulated before a backward/update pass.
GRADIENT_ACCUMULATION_STEPS: int = 1
# Gradient checkpointing: lowers memory usage at the cost of a slight slowdown.
GRADIENT_CHECKPOINTING: bool = True
# Maximum gradient norm used for gradient clipping (guards against exploding gradients).
MAX_GRAD_NORM: float = 0.3
# Initial learning rate for the AdamW optimizer.
LEARNING_RATE: float = 2e-4
# Weight decay coefficient for regularization (applied to all layers except bias/LayerNorm weights).
WEIGHT_DECAY: float = 0.01
# Optimizer type; 'paged_adamw_8bit' is used here for efficient training.
OPTIM: str = "paged_adamw_8bit"
# Learning rate scheduler type (e.g. 'linear', 'cosine').
LR_SCHEDULER_TYPE: str = "linear"
# Maximum number of training steps; a positive value overrides NUM_TRAIN_EPOCHS.
# With MAX_STEPS = -1 in the training arguments for SFTTrainer, the number of steps
# is determined by the number of epochs, the size of the dataset, the batch size,
# and the number of GPUs. This is the default behavior when MAX_STEPS is not
# specified or is set to a negative value.
MAX_STEPS: int = -1
# Ratio of the total number of training steps used for linear warmup.
WARMUP_RATIO: float = 0.03
# Whether to group sequences of similar length into the same batch to save memory
# and increase speed.
GROUP_BY_LENGTH: bool = False
# Save a model checkpoint every SAVE_STEPS update steps.
SAVE_STEPS: int = 50
# Log training information every LOGGING_STEPS update steps.
LOGGING_STEPS: int = 25
# NOTE(review): presumably forwarded as SFTTrainer's `packing` option (packing several
# short examples into one sequence) — confirm against the training script.
PACKING: bool = False
# Evaluation strategy during training ("steps", "epoch", "no").
EVALUATION_STRATEGY: str = "steps"
# Backward-compatible alias: the original mixed-case name is kept so that existing
# `fine_tuning_config.Evaluation_STRATEGY` references keep working.
Evaluation_STRATEGY: str = EVALUATION_STRATEGY
# Number of update steps between two evaluations if `evaluation_strategy="steps"`.
# Will default to the same value as `logging_steps` if not set.
EVALUATION_STEPS: int = 5
# Maximum number of tokens per sample in the dataset.
MAX_TOKEN_COUNT: int = 1024