# Configurable parameters for fine-tuning

import os


# *** Dataset ***
# Absolute path of the directory that contains this configuration module
BASE_DIR: str = os.path.split(os.path.abspath(__file__))[0]
# Directory one level above BASE_DIR
PARENT: str = os.path.split(BASE_DIR)[0]
# Name of the folder holding the data files (resolved under <PARENT>/fine_tuner below)
DATA_FOLDER: str = 'fine_tuning_data'
# Full path to the data folder: <PARENT>/fine_tuner/<DATA_FOLDER>
DATA_FOLDER_PATH: str = os.path.join(
    PARENT,
    'fine_tuner',
    DATA_FOLDER,
)
# Full path to the dataset file (CSV format).
# Alternative dataset file name: 'fine_tuning_data_detic.csv'
DATASET_FILE: str = os.path.join(
    DATA_FOLDER_PATH,
    'fine_tuning_data_yolov5.csv',
)


# *** Fine-tuned Adapter ***
# Name of the folder (directly under BASE_DIR) that holds the fine-tuned adapter
FINE_TUNED_ADAPTER_FOLDER: str = 'fine_tuned_model'
# Name of the fine-tuned adapter
TRAINED_ADAPTER_NAME: str = 'fine_tuned_adapter'
# Full path to the adapter folder: <BASE_DIR>/<FINE_TUNED_ADAPTER_FOLDER>
FINE_TUNED_ADAPTER_PATH: str = os.path.join(BASE_DIR, FINE_TUNED_ADAPTER_FOLDER)
# Full path of the adapter itself: <FINE_TUNED_ADAPTER_PATH>/<TRAINED_ADAPTER_NAME>
ADAPTER_SAVE_NAME: str = os.path.join(
    BASE_DIR,
    FINE_TUNED_ADAPTER_FOLDER,
    TRAINED_ADAPTER_NAME,
)


# Fraction of the dataset held out for the test split (0.1 means 10%)
TEST_SIZE: float = 0.1

# Fixed seed for random operations, so that runs are reproducible
SEED: int = 123

# *** QLoRA Configuration Parameters ***
# LoRA attention dimension, i.e. the rank `r` of the low-rank update matrices.
# NOTE(review): presumably passed as `r` to the LoRA config - confirm at the call site.
LORA_R: int = 64

# Alpha parameter for LoRA scaling: controls how strongly the LoRA weights are scaled
LORA_ALPHA: int = 32

# Dropout probability applied in the LoRA layers
LORA_DROPOUT: float = 0.05



# *** TrainingArguments Configuration Parameters for the Transformers library ***
# Directory where model predictions and checkpoints are written.
# This is a relative path, so it resolves against the current working directory.
OUTPUT_DIR: str = "./TUNED_MODEL_LLAMA"

# Number of training epochs
NUM_TRAIN_EPOCHS: int = 1

# Mixed-precision training with fp16 (True for faster training)
FP16: bool = True

# Mixed-precision training with bf16 (set to True if using an A100 GPU)
BF16: bool = False

# Training batch size per GPU/device
PER_DEVICE_TRAIN_BATCH_SIZE: int = 16

# Evaluation batch size per GPU/device
PER_DEVICE_EVAL_BATCH_SIZE: int = 8

# Number of steps over which gradients are accumulated before an update is applied
GRADIENT_ACCUMULATION_STEPS: int = 1

# Gradient checkpointing: lowers memory usage at the cost of a slight slowdown
GRADIENT_CHECKPOINTING: bool = True

# Gradient-clipping threshold (maximum gradient norm), guards against exploding gradients
MAX_GRAD_NORM: float = 0.3

# Initial learning rate for the AdamW optimizer
LEARNING_RATE: float = 2e-4

# Weight decay coefficient for regularization
# (applied to all layers except bias/LayerNorm weights)
WEIGHT_DECAY: float = 0.01

# Optimizer type; 'paged_adamw_8bit' is chosen for efficient training
OPTIM: str = "paged_adamw_8bit"

# Learning rate scheduler type (e.g. 'linear', 'cosine', ...)
LR_SCHEDULER_TYPE: str = "linear"

# Maximum number of training steps; overrides 'num_train_epochs' when set to a
# positive number. With MAX_STEPS = -1 in the training arguments for SFTTrainer,
# the number of steps is instead determined by the number of epochs, the size of
# the dataset, the batch size, and the number of GPUs. This is the default
# behavior when MAX_STEPS is not specified or is set to a negative value.
MAX_STEPS: int = -1

# Fraction of the total training steps used for linear warmup
WARMUP_RATIO: float = 0.03

# Whether to batch together sequences of the same length
# (saves memory and increases speed)
GROUP_BY_LENGTH: bool = False

# Interval, in update steps, between model checkpoints
SAVE_STEPS: int = 50

# Interval, in update steps, between training log entries
LOGGING_STEPS: int = 25

# NOTE(review): presumably the SFTTrainer `packing` option (packing several
# samples into one sequence) - confirm at the call site.
PACKING: bool = False

# Evaluation strategy during training ("steps", "epoch", "no")
EVALUATION_STRATEGY = "steps"
# Backward-compatible alias: this setting was originally exposed under the
# mixed-case name below, which breaks the UPPER_SNAKE_CASE convention used by
# every other constant in this file. Existing importers keep working; new code
# should use EVALUATION_STRATEGY.
Evaluation_STRATEGY = EVALUATION_STRATEGY

# Number of update steps between two evaluations if `evaluation_strategy="steps"`.
# Will default to the same value as `logging_steps` if not set.
EVALUATION_STEPS = 5

# Upper bound on the number of tokens per dataset sample
MAX_TOKEN_COUNT: int = 1024