"""Configuration constants: model sizes, sequence lengths, optimisation and data settings."""

import torch

# --- Image / patch geometry -------------------------------------------------
IMAGE_SIZE = 512
PATCH_SIZE = 16

# --- Model dimensions -------------------------------------------------------
HIDDEN_DIM = 256
# NOTE(review): 1536 == (IMAGE_SIZE // PATCH_SIZE) ** 2 + TEXT_LENGTH
# (1024 + 512); presumably derived that way -- confirm before changing
# IMAGE_SIZE, PATCH_SIZE or TEXT_LENGTH independently.
CONTEXT_LENGTH = 1536
TEXT_LENGTH = 512  # Max length for *target* sequence (coords)
PROMPT_LENGTH = 64  # Max length for *prompt* sequence (description) - Adjust as needed
DROPOUT = 0.1
NUM_HEADS = 8
NUM_LAYERS = 12  # Keep moderate layers
SHARED_EMBED_DIM = 256  # Dimension for contrastive space

# --- Optimisation -----------------------------------------------------------
BATCH_SIZE = 16
LEARNING_RATE = 1e-3  # Lower LR might be needed with contrastive loss
DTYPE = torch.float32  # torch.bfloat16 created some instability, why?
# NOTE(review): if gradients are accumulated over this many batches, the
# effective batch size is BATCH_SIZE * GRAD_ACCUMULATION_STEPS = 256 -- confirm
# against the training loop.
GRAD_ACCUMULATION_STEPS = 16

# --- Data / preprocessing ---------------------------------------------------
# Per-channel normalisation statistics; the values equal the widely used
# ImageNet RGB mean / std.
IMAGE_MEAN = [0.485, 0.456, 0.406]
IMAGE_STD = [0.229, 0.224, 0.225]
# Evaluated once at import time; falls back to CPU when CUDA is unavailable.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
IMAGE_LOCATION = "./images/"
# Defined exactly once (the original assigned the same value twice).
NUM_BINS = 32
MAX_POINTS = 10  # Maximum number of points per image to handle

# --- Training loop constants ------------------------------------------------
NUM_EPOCHS = 400  # desired number of epochs
LOGGING_STEPS = 1  # Log every N optimization steps
MAX_GRAD_NORM = 1.0
LAMBDA_CONTRASTIVE = 2  # Weight for contrastive loss - TUNE THIS
LAMBDA_REGRESSION = 2  # Works but noisy