{ "_name_or_path": "configs/ViTLP-1920-1600", "architectures": ["ViTLPForPreTraining"], "model_type": "ViTLP", "torch_dtype": "float32", "gradient_checkpointing": true, "init_std": 0.02, "initializer_range": 0.02, "layer_norm_eps": 1e-12, "dropout": 0.1, "hidden_dropout_prob": 0.1, "attention_dropout": 0.1, "activation_dropout": 0.1, "activation_function": "gelu", "encoder_attention_heads": 16, "encoder_ffn_dim": 4096, "encoder_hidden_size": 1024, "decoder_attention_heads": 16, "decoder_ffn_dim": 4096, "hidden_size": 1024, "encoder_layers": 18, "decoder_layers": 6, "image_height": 1920, "image_width": 1600, "patch_size": 32, "patch_num": 3000, "num_channels": 3, "resample": 2, "bin_size": 1001, "load_vit": true, "seq_length": 1280, "decoder_start_token_id": 2, "cls_token_id": 0, "pad_token_id": 1, "eos_token_id": 2, "unk_token_id": 3, "vocab_size": 50267, "load_bart": true }