{
  "model_type": "bert",  // Specify the type of model (e.g., bert, gpt, etc.)
  "vocabulary_size": 30522,  // Vocabulary size of the model
  "hidden_size": 768,  // Size of the hidden layers
  "num_attention_heads": 12,  // Number of attention heads in the model
  "num_hidden_layers": 12,  // Number of hidden layers in the model
  "intermediate_size": 3072,  // Size of the intermediate layers
  "activation_function": "gelu",  // Activation function used in the model
  "initializer_range": 0.02,  // Standard deviation of the truncated_normal_initializer
  "layer_norm_eps": 1e-12,  // Epsilon value for layer normalization
  "max_position_embeddings": 512,  // Maximum length of sequences
  "tokenizer_type": "WordPiece",  // Type of tokenizer used
  "special_tokens": {
    "pad_token": "[PAD]",  // Padding token
    "unk_token": "[UNK]",  // Unknown token
    "cls_token": "[CLS]",  // Classification token
    "sep_token": "[SEP]",  // Separator token
    "mask_token": "[MASK]"  // Masking token
  },
  "dropout_rate": 0.1,  // Dropout rate for regularization
  "learning_rate": 0.00005,  // Learning rate for training
  "optimizer": "adamw",  // Optimizer used during training
  "num_labels": 2,  // Number of labels for classification tasks
  "train_batch_size": 16,  // Batch size for training
  "eval_batch_size": 32,  // Batch size for evaluation
  "epochs": 3,  // Number of epochs for training
  "early_stopping_patience": 3  // Patience for early stopping
}