dataset:
  dataset: imagenet
  image_resolution: 256

stage1:                      # VQGAN tokenizer that maps images to discrete codes
  type: vqgan
  embed_dim: 256
  n_embed: 16384             # codebook size; matches stage2 vocab_size_img
  hparams:
    double_z: False
    z_channels: 256
    resolution: 256
    in_channels: 3
    out_ch: 3
    ch: 128
    ch_mult: [1, 1, 2, 2, 4] # 4 downsampling stages -> 16x16 code grid at 256px
    num_res_blocks: 2
    attn_resolutions: [16]
    pdrop: 0.0

stage2:                      # iGPT-style autoregressive transformer over the code sequence
  type: igpt
  use_cls_cond: False        # no class conditioning
  vocab_size_img: 16384
  hparams:
    embed_dim: 1536
    n_layers: 42
    n_heads: 24
    n_dense_layers: 42
    ctx_len_img: 256         # 16x16 token grid
    embd_pdrop: 0.0
    resid_pdrop: 0.0
    attn_pdrop: 0.0
    mlp_bias: True
    attn_bias: True
    gelu_use_approx: False

optimizer:
  opt_type: adamW
  base_lr: 1e-4
  weight_decay: 0.0
  betas: [0.9, 0.95]
  grad_clip_norm: 4.0

experiment:
  local_batch_size: 2        # per-device batch size
  total_batch_size: 512      # global batch size
  epochs: 8
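
# Usage sketch (illustrative only): the snippet below loads this file with PyYAML
# and checks the quantities implied by it. The file name "imagenet_256_igpt.yaml"
# and the loading code are assumptions, not this repo's actual API.
#
#   import yaml
#
#   with open("imagenet_256_igpt.yaml") as f:
#       cfg = yaml.safe_load(f)
#
#   # ch_mult has 5 entries, i.e. 4 downsampling stages, so a 256px image becomes
#   # a 16x16 grid of VQGAN codes = 256 tokens, matching stage2 ctx_len_img.
#   n_down = len(cfg["stage1"]["hparams"]["ch_mult"]) - 1
#   grid = cfg["dataset"]["image_resolution"] // (2 ** n_down)
#   assert grid * grid == cfg["stage2"]["hparams"]["ctx_len_img"]
#
#   # total vs. local batch size implies 256-way data parallelism and/or
#   # gradient accumulation.
#   factor = cfg["experiment"]["total_batch_size"] // cfg["experiment"]["local_batch_size"]
#   print(grid, factor)  # 16, 256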