# @package __global__

defaults:
  - _self_
  - /model/lm/model_scale: base # prefer this group to set model scale instead of transformer_lm keys directly

lm_model: transformer_lm

codebooks_pattern:
  modeling: parallel

transformer_lm:
  dim: 512
  num_heads: 8
  num_layers: 8
  hidden_scale: 4
  n_q: 8 # number of streams to model
  card: 1024
  dropout: 0.
  emb_lr: null
  activation: gelu
  norm_first: false # use pre-norm instead of post-norm
  bias_ff: true # use bias for the feedforward
  bias_attn: true # use bias for the attention
  bias_proj: true # use bias for the output projections
  past_context: null
  causal: true
  custom: false # use custom MHA implementation
  memory_efficient: false # use flash attention
  attention_as_float32: false # use float32 for the attention part,
                              # recommended at the moment when memory_efficient is True.
  layer_scale: null
  positional_embedding: sin # positional embedding strategy (sin, rope, or sin_rope).
  xpos: false # apply xpos decay (rope only).
  checkpointing: none # layer checkpointing method, can be none, torch, xformers_default.
                      # torch is the slowest but uses the least memory,
                      # xformers_default is somewhere in between.
  weight_init: null # weight initialization (null, gaussian or uniform)
  depthwise_init: null # perform depthwise initialization (null, current, global)
  zero_bias_init: false # initialize bias to zero if bias is used in the linears
                        # and a weight_init method is used.
  norm: layer_norm # normalization method to use in the transformer.
  cross_attention: false
  qk_layer_norm: false
  qk_layer_norm_cross: false
  attention_dropout: null
  kv_repeat: 1
  two_step_cfg: false # whether to do true two-step CFG, which may resolve some padding issues
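
# Example override (a sketch, not part of this config): model scale is meant to be selected
# through the /model/lm/model_scale group above rather than by editing the transformer_lm
# keys directly. Assuming that group also provides a `small` option alongside `base`, and
# that a solver config such as `musicgen/musicgen_base_32khz` exists, the scale could be
# picked at launch time with a command-line override along these lines:
#   dora run solver=musicgen/musicgen_base_32khz model/lm/model_scale=small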