name: llama
model:
  pretrained_model_name_or_path: "meta-llama/Meta-Llama-3.1-8B"
  cache_dir: "/scr-ssd/mzhang/models/llama-3_1-8b"  # Set this to where you want to save checkpoint weights
  return_dict: true
  load_in_8bit: false
  load_in_4bit: false
  device_map: auto
  low_cpu_mem_usage: true
  torch_dtype: bfloat16
  attn_implementation: eager
  rope_theta: 500000.0
  rope_scaling:
    factor: 8.0
    low_freq_factor: 1.0
    high_freq_factor: 4.0
    original_max_position_embeddings: 8192
    rope_type: llama3

attention:
  attention_type: lolcats_llama
  feature_map: softmax_dim
  feature_map_kwargs:
    eps: 1e-12
    # mlp: null  # to set
    fullspace: true
  layer_idx: null  # to set
  learned_kernel: untied_head_einsum
  learned_kernel_kwargs:
    feature_dim: 64
    skip_connection: false
    bias: false
    zero_init: false
  tie_qk_kernels: false
  train_qk: false
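As a rough illustration of how a config like this might be consumed, the sketch below loads the YAML with PyYAML and passes the `model` block to `transformers.AutoModelForCausalLM.from_pretrained`, leaving the `attention` block for whatever code performs the linear-attention conversion. This is a minimal sketch under those assumptions, not the repository's actual loader; the filename and the decision to drop `cache_dir` are illustrative.

```python
# Minimal sketch, assuming the YAML above is saved alongside this script.
import yaml
import torch
from transformers import AutoModelForCausalLM

with open("distill_llama3_1_8b.yaml") as f:  # hypothetical filename
    cfg = yaml.safe_load(f)

model_kwargs = dict(cfg["model"])
name_or_path = model_kwargs.pop("pretrained_model_name_or_path")
model_kwargs.pop("cache_dir")  # or keep it if the path exists on your machine
# Map the dtype string ("bfloat16") onto the corresponding torch.dtype
model_kwargs["torch_dtype"] = getattr(torch, model_kwargs["torch_dtype"])

# Remaining keys (device_map, attn_implementation, rope_theta, rope_scaling, ...)
# are forwarded as from_pretrained / config-override kwargs.
model = AutoModelForCausalLM.from_pretrained(name_or_path, **model_kwargs)

# The `attention` block (feature map, learned kernel, etc.) is assumed to be
# consumed separately by the repo's own attention-swapping code.
attn_cfg = cfg["attention"]
```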