name: llama
model:
  pretrained_model_name_or_path: "meta-llama/Meta-Llama-3.1-8B"
  cache_dir: "/scr-ssd/mzhang/models/llama-3_1-8b"  # Set this to where you want to save checkpoint weights
  return_dict: true
  load_in_8bit: false
  load_in_4bit: false
  device_map: auto
  low_cpu_mem_usage: true
  torch_dtype: bfloat16
  attn_implementation: eager
  rope_theta: 500000.0
  rope_scaling:
    factor: 8.0
    low_freq_factor: 1.0
    high_freq_factor: 4.0
    original_max_position_embeddings: 8192
    rope_type: llama3

attention:
  attention_type: lolcats_llama
  feature_map: softmax_dim
  feature_map_kwargs:
    eps: 1e-12
    # mlp: null  # to set
    fullspace: true
  layer_idx: null  # to set
  learned_kernel: untied_head_einsum
  learned_kernel_kwargs:
    feature_dim: 64
    skip_connection: false
    bias: false
    zero_init: false
  tie_qk_kernels: false
  train_qk: false
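As a rough illustration of how a config like this might be consumed, the sketch below loads the YAML with PyYAML and passes the `model` block to `transformers.AutoModelForCausalLM.from_pretrained`, leaving the `attention` block for whatever code performs the linear-attention conversion. This is a minimal sketch under those assumptions, not the repository's actual loader; the filename and the decision to drop `cache_dir` are illustrative.

```python
# Minimal sketch, assuming the YAML above is saved alongside this script.
import yaml
import torch
from transformers import AutoModelForCausalLM

with open("distill_llama3_1_8b.yaml") as f:  # hypothetical filename
    cfg = yaml.safe_load(f)

model_kwargs = dict(cfg["model"])
name_or_path = model_kwargs.pop("pretrained_model_name_or_path")
model_kwargs.pop("cache_dir")  # or keep it if the path exists on your machine
# Map the dtype string ("bfloat16") onto the corresponding torch.dtype
model_kwargs["torch_dtype"] = getattr(torch, model_kwargs["torch_dtype"])

# Remaining keys (device_map, attn_implementation, rope_theta, rope_scaling, ...)
# are forwarded as from_pretrained / config-override kwargs.
model = AutoModelForCausalLM.from_pretrained(name_or_path, **model_kwargs)

# The `attention` block (feature map, learned kernel, etc.) is assumed to be
# consumed separately by the repo's own attention-swapping code.
attn_cfg = cfg["attention"]
```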