#!/usr/bin/env python3
# 1/17/2024
# Charles O. Goddard
"""Convert internlm2 weights to Llama format."""

import json
import os

import einops
import tqdm

from mergekit.io import LazyTensorLoader, TensorWriter
from mergekit.common import ModelReference
from transformers import LlamaTokenizer

MODEL_IN = "internlm/internlm2-20b"
OUT_PATH = "./internlm2-20b-llama"

model_ref = ModelReference.parse(MODEL_IN)
cfg = model_ref.config(trust_remote_code=True)
head_dim = cfg.hidden_size // cfg.num_attention_heads
num_key_value_groups = cfg.num_attention_heads // cfg.num_key_value_heads

loader = LazyTensorLoader(model_ref.tensor_index(), lazy_unpickle=True)
writer = TensorWriter(OUT_PATH)

SIMPLE_REPLACEMENTS = {
    "feed_forward.w1": "mlp.gate_proj",
    "feed_forward.w2": "mlp.down_proj",
    "feed_forward.w3": "mlp.up_proj",
    "attention.wo": "self_attn.o_proj",
    "ffn_norm": "post_attention_layernorm",
    "attention_norm": "input_layernorm",
    "tok_embeddings": "embed_tokens",
    "output.weight": "lm_head.weight",
}

for tensor_name in tqdm.tqdm(loader.index.tensor_paths):
    tensor = loader.get_tensor(tensor_name)
    if "attention.wqkv" in tensor_name:
        # make me think about tensor shapes will you >:(
        # ((cfg.num_attention_heads + 2 * cfg.num_key_value_heads) * head_dim, cfg.hidden_size) x (batch_sz, sq_len, cfg.hidden_size)
        # -> (batch_sz, sq_len, (cfg.num_attention_heads + 2 * cfg.num_key_value_heads) * head_dim)
        # qkv_states = rearrange(
        #     qkv_states,
        #     "b q (h gs d) -> b q h gs d",
        #     gs=2 + self.num_key_value_groups,
        #     d=self.head_dim,
        # )
        # -> (batch_sz, sq_len, h, 2 + self.num_key_value_groups, head_dim)
        qkv_vecs = einops.rearrange(
            tensor, "(h gs d) z -> h gs d z", gs=2 + num_key_value_groups, d=head_dim
        )
        q_proj = (
            qkv_vecs[:, :num_key_value_groups, ...]
            .reshape(-1, cfg.hidden_size)
            .contiguous()
        )
        k_proj = qkv_vecs[:, -2, ...].reshape(-1, cfg.hidden_size).contiguous()
        v_proj = qkv_vecs[:, -1, ...].reshape(-1, cfg.hidden_size).contiguous()
        assert k_proj.shape == v_proj.shape

        writer.save_tensor(
            tensor_name.replace("attention.wqkv", "self_attn.q_proj"),
            q_proj,
            clone=True,
        )
        writer.save_tensor(
            tensor_name.replace("attention.wqkv", "self_attn.k_proj"),
            k_proj,
            clone=True,
        )
        writer.save_tensor(
            tensor_name.replace("attention.wqkv", "self_attn.v_proj"),
            v_proj,
            clone=True,
        )
        continue

    out_name = tensor_name
    for pattern, sub in SIMPLE_REPLACEMENTS.items():
        if pattern in out_name:
            out_name = out_name.replace(pattern, sub)
    writer.save_tensor(out_name, tensor)

writer.finalize()

cfg_dict = json.loads(cfg.to_json_string())
del cfg_dict["auto_map"]
cfg_dict["architectures"] = ["LlamaForCausalLM"]
cfg_dict["model_type"] = "llama"
if "rope_scaling" in cfg_dict and cfg_dict["rope_scaling"]["factor"] == 1.0:
    del cfg_dict["rope_scaling"]
with open(os.path.join(OUT_PATH, "config.json"), "w", encoding="utf-8") as fp:
    json.dump(cfg_dict, fp, indent=2)

# InternLMTokenizer differences:
# 1. clean_up_tokenization() hardcoded to always be called
# 2. might prepend a space to some tokens that LlamaTokenizer doesn't if they're the first token
# 1 is easy to fix, 2... is not important
tok = LlamaTokenizer.from_pretrained(MODEL_IN, trust_remote_code=False, legacy=True)
tok.clean_up_tokenization_spaces = True
tok.save_pretrained(OUT_PATH)
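
# The einops split of the fused wqkv weight is the only non-trivial step above,
# so here is a tiny, self-contained shape check of the same reshaping logic on
# random data. It is a sketch for illustration only and not part of the
# conversion itself; the dimensions below (4 query heads, 2 KV heads, head_dim 8)
# are made up and do not reflect the real internlm2-20b config.
import torch

_n_heads, _n_kv_heads, _head_dim = 4, 2, 8
_hidden = _n_heads * _head_dim
_groups = _n_heads // _n_kv_heads
_wqkv = torch.randn((_n_heads + 2 * _n_kv_heads) * _head_dim, _hidden)
_split = einops.rearrange(
    _wqkv, "(h gs d) z -> h gs d z", gs=2 + _groups, d=_head_dim
)
# The first num_key_value_groups slices cover every query head; the last two
# slices are the single K and V projections per KV head.
assert _split[:, :_groups, ...].reshape(-1, _hidden).shape == (_n_heads * _head_dim, _hidden)
assert _split[:, -2, ...].reshape(-1, _hidden).shape == (_n_kv_heads * _head_dim, _hidden)
assert _split[:, -1, ...].reshape(-1, _hidden).shape == (_n_kv_heads * _head_dim, _hidden)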