#!/usr/bin/env python3
# 1/17/2024
# Charles O. Goddard
# https://huggingface.co/chargoddard/internlm2-7b-llama/raw/main/convert_weights.py
"""Convert internlm2 weights to Llama format."""
import json
import os

import einops
import tqdm
from mergekit.io import LazyTensorLoader, TensorWriter
from mergekit.common import ModelReference
from transformers import LlamaTokenizer

MODEL_IN = "raw weights"  # path to the original internlm2 checkpoint
OUT_PATH = "llamafied weights"  # where the Llama-format checkpoint is written

model_ref = ModelReference.parse(MODEL_IN)
cfg = model_ref.config(trust_remote_code=True)
head_dim = cfg.hidden_size // cfg.num_attention_heads
num_key_value_groups = cfg.num_attention_heads // cfg.num_key_value_heads

loader = LazyTensorLoader(model_ref.tensor_index(), lazy_unpickle=True)
writer = TensorWriter(OUT_PATH)

# Tensor names that map 1:1 onto Llama names; only the fused wqkv needs special handling.
SIMPLE_REPLACEMENTS = {
    "feed_forward.w1": "mlp.gate_proj",
    "feed_forward.w2": "mlp.down_proj",
    "feed_forward.w3": "mlp.up_proj",
    "attention.wo": "self_attn.o_proj",
    "ffn_norm": "post_attention_layernorm",
    "attention_norm": "input_layernorm",
    "tok_embeddings": "embed_tokens",
    "output.weight": "lm_head.weight",
}

for tensor_name in tqdm.tqdm(loader.index.tensor_paths):
    tensor = loader.get_tensor(tensor_name)
    if "attention.wqkv" in tensor_name:
        # make me think about tensor shapes will you >:(
        # ((cfg.num_attention_heads + 2 * cfg.num_key_value_heads) * head_dim, cfg.hidden_size) x (batch_sz, sq_len, cfg.hidden_size)
        # -> (batch_sz, sq_len, (cfg.num_attention_heads + 2 * cfg.num_key_value_heads) * head_dim)
        # qkv_states = rearrange(
        #     qkv_states,
        #     "b q (h gs d) -> b q h gs d",
        #     gs=2 + self.num_key_value_groups,
        #     d=self.head_dim,
        # )
        # -> (batch_sz, sq_len, h, 2 + self.num_key_value_groups, head_dim)
        qkv_vecs = einops.rearrange(
            tensor, "(h gs d) z -> h gs d z", gs=2 + num_key_value_groups, d=head_dim
        )
        # Within each key/value head group: num_key_value_groups query heads,
        # then one key head, then one value head.
        q_proj = (
            qkv_vecs[:, :num_key_value_groups, ...]
            .reshape(-1, cfg.hidden_size)
            .contiguous()
        )
        k_proj = qkv_vecs[:, -2, ...].reshape(-1, cfg.hidden_size).contiguous()
        v_proj = qkv_vecs[:, -1, ...].reshape(-1, cfg.hidden_size).contiguous()
        assert k_proj.shape == v_proj.shape

        writer.save_tensor(
            tensor_name.replace("attention.wqkv", "self_attn.q_proj"),
            q_proj,
            clone=True,
        )
        writer.save_tensor(
            tensor_name.replace("attention.wqkv", "self_attn.k_proj"),
            k_proj,
            clone=True,
        )
        writer.save_tensor(
            tensor_name.replace("attention.wqkv", "self_attn.v_proj"),
            v_proj,
            clone=True,
        )
        continue

    out_name = tensor_name
    for pattern, sub in SIMPLE_REPLACEMENTS.items():
        if pattern in out_name:
            out_name = out_name.replace(pattern, sub)
    writer.save_tensor(out_name, tensor)

writer.finalize()

# Rewrite the config so the converted checkpoint loads as a stock LlamaForCausalLM.
cfg_dict = json.loads(cfg.to_json_string())
del cfg_dict["auto_map"]
cfg_dict["architectures"] = ["LlamaForCausalLM"]
cfg_dict["model_type"] = "llama"
if "rope_scaling" in cfg_dict and cfg_dict["rope_scaling"]["factor"] == 1.0:
    del cfg_dict["rope_scaling"]
with open(os.path.join(OUT_PATH, "config.json"), "w", encoding="utf-8") as fp:
    json.dump(cfg_dict, fp, indent=2)

# InternLMTokenizer differences:
# 1. clean_up_tokenization() hardcoded to always be called
# 2. might prepend a space to some tokens that LlamaTokenizer doesn't if they're the first token
# 1 is easy to fix, 2... is not important
tok = LlamaTokenizer.from_pretrained(MODEL_IN, trust_remote_code=False, legacy=True)
tok.clean_up_tokenization_spaces = True
tok.save_pretrained(OUT_PATH)
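
# Optional sanity check: a minimal sketch, not a required part of the conversion.
# It assumes MODEL_IN / OUT_PATH above point at real checkpoints and that both
# models fit in memory; flip SANITY_CHECK to True to run it.
SANITY_CHECK = False
if SANITY_CHECK:
    import torch
    from transformers import AutoModelForCausalLM

    prompt_ids = tok("The quick brown fox", return_tensors="pt").input_ids
    with torch.no_grad():
        ref_logits = AutoModelForCausalLM.from_pretrained(
            MODEL_IN, trust_remote_code=True
        )(prompt_ids).logits
        new_logits = AutoModelForCausalLM.from_pretrained(OUT_PATH)(prompt_ids).logits
    # The conversion only renames and re-slices weights, so the two models should
    # agree up to numerical noise.
    print("max |logit delta|:", (ref_logits - new_logits).abs().max().item())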