gpt-neox frankensteined code
Is this adaptation of the source code correct?
#!/usr/bin/env python3
# Charles O. Goddard
# 7/20/2023
"""Script used to generate the base frankenmerge. Output will need fine-tuning to be useful."""
import copy

import numpy as np
import torch
from torch import Tensor, nn

import transformers
from transformers import GPTNeoXConfig
from transformers.models.gpt_neox.modeling_gpt_neox import (
    GPTNeoXForCausalLM,
    GPTNeoXLayer,
)
MODEL_NAME_13B = "./japanese-gpt-neox-3.6b"  # primary model (variable names kept from the original llama script)
MODEL_NAME_33B = "./gpt-neox-20b"  # donor
BLOCK_DIAGONAL = True
"""
If BLOCK_DIAGONAL is set to True, each merged tensor in the resulting model
forms a block-diagonal matrix, as illustrated below:

a a a 0 0
a a a 0 0
a a a 0 0
0 0 0 b b
0 0 0 b b

In this configuration, the hidden and intermediate states from the original
and donor models are completely decoupled: the hidden states corresponding to
the original model remain unchanged, and the new dimensions added from the
donor model do not depend on the original model's hidden states.

If BLOCK_DIAGONAL is set to False, the tensors instead have the following form:

a a a 0 0
a a a 0 0
a a a 0 0
b b b b b
b b b b b

In this case, the output of the newly added attention heads depends on the
hidden state values as if they were part of the donor model. The original
model's hidden states remain unchanged in either case, but here the new
features interact with the old ones, yielding features of varying usefulness.
(See the toy demonstration after merge_tensors_inplace below.)
"""
class NoInit:
    """Context manager that disables weight initialization while constructing models."""

    def __enter__(self):
        def noop(*args, **kwargs):
            pass

        (k, u, n) = (
            torch.nn.init.kaiming_uniform_,
            torch.nn.init.uniform_,
            torch.nn.init.normal_,
        )
        torch.nn.init.kaiming_uniform_ = noop
        torch.nn.init.uniform_ = noop
        torch.nn.init.normal_ = noop

        transformers.modeling_utils._init_weights = False
        self.funcs = (k, u, n)
    def __exit__(self, *args):
        (k, u, n) = self.funcs
        (
            torch.nn.init.kaiming_uniform_,
            torch.nn.init.uniform_,
            torch.nn.init.normal_,
        ) = (k, u, n)
        transformers.modeling_utils._init_weights = True
def format_kmb(n, digits=None):
    """Format a count as a human-readable k/m/b string."""
    n = int(n)
    if n < 1000:
        return str(n)
    elif n < 1000 * 1000:
        return f"{round(n / 1000, digits)}k"
    elif n < 1000 * 1000 * 1000:
        return f"{round(n / (1000 * 1000), digits)}m"
    else:
        return f"{round(n / (1000 * 1000 * 1000), digits)}b"
def convert_size(size, unit="B"):
    """Convert a size in bytes to the given unit."""
    units = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB")
    size = int(size)
    i = units.index(unit.upper())
    size = round(size / 1024**i, 2)
    return f"{size} {units[i]}"
def count_params(model):
    """Count the trainable parameters of a model."""
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum(np.prod(p.size()) for p in model_parameters)
    return int(params)
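# Quick illustrative checks of the helpers above (my addition, not in the
# original script):
assert format_kmb(1_500, 1) == "1.5k"
assert format_kmb(20_000_000_000) == "20b"
assert convert_size(1536, "KB") == "1.5 KB"
assert count_params(nn.Linear(10, 10)) == 110  # 100 weights + 10 biases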
torch.set_default_dtype(torch.float16)

config_13b: GPTNeoXConfig = GPTNeoXConfig.from_pretrained(MODEL_NAME_13B)
config_33b: GPTNeoXConfig = GPTNeoXConfig.from_pretrained(MODEL_NAME_33B)

# Start from the primary model's config, then widen it to the donor's dimensions.
config_more = copy.deepcopy(config_13b)
config_more.intermediate_size = config_33b.intermediate_size
config_more.hidden_size = config_33b.hidden_size
# GPTNeoXConfig has no num_key_value_heads (a llama-only field in the original script):
# config_more.num_key_value_heads = config_33b.num_key_value_heads
config_more.num_attention_heads = config_33b.num_attention_heads
print(config_more)
with NoInit():
    model = GPTNeoXForCausalLM(config_more)

print(f"{format_kmb(count_params(model), 3)} parameters")
def merge_tensors_inplace(dest: Tensor, s0: Tensor, s1: Tensor, block_diagonal: bool):
    """Write s0 into the top-left of dest and fill the remainder from s1.

    With block_diagonal=True, only the bottom-right block comes from s1;
    otherwise the extra rows are taken from s1 across all columns.
    """
    dest.zero_()
    if block_diagonal:
        dest[s0.shape[0] :, s0.shape[1] :] = s1[
            s0.shape[0] : dest.shape[0],
            s0.shape[1] : dest.shape[1],
        ]
    else:
        dest[s0.shape[0] :, :] = s1[
            s0.shape[0] : dest.shape[0],
            : dest.shape[1],
        ]
    dest[: s0.shape[0], : s0.shape[1]] = s0
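# Toy demonstration of the two merge modes (illustrative addition matching the
# diagrams in the docstring above; not part of the original script):
_d = torch.zeros(5, 5)
_a = torch.ones(3, 3)  # stands in for the primary model's weight
_b = torch.full((5, 5), 2.0)  # stands in for the donor model's weight
merge_tensors_inplace(_d, _a, _b, block_diagonal=True)
print(_d)  # 'a' block top-left, 'b' only in the bottom-right 2x2 block
merge_tensors_inplace(_d, _a, _b, block_diagonal=False)
print(_d)  # 'b' rows span all columns below the 'a' block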
with NoInit():
    donor_13b = (
        GPTNeoXForCausalLM.from_pretrained(MODEL_NAME_13B).to(torch.float16).eval()
    )
    print(donor_13b)
    donor_33b = (
        GPTNeoXForCausalLM.from_pretrained(MODEL_NAME_33B).to(torch.float16).eval()
    )
with torch.no_grad():
    for layer_idx in range(len(model.gpt_neox.layers)):
        layer: GPTNeoXLayer = model.gpt_neox.layers[layer_idx]
        l13: GPTNeoXLayer = donor_13b.gpt_neox.layers[layer_idx]
        l33: GPTNeoXLayer = donor_33b.gpt_neox.layers[layer_idx]

        # Attention QKV projection. Note that GPT-NeoX fuses Q, K, and V per
        # head in this matrix, so a plain row-wise merge mixes the two models'
        # head layouts; this may contribute to degraded output pre-fine-tuning.
        dest: nn.Linear = layer.attention.query_key_value
        s13: nn.Linear = l13.attention.query_key_value
        s33: nn.Linear = l33.attention.query_key_value
        merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)

        # Attention output projection.
        dest = layer.attention.dense
        s13 = l13.attention.dense
        s33 = l33.attention.dense
        merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)

        # MLP up projection.
        dest = layer.mlp.dense_h_to_4h
        s13 = l13.mlp.dense_h_to_4h
        s33 = l33.mlp.dense_h_to_4h
        merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)

        # MLP down projection.
        dest = layer.mlp.dense_4h_to_h
        s13 = l13.mlp.dense_4h_to_h
        s33 = l33.mlp.dense_4h_to_h
        merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)
        # Layer norms: fill the widened weight from the donor, then overwrite
        # the leading dimensions with the primary model's values.
        layer.input_layernorm.weight[:] = l33.input_layernorm.weight[
            : layer.input_layernorm.weight.shape[0]
        ]
        layer.input_layernorm.weight[
            : l13.input_layernorm.weight.shape[0]
        ] = l13.input_layernorm.weight
        layer.post_attention_layernorm.weight[:] = l33.post_attention_layernorm.weight[
            : layer.post_attention_layernorm.weight.shape[0]
        ]
        layer.post_attention_layernorm.weight[
            : l13.post_attention_layernorm.weight.shape[0]
        ] = l13.post_attention_layernorm.weight
        # Caveat: the layer norm biases, the Linear biases, and
        # model.gpt_neox.final_layer_norm are not copied anywhere in this
        # script; under NoInit they stay uninitialized, which is another
        # likely source of garbage output.
    # Have the initial output depend only on the original primary-model
    # features, so the model starts unimpaired and can learn to incorporate
    # the new features as well.
    model.embed_out.weight.zero_()
    model.embed_out.weight[
        : donor_13b.embed_out.weight.shape[0], : donor_13b.embed_out.weight.shape[1]
    ] = donor_13b.embed_out.weight
    merge_tensors_inplace(
        model.gpt_neox.embed_in.weight,
        donor_13b.gpt_neox.embed_in.weight,
        donor_33b.gpt_neox.embed_in.weight,
        BLOCK_DIAGONAL,
    )
model.save_pretrained(r"D:\llm_model\gpt-neox-20b_", safe_serialization=True)
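To sanity-check the result, here is a minimal generation smoke test (my sketch, not part of the original script; it assumes the primary model's tokenizer works for the merged vocab, that rinna's tokenizer needs use_fast=False, and the save path above; expect low-quality output before fine-tuning):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_13B, use_fast=False)
merged = GPTNeoXForCausalLM.from_pretrained(r"D:\llm_model\gpt-neox-20b_").eval()
inputs = tokenizer("Hello, world", return_tensors="pt")
with torch.no_grad():
    out = merged.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(out[0]))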
@aria256 did you try it out? Does the output model respond with garbage? It does in my case; I guess this is because there's no fine-tuning to calibrate it.