"""Script used to generate the base frankenmerge. Output will need fine-tuning to be useful."""

import copy

import numpy as np
import torch
import transformers
from torch import Tensor, nn
from transformers import LlamaConfig, LlamaForCausalLM
from transformers.models.llama.modeling_llama import LlamaDecoderLayer

# Donor checkpoints: the 13B donor fills the top-left block of the merged weight
# matrices, the 30B donor supplies the widened rows/columns.
MODEL_NAME_13B = "meta-llama/Llama-2-13b-hf"
MODEL_NAME_33B = "huggyllama/llama-30b"
BLOCK_DIAGONAL = True
|
|
class NoInit:
    """Temporarily disable weight initialization so that instantiating models
    whose weights are overwritten anyway is fast."""

    def __enter__(self):
        def noop(*args, **kwargs):
            pass

        # Stash the real init functions and swap in no-ops.
        self.funcs = (
            torch.nn.init.kaiming_uniform_,
            torch.nn.init.uniform_,
            torch.nn.init.normal_,
        )
        torch.nn.init.kaiming_uniform_ = noop
        torch.nn.init.uniform_ = noop
        torch.nn.init.normal_ = noop
        transformers.modeling_utils._init_weights = False

    def __exit__(self, *args):
        # Restore the original init functions.
        (
            torch.nn.init.kaiming_uniform_,
            torch.nn.init.uniform_,
            torch.nn.init.normal_,
        ) = self.funcs
        transformers.modeling_utils._init_weights = True
|
|
def format_kmb(n, digits=None):
    """Format a count as a short human-readable string, e.g. format_kmb(1_234_567, 1) -> "1.2m"."""
    n = int(n)
    if n < 1_000:
        return str(n)
    elif n < 1_000_000:
        return f"{round(n / 1_000, digits)}k"
    elif n < 1_000_000_000:
        return f"{round(n / 1_000_000, digits)}m"
    else:
        return f"{round(n / 1_000_000_000, digits)}b"
|
|
def count_params(model):
    """Count the model's trainable parameters."""
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum(np.prod(p.size()) for p in model_parameters)
    return int(params)
|
|
torch.set_default_dtype(torch.float16)

# Start from the 13B config (layer count, vocabulary, etc.) and widen it to the
# 30B model's hidden size, MLP size, and head count.
config_13b: LlamaConfig = LlamaConfig.from_pretrained(MODEL_NAME_13B)
config_33b: LlamaConfig = LlamaConfig.from_pretrained(MODEL_NAME_33B)
config_more = copy.deepcopy(config_13b)
config_more.intermediate_size = config_33b.intermediate_size
config_more.hidden_size = config_33b.hidden_size
config_more.num_key_value_heads = config_33b.num_key_value_heads
config_more.num_attention_heads = config_33b.num_attention_heads

print(config_more)
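# Assuming the unmodified Hugging Face configs named above, this keeps the 13B
# model's 40 layers and vocabulary but adopts the 30B geometry: hidden_size 6656,
# intermediate_size 17920, and 52 attention heads.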
|
|
# Instantiate the merged skeleton; the donor weights are copied in below.
with NoInit():
    model = LlamaForCausalLM(config_more)

print(f"{format_kmb(count_params(model), 3)} parameters")
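# With the donor configs above this lands at roughly 22B parameters, hence the
# "./llama2-22b/" output path at the bottom of the script (rough figure; the
# printed count is authoritative).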
|
|
def merge_tensors_inplace(dest: Tensor, s0: Tensor, s1: Tensor, block_diagonal: bool):
    """Fill `dest` with s0 in its top-left block and values from s1 elsewhere.

    With block_diagonal=True only s1's bottom-right block is copied, so the
    off-diagonal blocks stay zero; otherwise s1 fills every row below s0 across
    the full width. The top-right block is zero in both cases.
    """
    dest.zero_()
    if block_diagonal:
        dest[s0.shape[0] :, s0.shape[1] :] = s1[
            s0.shape[0] : dest.shape[0],
            s0.shape[1] : dest.shape[1],
        ]
    else:
        dest[s0.shape[0] :, :] = s1[
            s0.shape[0] : dest.shape[0],
            : dest.shape[1],
        ]
    dest[: s0.shape[0], : s0.shape[1]] = s0
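# Hypothetical quick check of the layout above (not used by the merge itself);
# uncomment to inspect:
# _s0 = torch.ones(2, 2)
# _s1 = torch.full((4, 4), 2.0)
# _dest = torch.empty(4, 4)
# merge_tensors_inplace(_dest, _s0, _s1, block_diagonal=True)
# print(_dest)  # 1s in the top-left 2x2 block, 2s in the bottom-right, 0s elsewhere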
|
|
# Load the donors in fp16 (NoInit skips the redundant random init that happens
# before the checkpoint weights are loaded).
with NoInit():
    donor_13b = (
        LlamaForCausalLM.from_pretrained(MODEL_NAME_13B).to(torch.float16).eval()
    )
    donor_33b = (
        LlamaForCausalLM.from_pretrained(MODEL_NAME_33B).to(torch.float16).eval()
    )
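# Rough expectation, assuming fp16 weights held on CPU: both donors plus the
# merged model add up to something on the order of 130 GB of RAM, so this is
# best run on a large-memory host.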
|
|
|
with torch.no_grad():
    for layer_idx in range(len(model.model.layers)):
        layer: LlamaDecoderLayer = model.model.layers[layer_idx]
        l13: LlamaDecoderLayer = donor_13b.model.layers[layer_idx]
        l33: LlamaDecoderLayer = donor_33b.model.layers[layer_idx]

        # Attention projections: 13B weights go in the top-left block, 30B
        # weights fill the widened dimensions (see merge_tensors_inplace).
        for name in ("q_proj", "k_proj", "v_proj", "o_proj"):
            dest: nn.Linear = getattr(layer.self_attn, name)
            s13: nn.Linear = getattr(l13.self_attn, name)
            s33: nn.Linear = getattr(l33.self_attn, name)
            merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)

        # MLP projections, merged the same way.
        for name in ("up_proj", "gate_proj", "down_proj"):
            dest: nn.Linear = getattr(layer.mlp, name)
            s13: nn.Linear = getattr(l13.mlp, name)
            s33: nn.Linear = getattr(l33.mlp, name)
            merge_tensors_inplace(dest.weight, s13.weight, s33.weight, BLOCK_DIAGONAL)

        # RMSNorm scales: copy the 30B values for the widened tail of the vector,
        # then overwrite the leading entries with the 13B values.
        layer.input_layernorm.weight[:] = l33.input_layernorm.weight[
            : layer.input_layernorm.weight.shape[0]
        ]
        layer.input_layernorm.weight[
            : l13.input_layernorm.weight.shape[0]
        ] = l13.input_layernorm.weight
        layer.post_attention_layernorm.weight[:] = l33.post_attention_layernorm.weight[
            : layer.post_attention_layernorm.weight.shape[0]
        ]
        layer.post_attention_layernorm.weight[
            : l13.post_attention_layernorm.weight.shape[0]
        ] = l13.post_attention_layernorm.weight
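    # Note on BLOCK_DIAGONAL: with True, the 13B and 30B subspaces do not mix
    # within a layer at initialization (the off-diagonal blocks are zero); with
    # False, the 30B weights also project the original 13B dimensions into the
    # widened ones. Either way the merge only becomes coherent after fine-tuning.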
|
|
    # The LM head is taken from the 13B donor alone; the widened hidden
    # dimensions contribute nothing to the logits until fine-tuning.
    model.lm_head.weight.zero_()
    model.lm_head.weight[
        : donor_13b.lm_head.weight.shape[0], : donor_13b.lm_head.weight.shape[1]
    ] = donor_13b.lm_head.weight

    # Token embeddings: both donors share the same vocabulary size, so only the
    # 13B embeddings are actually copied and the widened embedding columns start
    # at zero.
    merge_tensors_inplace(
        model.model.embed_tokens.weight,
        donor_13b.model.embed_tokens.weight,
        donor_33b.model.embed_tokens.weight,
        BLOCK_DIAGONAL,
    )
|
|
|
model.save_pretrained("./llama2-22b/", safe_serialization=True) |
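# Picking the merged checkpoint back up for fine-tuning (a sketch; assumes the
# 13B donor's tokenizer, since the vocabulary is unchanged):
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_13B)
# merged = LlamaForCausalLM.from_pretrained("./llama2-22b/", torch_dtype=torch.float16)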
|
|