""" |
|
If you want to fine-tune, here's an example Unsloth fine tuning guide for: |
|
Alpaca + TinyLlama + RoPE Scaling full example.ipynb |
|
https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing |
|
|
|
Code here was refactored from gist: |
|
https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b |
|
|
|
Fine tuning example: |
|
https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing |
|
|
|
CodeLlama example: |
|
https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f |
|
""" |
|
|
|
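# A minimal usage sketch, assuming the ddare package is installed, a CUDA
# device is available, and you have run `huggingface-cli login` (the filename
# below is illustrative - adjust it to wherever this module is saved):
#
#   python3 run_tiny_merge.py
#
# run() at the bottom of this file merges the five TinyLlama checkpoints in
# its models list, saves and spot-checks the merged model, then pushes the
# model and tokenizer to the Hugging Face Hub.
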
import logging
import os
import re
from typing import Dict, List, Tuple

import torch
import transformers

from ddare.merge import merge_tensors
from ddare.tensor import (
    dare_ties_sparsification,
    divide_tensor_into_sets,
    relative_norm,
)
from ddare.util import get_device


logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def get_models(
    models: List[str],
    trust_remote_code: bool,
):
    """
    Load the models to merge.

    :param models: model names/paths to download
    :param trust_remote_code: are you sure??? True/False
    :return: list of loaded models
    """
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": trust_remote_code,
    }
    loaded_models = []
    num_models = len(models)
    for midx, model_path in enumerate(models):
        log.info(
            f"loading model={midx + 1}/{num_models} "
            f"model={model_path} "
        )
        loaded_models.append(
            transformers.AutoModelForCausalLM.from_pretrained(
                model_path, **config
            )
        )
    return loaded_models


def pm(
    model,
):
    """
    Pretty-print the model's state dict.

    :param model: show me the model
    """
    state_dict = model.state_dict()
    keys = state_dict.keys()
    log.info(f"model keys={len(keys)}")
    for i, k in enumerate(keys):
        tensor = state_dict[k]
        log.info(
            f"{i:3d} {k} shape={tensor.shape} "
            f"type={tensor.dtype} dev={tensor.device} "
            f"contig={tensor.is_contiguous()}"
        )


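# Example (illustrative): run_text_test below can be called on its own to
# spot-check a previously merged model without rerunning the full pipeline:
#
#   model = transformers.AutoModelForCausalLM.from_pretrained(
#       "matlok/tinyllama-cinder-openhermes-32k",
#       torch_dtype=torch.float16,
#   )
#   run_text_test(
#       model=model,
#       tokenizer_path="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#       question="why is the sky blue?",
#       device="cuda",
#   )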
def run_text_test(
    model,
    tokenizer_path: str,
    question: str,
    device: str = "cuda",
):
    """
    Run a question through the model, log the answer, and
    return the tokenizer.

    :param model: initialized model
    :param tokenizer_path: tokenizer path/name
    :param question: what are you asking?
    :param device: where do you want to run "cpu"/"cuda"?
    """
    base_model = model.to(device)
    log.info(f"loading tokenizer={tokenizer_path}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        torch_dtype=torch.float16,
    )

    inputs = tokenizer(question, return_tensors="pt").to(
        device
    )
    # prefer the flash / memory-efficient attention kernels for generation
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True,
        enable_math=False,
        enable_mem_efficient=True,
    ):
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,
        )
    answer = tokenizer.decode(
        outputs[0], skip_special_tokens=True
    )
    log.info(
        "\n"
        "----------"
        "\n"
        f"tokenizer={tokenizer}\n "
        f"question:\n{question}\n"
        f"answer:\n{answer}\n"
        "----------"
    )
    base_model = base_model.to(device)
    return tokenizer


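# get_layer_type below splits a checkpoint key into
# (block index, layer type), for example:
#
#   get_layer_type("model.layers.12.self_attn.q_proj.weight")
#       -> (12, "self_attn.q_proj.weight")
#   get_layer_type("model.embed_tokens.weight")
#       -> (-1, "embed")
#   get_layer_type("lm_head.weight")
#       -> (-1, "head")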
def get_layer_type(key: str) -> Tuple[int, str]:
    """
    Get the layer type.

    :param key: name of the layer
    :return: block index and layer type
        (-1 for keys outside the transformer blocks)
    """
    matcher = re.compile(r"model\.layers\.(\d+)\.(.+)")
    m = matcher.match(key)
    if m is None:
        if "model.norm.weight" == key:
            return -1, "norm"
        if "model.embed_tokens.weight" == key:
            return -1, "embed"
        if "lm_head.weight" == key:
            return -1, "head"
        log.info(f"Unknown key {key}")
        return -1, "unknown"
    return int(m.group(1)), m.group(2)


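# The merge below works per checkpoint key: the base tensor (from the first
# model) is divided into four sets, one per candidate model; each candidate
# is DARE-TIES sparsified against the base and slerp-merged with it, and the
# slerp result is written into that candidate's slice of the output tensor.
# Tensors whose layer type is to_q/to_k get an extra relative-norm rescaling.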
def merge_model_with_ties(
    models: List[str],
    model_dst: str,
    trust_remote_code: bool = True,
):
    """
    Merge the list of models into one model
    called model_dst.

    :param models: list of models to merge - the first entry is
        the base model and exactly five entries are expected
    :param model_dst: name of the new model
    :param trust_remote_code: are you sure? True/False
    """
    models = get_models(
        models=models,
        trust_remote_code=trust_remote_code,
    )
    config = {}
    result_dict: Dict[str, torch.Tensor] = {}
    device = get_device()
    keys = models[0].state_dict().keys()
    num_keys = len(keys)
    for k in keys:
        block, layer_type = get_layer_type(k)
        m0: torch.Tensor = models[0].state_dict()[k]
        result = m0.clone()
        # partition the base tensor into one slice per candidate model
        sets = divide_tensor_into_sets(tensor=m0, n_sets=4)

        # the four candidate tensors that get merged into the base tensor
        m = [
            models[1].state_dict()[k],
            models[2].state_dict()[k],
            models[3].state_dict()[k],
            models[4].state_dict()[k],
        ]

        # per-layer-type slerp ratio; unlisted layer types fall back to 0.5
        ratio = {
            "to_q": 0.0,
            "to_k": 0.0,
            "to_v": 0.0,
        }.get(layer_type, 0.5)

        norm_ratio = 0.68
        log.info(
            f"model={k} {num_keys} shape={m0.shape} "
            f"dtype={m0.dtype} {m0.device} "
            f"ratio={ratio} "
            f"contig={m0.is_contiguous()} "
            f"norm={norm_ratio}"
        )

        for i, tensor in enumerate(m):
            if layer_type == "to_k":
                # scale keys by the relative norm of the matching
                # query projection
                q_base = models[0].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                q_merge = models[i].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                scale = relative_norm(q_merge, q_base)
                tensor = tensor.to(device) / scale
                del scale
            elif layer_type == "to_q":
                # scale queries by their norm relative to the base tensor
                scale = relative_norm(tensor, m0)
                tensor = tensor.to(device) * scale
                del scale
            slice_mask = (sets == i).bool()
            new_tensor = dare_ties_sparsification(
                model_a_param=m0,
                model_b_param=tensor,
                drop_rate=norm_ratio,
                ties="sum",
                rescale="off",
                device=device,
                **config,
            )
            # note: this slerp merge replaces the sparsified tensor above
            new_tensor = merge_tensors(
                "slerp", m0, tensor, ratio
            )
            # only this candidate's slice of the base tensor is updated
            result = torch.where(
                slice_mask, new_tensor, result
            )
            del new_tensor, slice_mask

        result_dict[k] = result

    log.info(f"done merge - saving to file: {model_dst}")
    # load the config/architecture from model_dst (it must already resolve
    # to a checkpoint locally or on the Hugging Face Hub), swap in the
    # merged weights, and save them under model_dst
    out_model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    out_model.state_dict = lambda: result_dict
    out_model.save_pretrained(model_dst)


def run():
    """
    Run the merge and upload the model and tokenizer.

    This requires having the Hugging Face token set
    before it will work:
    ```huggingface-cli login```
    """
    question = "why is the sky blue?"
    log.info(
        f"merging models and asking the question: {question}"
    )
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
    device = "cuda"
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": True,
    }
    # the first entry is the base model; the other four are merged into it
    models = [
        model_src,
        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
        "Doctor-Shotgun/TinyLlama-1.1B-32k",
        "Tensoic/TinyLlama-1.1B-3T-openhermes",
        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
    ]
    merge_model_with_ties(
        models=models, model_dst=model_dst
    )
    log.info(f"loading newly-created file: {model_dst}")
    model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    log.info(
        f"loaded new model file: {model_dst} "
        f"asking question: {question} "
    )
    run_text_test(
        model=model,
        tokenizer_path=model_src,
        question=question,
        device=device,
    )

    # remove the local merge directory; the in-memory model is pushed below
    model_org = model_dst.split("/")[0]
    if os.path.exists(model_org):
        os.system(f"rm -rf ./{model_org}")

    log.info(f"uploading model: {model_dst}")
    model.push_to_hub(model_dst)

    log.info(f"uploading src tokenizer: {model_src}")
    # reuse the base model's tokenizer for the merged model
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_src, trust_remote_code=True
    )
    tokenizer.push_to_hub(model_dst)
    log.info(
        f"done loading new model: {model} "
        f"file: {model_dst}"
    )


if __name__ == "__main__":
    run()