# Convert Hugging Face falcon models to ggml format
#
# Usage:
#
# python3 falcon-convert.py 2 ~/huggingface/models/falcon-7b-instruct ./models/falcon-7b-ggmlv3-f16.bin
#
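# Use 0 as num_parts if the model is stored as a single pytorch_model.bin, and pass
# any fourth argument to keep the weights in float32 instead of float16.
#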
# This script is similar to "convert-pt-to-ggml.py"
#
import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
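# tensor data in the output file is padded to this alignment (see the tensor write loop below)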
GGML_MEM_ALIGN = 32
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Returns a dict mapping utf-8 bytes to unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    This also avoids mapping to whitespace/control characters that the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))
if len(sys.argv) < 4:
    print("Usage: python3 falcon-convert.py num_parts model_name output [use-f32]")
    print(" num_parts: number of pytorch parts, use 0 if not a multipart model. example: 2")
    print(" model_name: name of the model to convert.")
    print(" output: path where the output file will be written")
    print(" use-f32: if present, use float32 instead of float16")
    sys.exit(1)
num_parts = int(sys.argv[1])
model_name = sys.argv[2]
output = sys.argv[3]
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 4:
    ftype = 0
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
hparams = config.to_dict()
print("* Loading model from: ", model_name)
#fname_out = dir_out + f"/falcon-7b-instruct-ggmlv3-{ftype_str[ftype]}.bin"
fout = open(output, "wb")
# magic: the 4-byte "ggjt" tag, reversed so it reads back as a little-endian uint32
fout.write(b"ggjt"[::-1])
# config
n_vocab = hparams["vocab_size"]
n_embd = hparams["hidden_size"]
n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
n_layer = hparams["n_layer"]
head_dim = n_embd // n_head
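# hyperparameters written to the file header, read back in this exact order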
config_values = [
    3,
    n_vocab,
    n_embd,
    n_head,
    n_head_kv,
    n_layer,
    ftype
]
fout.write(struct.pack("i" * len(config_values), *config_values))
# vocab
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}
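# each token is written as its raw byte sequence (length-prefixed), followed by a dummy 0.0 score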
for i in range(hparams["vocab_size"]):
    text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    # score
    fout.write(struct.pack('f', 0.0))
# tensor
if num_parts == 0:
    partnames = ('pytorch_model.bin',)
else:
    partnames = (f'pytorch_model-{n:05}-of-{num_parts:05}.bin' for n in range(1, num_parts + 1))
for partname in partnames:
    filename = f'{model_name}/{partname}'
    print(f'\n* Loading part: {partname}')
    model = torch.load(filename, map_location = 'cpu')
    for name in model.keys():
        # The original query_key_value tensor contains n_head_kv "kv groups",
        # each consisting of n_head/n_head_kv query weights followed by one key
        # and one value weight (shared by all query heads in the kv group).
        # This layout makes it a big pain to work with in GGML.
        # So we rearrange them here, so that we have n_head query weights
        # followed by n_head_kv key weights followed by n_head_kv value weights,
        # in contiguous fashion.
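        # For example, with n_head = 8 and n_head_kv = 2 the weight rows go from
        # [q q q q k v | q q q q k v] to [q q q q q q q q | k k | v v].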
if "query_key_value" in name:
qkv = model[name].view(
n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)
q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)
model[name] = torch.cat((q,k,v)).reshape_as(model[name])
        tensor = model[name]
        # 1-dim tensors stay in fp32; the rest are stored as fp16 when f16 output was requested
        ftype_cur = 1 if ftype == 1 and tensor.ndim > 1 else 0
        print(f' |', name, tensor.shape, '->', tensor.dtype)
        # tensor header: number of dims, name length, ftype, then the reversed shape and the name
        sname = name.encode('utf-8')
        fout.write(struct.pack("i" * 3, tensor.ndim, len(sname), ftype_cur))
        fout.write(struct.pack("i" * tensor.ndim, *tensor.shape[::-1]))
        fout.write(sname)
        # pad to a GGML_MEM_ALIGN boundary, then write the tensor data
        aligned_pos = (fout.tell() + (GGML_MEM_ALIGN - 1)) & -GGML_MEM_ALIGN
        fout.seek(aligned_pos)
        tensor.to(dtype = torch.float16 if ftype_cur == 1 else torch.float32).numpy().tofile(fout)
fout.close()
print("GGML model file saved to " + output)
print("") |