Upload falcon-convert.py
Browse files- falcon-convert.py +157 -0
falcon-convert.py
ADDED
@@ -0,0 +1,157 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Convert Hugging Face falcon models to ggml format
|
2 |
+
#
|
3 |
+
# Usage:
|
4 |
+
#
|
5 |
+
# python3 falcon-convert.py 2 ~/huggingface/models/falcon-7b-instruct ./models/falcon-7b-ggmlv3-f16.bin
|
6 |
+
#
|
7 |
+
# This script is similar to "convert-pt-to-ggml.py"
|
8 |
+
#
|
9 |
+
|
10 |
+
import io
|
11 |
+
import os
|
12 |
+
import sys
|
13 |
+
import struct
|
14 |
+
import json
|
15 |
+
import code
|
16 |
+
import torch
|
17 |
+
import numpy as np
|
18 |
+
|
19 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
|
20 |
+
|
21 |
+
|
22 |
+
GGML_MEM_ALIGN = 32
|
23 |
+
|
24 |
+
|
25 |
+
# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
|
26 |
+
def bytes_to_unicode():
    """Build the reversible GPT-2 byte -> unicode-character lookup table.

    Returns a dict with exactly 256 entries mapping every byte value (int,
    0..255) to a distinct one-character string.

    Bytes in the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character
    with the same code point. Every remaining byte (whitespace, control
    characters, 0x7F..0xA0 and 0xAD) is mapped, in increasing byte order, to
    the code points 256, 257, ... so that no byte is represented by a
    whitespace/control character the BPE code cannot handle.

    Because the mapping is one-to-one it can be inverted to turn the
    tokenizer's unicode token strings back into raw bytes.
    """
    # Bytes that are represented by themselves.
    bs = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    cs = bs[:]

    # Set copy so the membership test below is O(1) instead of a list scan.
    self_mapped = set(bs)

    # Remaining bytes get fresh code points starting right after 0xFF.
    shift = 0
    for b in range(2**8):
        if b not in self_mapped:
            bs.append(b)
            cs.append(2**8 + shift)
            shift += 1

    return dict(zip(bs, (chr(code_point) for code_point in cs)))
|
46 |
+
|
47 |
+
# Command-line handling.
#
# Three positional arguments are required (num_parts, model_name, output).
# The mere presence of any further argument switches the output from
# float16 to float32.
if len(sys.argv) < 4:
    print("Usage: python3 falcon-convert.py num_parts model_name output [use-f32]")
    print(" num_parts: number of pytorch parts, use 0 if not a multipart model. example: 2")
    print(" model_name: name of the model to convert.")
    print(" output: the output file path will be written")
    print(" use-f32: if present, use float32 instead of float16")
    sys.exit(1)

# Fail with a readable message instead of a traceback on a bad part count.
try:
    num_parts = int(sys.argv[1])
except ValueError:
    print(f"Error: num_parts must be an integer, got {sys.argv[1]!r}")
    sys.exit(1)
if num_parts < 0:
    # A negative count would otherwise silently produce a file with no tensors.
    print(f"Error: num_parts must be >= 0, got {num_parts}")
    sys.exit(1)

model_name = sys.argv[2]
output = sys.argv[3]

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

# float16 is the default; a fourth argument selects float32.
ftype = 0 if len(sys.argv) > 4 else 1
|
67 |
+
|
68 |
+
# ---------------------------------------------------------------------------
# Conversion: read the Hugging Face tokenizer/config/checkpoint parts found
# under `model_name` and write a single binary file to `output`.
#
# Layout written below, in order:
#   1. 4-byte magic
#   2. seven int32 header values (see `config_values`)
#   3. one record per vocabulary entry: int32 byte length, raw bytes,
#      float32 score (always 0.0)
#   4. one record per tensor: int32 ndim, int32 name length, int32 ftype,
#      ndim int32 dimensions (reversed), name bytes, zero padding up to a
#      GGML_MEM_ALIGN boundary, raw tensor data
# ---------------------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
hparams = config.to_dict()

print("* Loading model from: ", model_name)

# config
n_vocab = hparams["vocab_size"]
n_embd = hparams["hidden_size"]
n_head = hparams["n_head"]
# Configs without an explicit n_head_kv are treated as having one kv head.
n_head_kv = hparams.get("n_head_kv", 1)
n_layer = hparams["n_layer"]
head_dim = n_embd // n_head
config_values = [
    3,  # NOTE(review): presumably the file format version - confirm with the loader
    n_vocab,
    n_embd,
    n_head,
    n_head_kv,
    n_layer,
    ftype,
]

# vocab: token id -> encoded token string, plus the table that turns the
# encoded characters back into raw bytes.
reverse_vocab = {token_id: encoded_tok for encoded_tok, token_id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# Checkpoint file names: a single file, or N numbered shards.
if num_parts == 0:
    partnames = ('pytorch_model.bin',)
else:
    partnames = tuple(
        f'pytorch_model-{n:05}-of-{num_parts:05}.bin' for n in range(1, num_parts + 1)
    )

# The context manager guarantees the output file is closed even if loading
# or converting a tensor raises part-way through.
with open(output, "wb") as fout:
    # magic: the bytes are written reversed ("tjgg") so that they read as
    # 0x67676a74 ("ggjt") when interpreted as a little-endian 32-bit integer.
    fout.write(b"ggjt"[::-1])

    # header
    fout.write(struct.pack("i" * len(config_values), *config_values))

    # vocab
    for i in range(n_vocab):
        # Raises KeyError if id `i` has no token or a token contains a
        # character outside the byte table.
        text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
        fout.write(struct.pack("i", len(text)))
        fout.write(text)
        # score
        fout.write(struct.pack('f', 0.0))

    # tensors
    for partname in partnames:
        filename = os.path.join(model_name, partname)
        print(f'\n* Loading part: {partname}')
        # NOTE(security): torch.load unpickles the checkpoint, which can run
        # arbitrary code - only convert checkpoints from a trusted source.
        model = torch.load(filename, map_location='cpu')
        for name, tensor in model.items():
            # The original query_key_value tensor contains n_head_kv "kv groups",
            # each consisting of n_head/n_head_kv query weights followed by one key
            # and one value weight (shared by all query heads in the kv group).
            # This layout makes it a big pain to work with in GGML.
            # So we rearrange them here, so that we have n_head query weights
            # followed by n_head_kv key weights followed by n_head_kv value weights,
            # in contiguous fashion.
            if "query_key_value" in name:
                qkv = tensor.view(
                    n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)

                q = qkv[:, :-2].reshape(n_head * head_dim, head_dim * n_head)
                k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
                v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)

                # Same overall shape as before, only the row order changes.
                tensor = torch.cat((q, k, v)).reshape_as(tensor)

            # Only tensors with more than one dimension are stored as
            # float16; everything else stays float32.
            ftype_cur = 1 if ftype == 1 and tensor.ndim > 1 else 0
            print(' |', name, tensor.shape, '->', tensor.dtype)

            # header: ndim, name length, data type, then dimensions in
            # reversed order, then the name itself.
            sname = name.encode('utf-8')
            fout.write(struct.pack("i" * 3, tensor.ndim, len(sname), ftype_cur))
            fout.write(struct.pack("i" * tensor.ndim, *tensor.shape[::-1]))
            fout.write(sname)

            # save to file: round the offset up to the next multiple of
            # GGML_MEM_ALIGN; seeking past the end leaves a zero-filled gap.
            aligned_pos = (fout.tell() + (GGML_MEM_ALIGN - 1)) & -GGML_MEM_ALIGN
            fout.seek(aligned_pos)
            out_dtype = torch.float16 if ftype_cur == 1 else torch.float32
            tensor.to(dtype=out_dtype).numpy().tofile(fout)

print("GGML model file saved to " + output)
print("")
|