File size: 5,224 Bytes
e48b15a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Convert Hugging Face falcon models to ggml format
#
# Usage:
#
#   python3 falcon-convert.py 2 ~/huggingface/models/falcon-7b-instruct ./models/falcon-7b-ggmlv3-f16.bin
#
# This script is similar to "convert-pt-to-ggml.py"
#

import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig


# Byte alignment of tensor data in the output file: before each tensor's
# payload is written, the file position is rounded up to the next multiple of
# this value. The rounding is done with a bit mask, so it must be a power of two.
GGML_MEM_ALIGN = 32


# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the reversible byte -> unicode-character table used by byte-level BPE.

    Returns a dict with one entry for every byte value 0..255. Bytes in the
    ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the same
    code point. Every other byte (whitespace, control characters, ...) is
    mapped, in ascending byte order, to the code points 256, 257, ... so that
    no byte is ever represented by a whitespace/control character.

    The dict lists the identity-mapped bytes first, followed by the shifted ones.
    """
    identity_bytes = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    table = {byte: chr(byte) for byte in identity_bytes}

    # Remaining bytes get fresh code points just above the 8-bit range.
    shifted = 0
    for byte in range(256):
        if byte in table:
            continue
        table[byte] = chr(256 + shifted)
        shifted += 1
    return table

# Command line: num_parts model_name output [use-f32]
if len(sys.argv) < 4:
    print("Usage: python3 falcon.py num_parts model_name output [use-f32]")
    print("  num_parts: number of pytorch parts, use 0 if not a multipart model. example: 2")
    print("  model_name: name of the model to convert.")
    print("  output: the output file path will be written")
    print("  use-f32:    if present, use float32 instead of float16")
    sys.exit(1)

# Reject a malformed part count up front. A non-integer would otherwise die
# with a bare traceback, and a negative value would yield an empty shard list,
# i.e. an output file that silently contains no tensors.
try:
    num_parts = int(sys.argv[1])
except ValueError:
    print(f"Error: num_parts must be an integer, got {sys.argv[1]!r}")
    sys.exit(1)
if num_parts < 0:
    print(f"Error: num_parts must be >= 0, got {num_parts}")
    sys.exit(1)
model_name = sys.argv[2]
output = sys.argv[3]

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
# float16 is the default; the mere presence of a 4th argument (whatever its
# value) switches the output to float32.
ftype = 1
if len(sys.argv) > 4:
    ftype = 0

# Tokenizer and hyperparameters are both resolved from model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
hparams = config.to_dict()

print("* Loading model from: ", model_name)

# The output file stays open until every tensor has been written.
fout = open(output, "wb")

# magic: the bytes of "ggjt" in reverse order
fout.write(bytes(reversed(b"ggjt")))

# config
n_vocab = hparams["vocab_size"]
n_embd = hparams["hidden_size"]
n_head = hparams["n_head"]
# Configs without an explicit "n_head_kv" entry are treated as having one kv head.
n_head_kv = hparams.get("n_head_kv", 1)
n_layer = hparams["n_layer"]
head_dim = n_embd // n_head

# Header fields, each written as a native int in this order.
# NOTE(review): the leading 3 is presumably the file format version - confirm
# against the loader.
header_fields = (3, n_vocab, n_embd, n_head, n_head_kv, n_layer, ftype)
fout.write(struct.pack(f"{len(header_fields)}i", *header_fields))

# vocab
# Each entry is written as: int32 byte length, the raw token bytes, float32 score.
reverse_vocab = {token_id: encoded_tok for encoded_tok, token_id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}

# Exactly hparams["vocab_size"] entries must be written, because that count is
# what the file header announces.
for i in range(hparams["vocab_size"]):
    encoded_tok = reverse_vocab.get(i)
    if encoded_tok is None:
        # The configured vocab size can exceed the tokenizer's id range
        # (e.g. padded embedding matrices); emit a unique placeholder so the
        # entry count still matches the header instead of raising KeyError.
        print(f"* Token id {i} not in tokenizer vocabulary, writing placeholder token")
        text = bytearray(f"[PAD{i}]".encode("utf-8"))
    else:
        try:
            # Undo the byte-level BPE character mapping to recover raw bytes.
            text = bytearray(byte_decoder[c] for c in encoded_tok)
        except KeyError:
            # The token contains a character outside the byte-level alphabet,
            # so it is stored as literal text (presumably an added/special
            # token); use its UTF-8 encoding.
            text = bytearray(encoded_tok.encode("utf-8"))
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    # score (always 0.0 here)
    fout.write(struct.pack('f', 0.0))

# tensor
# An unsharded checkpoint is a single "pytorch_model.bin"; otherwise the shards
# are named pytorch_model-NNNNN-of-MMMMM.bin, numbered from 1.
if num_parts == 0:
    partnames = ('pytorch_model.bin',)
else:
    partnames = (f'pytorch_model-{n:05}-of-{num_parts:05}.bin' for n in range(1, num_parts + 1))
for partname in partnames:
    filename = os.path.join(model_name, partname)
    print(f'\n* Loading part: {partname}')
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code - only convert checkpoints obtained from a trusted source.
    model = torch.load(filename, map_location='cpu')
    for name in model.keys():
        # The original query_key_value tensor contains n_head_kv "kv groups",
        # each consisting of n_head/n_head_kv query weights followed by one key
        # and one value weight (shared by all query heads in the kv group).
        # This layout makes it a big pain to work with in GGML.
        # So we rearrange them here, so that we have n_head query weights
        # followed by n_head_kv key weights followed by n_head_kv value weights,
        # in contiguous fashion.
        if "query_key_value" in name:
            qkv = model[name].view(
                n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)

            q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
            k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
            v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)

            # Same overall shape as before, only the row order changed.
            model[name] = torch.cat((q,k,v)).reshape_as(model[name])
        tensor = model[name]
        # default type is fp32; float16 is only used for tensors with more
        # than one dimension, and only when float16 output was requested
        ftype_cur = 1 if ftype == 1 and tensor.ndim > 1 else 0
        print('  |', name, tensor.shape, '->', tensor.dtype)
        # header: ndim, name length, data type, then the dimensions in
        # reversed order, then the UTF-8 encoded tensor name
        sname = name.encode('utf-8')
        fout.write(struct.pack("i" * 3, tensor.ndim, len(sname), ftype_cur))
        fout.write(struct.pack("i" * tensor.ndim, *tensor.shape[::-1]))
        fout.write(sname)

        # save to file
        # Round the current position up to the next multiple of GGML_MEM_ALIGN
        # and seek there, so the tensor data starts on an aligned offset.
        aligned_pos = (fout.tell() + (GGML_MEM_ALIGN - 1)) & -GGML_MEM_ALIGN
        fout.seek(aligned_pos)
        tensor.to(dtype = torch.float16 if ftype_cur == 1 else torch.float32).numpy().tofile(fout)

    # Drop the reference to this shard so its memory can be reclaimed before
    # the next shard is loaded; otherwise two shards are alive at once.
    del model

fout.close()

print("GGML model file saved to " + output)
print("")