seba committed (verified)
Commit af901b1
1 Parent(s): 8c3d847

Upload 10 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+falcon_edge_3b.mlmodelc/model.mil filter=lfs diff=lfs merge=lfs -text
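
The added rule routes falcon_edge_3b.mlmodelc/model.mil through Git LFS alongside the patterns already in .gitattributes, so most of the file entries below are LFS pointers (an oid plus a byte size) rather than the file contents. A minimal sketch of materializing the actual artifacts locally with huggingface_hub; the repository id is a placeholder, not something stated in this commit:

```python
# Sketch only: download the LFS-backed files next to falcon_edge_generate.py.
# "user/falcon-edge-3b-coreml" is a hypothetical repo id; use this repository's actual id.
from huggingface_hub import snapshot_download

local_dir = snapshot_download(
    repo_id="user/falcon-edge-3b-coreml",
    allow_patterns=["falcon_edge_3b*"],  # the .mlmodelc dirs, lm_head, and embeddings
)
print("Files downloaded to:", local_dir)
```
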
falcon_edge_3b.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:866173ded7c164e5c27c24d1804e353a945809d54a8f49708cf1480af7eee213
+size 243
falcon_edge_3b.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e75d3ee63e4891a5b0592bf69e6e5e3b99d9cdd8304bc43fa60fb7a0fe3f393e
+size 13436
falcon_edge_3b.mlmodelc/model.mil ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e42adcc750756a1c42e45e9ec9bb81d28532457a83b80e73870f880383b563b
+size 62905007
falcon_edge_3b.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be0ebd66f33be503d96045915631bd460069037cb40d8fa5ac284792de9e457f
+size 734283968
falcon_edge_3b_embeddings.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b8ba6463baafcc8eea94a2c442168997a07c452a43a28b26953e57936009256
+size 134217856
falcon_edge_3b_lm_head.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2c803f63522fd351bcb1eef33ccba50ae8d4ea88bb8fc2811ca1f3fb3be51e9f
+size 243
falcon_edge_3b_lm_head.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f369abfe93a738838f46aec9d7d18e319be6ae627b804857a4e1401eb4b06f2d
+size 3098
falcon_edge_3b_lm_head.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
falcon_edge_3b_lm_head.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2212bb66c4734951f503b1b5f532a2ad1d618cc10d3c10139c9fc0af731d3743
+size 134222976
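
Every pointer entry above follows the git-lfs spec/v1 format, so the recorded oid and size can be used to confirm that a downloaded blob is intact. A small sketch, using the values from the falcon_edge_3b.mlmodelc/coremldata.bin pointer shown earlier:

```python
import hashlib
from pathlib import Path


def matches_lfs_pointer(path: str, oid: str, size: int) -> bool:
    """Compare a local file against the sha256 oid and byte size from its LFS pointer."""
    data = Path(path).read_bytes()
    return len(data) == size and hashlib.sha256(data).hexdigest() == oid


# Values copied from the pointer for falcon_edge_3b.mlmodelc/coremldata.bin above.
print(
    matches_lfs_pointer(
        "falcon_edge_3b.mlmodelc/coremldata.bin",
        "e75d3ee63e4891a5b0592bf69e6e5e3b99d9cdd8304bc43fa60fb7a0fe3f393e",
        13436,
    )
)
```
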
falcon_edge_generate.py ADDED
@@ -0,0 +1,274 @@
+import os
+import numpy as np
+import coremltools as ct
+import time
+from transformers import AutoTokenizer
+import shutil
+from argparse import ArgumentParser
+
+
+def copy_compiled_model(mlmodel: ct.models.MLModel, dest: str):
+    compiled_model_path = mlmodel.get_compiled_model_path()
+    shutil.copytree(compiled_model_path, dest, dirs_exist_ok=True)
+
+
+def load_mlmodel(path, function_name, copy_compiled):
+    extension = os.path.splitext(path)[1]
+    if extension == ".mlmodelc":
+        return ct.models.CompiledMLModel(
+            path,
+            function_name=function_name,
+            compute_units=ct.ComputeUnit.CPU_AND_NE,
+        )
+    else:
+        mlmodel = ct.models.MLModel(
+            path,
+            function_name=function_name,
+            compute_units=ct.ComputeUnit.CPU_AND_NE,
+        )
+        if copy_compiled:
+            copy_compiled_model(mlmodel, path.replace(".mlpackage", ".mlmodelc"))
+        return mlmodel
+
+
+def load_embeddings(path):
+    return np.load(path)
+
+
+class ModelContainer:
+    def __init__(
+        self,
+        embeddings_path,
+        mlmodel_path,
+        lm_head_path,
+        cache_length,
+        hf_model,
+        temp=0.7,
+        min_p=0.1,
+    ):
+        self.mlmodel_path = mlmodel_path
+        self.embeddings_path = embeddings_path
+        self.lm_head_path = lm_head_path
+        self.cache_length = cache_length
+        self.temp = temp
+        self.min_p = min_p
+        print("Loading embeddings...")
+        self.embeddings = load_embeddings(embeddings_path)
+        print("Loading generation model...")
+        self.generation_model = load_mlmodel(
+            mlmodel_path, f"model_input_1_cache_{cache_length}", copy_compiled=True
+        )
+        # self.prompt_model = None
+        print("Loading prompt model...")
+        self.prompt_model = load_mlmodel(
+            mlmodel_path.replace(".mlpackage", ".mlmodelc"),
+            f"model_input_64_cache_{cache_length}",
+            copy_compiled=False,
+        )
+        print("Loading lm head model...")
+        self.lm_head_model = load_mlmodel(
+            lm_head_path,
+            "min_p_length_1" if temp > 0 else "lm_head_length_1",
+            copy_compiled=True,
+        )
+        self.tokenizer = AutoTokenizer.from_pretrained(hf_model)
+        self.end_of_response_token_id = self.tokenizer("<|im_end|>").input_ids[0]
+
+        self.state = None
+        self.position = None
+        self.attention_mask = None
+
+    def initialize_generation(self):
+        self.state = self.generation_model.make_state()
+        attention_mask = np.arange(self.cache_length, dtype=np.int32)
+        attention_mask = attention_mask[:, None] >= attention_mask[None, :]
+        attention_mask = attention_mask[None, None, :, :]
+        self.attention_mask = np.where(
+            attention_mask,
+            np.array(0.0, dtype=np.float16),
+            np.array(-np.inf, dtype=np.float16),
+        )
+        self.position = 0
+
+    def load_prompt_model(self):
+        if self.prompt_model is None:
+            self.prompt_model = load_mlmodel(
+                self.mlmodel_path,
+                f"model_input_64_cache_{self.cache_length}",
+                copy_compiled=False,
+            )
+
+    def unload_prompt_model(self):
+        del self.prompt_model
+        self.prompt_model = None
+
+    def embed(self, ids):
+        return self.embeddings[ids]  # .transpose(0, 2, 1) # [..., None, :]
+
+    def process_prompt(self, prompt):
+        if self.prompt_model is None:
+            self.load_prompt_model()
+        messages = [{"role": "user", "content": prompt}]
+        tokens = self.tokenizer.apply_chat_template(
+            messages, tokenize=True, add_generation_prompt=True
+        )
+        if self.position + len(tokens) >= self.cache_length:
+            return np.array([-1])
+        stop_processing = False
+        start_time = time.perf_counter()
+        processed_chunks = 0
+        for i in range(0, len(tokens), 64):
+            chunk = tokens[i : min(i + 64, len(tokens))]
+            if self.position + len(chunk) > self.cache_length:
+                stop_processing = True
+                break
+            processed_chunks += 1
+            embds = self.embed([chunk]).transpose(0, 2, 1)[
+                ..., None, :
+            ]  # [..., None, :]
+            if len(chunk) < 64:
+                embds = np.concat(
+                    (
+                        embds,
+                        np.zeros(
+                            (1, embds.shape[1], 1, 64 - len(chunk)), dtype=np.float16
+                        ),
+                    ),
+                    axis=-1,
+                )
+            kv_write_idx = np.array([self.position], dtype=np.int32)
+            positions = np.arange(self.position, self.position + 64, dtype=np.int32)[
+                None, :
+            ]
+            attention_mask = self.attention_mask[
+                :, :, self.position : self.position + 64
+            ]
+            pred = self.prompt_model.predict(
+                {
+                    "hidden_states": embds,
+                    "kv_write_idx": kv_write_idx,
+                    "positions": positions,
+                    "attention_mask": attention_mask,
+                },
+                self.state,
+            )
+            self.position += len(chunk)
+        self.unload_prompt_model()
+        end_time = time.perf_counter()
+        print(
+            f"==== Processed {processed_chunks * 64} tokens in {end_time - start_time:.2f} seconds, {processed_chunks * 64 / (end_time - start_time):.2f} tokens per second, current position: {self.position}",
+        )
+        if stop_processing:
+            return np.array([-1], dtype=np.int32)
+        output_hidden_states = pred["output_hidden_states"][..., [len(chunk) - 1]]
+        return self.lm_head(output_hidden_states)
+
+    def lm_head(self, hidden_states):
+        if self.temp > 0:
+            input_id = self.lm_head_model.predict(
+                {
+                    "hidden_states": hidden_states,
+                    "temp": np.array([self.temp], dtype=np.float16),
+                    "p": np.array([self.min_p], dtype=np.float16),
+                    "random_number": np.random.uniform(0.0, 1.0, (1,)),
+                }
+            )["sampled_index"][:, 0]
+        else:
+            input_id = self.lm_head_model.predict(
+                {
+                    "hidden_states": hidden_states,
+                }
+            )[
+                "argmax"
+            ][:, 0]
+        return input_id
+
+    def generate(self, input_id: np.array):
+        stop_generation = False
+        # for i in range(max_new_tokens):
+        start_time = time.perf_counter()
+        generated_tokens = 0
+        while self.position < self.cache_length:
+            generated_tokens += 1
+            embd = self.embed(input_id).transpose(0, 3, 1, 2)
+            hidden_states = self.generation_model.predict(
+                {
+                    "hidden_states": embd,
+                    "kv_write_idx": np.array([self.position], dtype=np.int32),
+                    "positions": np.array([[self.position]], dtype=np.int32),
+                    "attention_mask": self.attention_mask[:, :, [self.position]],
+                },
+                self.state,
+            )["output_hidden_states"]
+            if stop_generation:
+                print()
+                # print("Loading prompt model...")
+                self.position += 1
+                break
+
+            input_id = self.lm_head(hidden_states)
+
+            input_id_item = input_id.item()
+            if input_id_item == self.end_of_response_token_id:
+                stop_generation = True
+            print(self.tokenizer.decode(input_id_item), end="", flush=True)
+            self.position += 1
+
+        end_time = time.perf_counter()
+        print(
+            f"==== Generated {generated_tokens} tokens in {end_time - start_time:.2f} seconds, {generated_tokens / (end_time - start_time):.2f} tokens per second, current position: {self.position}",
+        )
+        # if stop_generation:
+        #     self.load_prompt_model()
+
+    def loop(self):
+        self.initialize_generation()
+        print("Begin conversation...")
+        while True:
+            print(">>> ", end="", flush=True)
+            self.load_prompt_model()
+            prompt = input()
+            prompt_result = self.process_prompt(prompt)
+            if prompt_result.item() == -1:
+                print("\n--- END OF CONVERSATION: MAX CONTEXT LENGTH REACHED ---\n")
+                break
+            print(self.tokenizer.decode(prompt_result.item()), end="", flush=True)
+            self.generate(prompt_result)
+            if self.position >= (self.cache_length):
+                print("\n--- END OF CONVERSATION: MAX CONTEXT LENGTH REACHED ---\n")
+                break
+
+
+def parse_args():
+    parser = ArgumentParser()
+    parser.add_argument("--model", type=str, required=True)
+    parser.add_argument("--lm_head", type=str, required=True)
+    parser.add_argument("--embeddings", type=str, required=True)
+    parser.add_argument(
+        "--cache_length",
+        type=int,
+        choices=[512, 1024, 2048, 2048 + 1024, 4096, 4096 + 2048, 8192],
+        default=1024,
+    )
+    parser.add_argument("--min_p", type=float, default=0.1)
+    parser.add_argument("--temp", type=float, default=0.7)
+    # parser.add_argument("--hf_model", type=str, default="")
+
+    return parser.parse_args()
+
+
+def main():
+    args = parse_args()
+    ModelContainer(
+        args.embeddings,
+        args.model,
+        args.lm_head,
+        args.cache_length,
+        "tiiuae/Falcon-E-1B-Instruct",
+        args.temp,
+        args.min_p,
+    ).loop()
+
+
+if __name__ == "__main__":
+    main()
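
A minimal usage sketch for the script above, assuming it is run from a checkout that contains the uploaded files; it mirrors what main() does with the default cache_length of 1024:

```python
# Equivalent command line:
#   python falcon_edge_generate.py --model falcon_edge_3b.mlmodelc \
#       --lm_head falcon_edge_3b_lm_head.mlmodelc \
#       --embeddings falcon_edge_3b_embeddings.npy --cache_length 1024
from falcon_edge_generate import ModelContainer

container = ModelContainer(
    embeddings_path="falcon_edge_3b_embeddings.npy",
    mlmodel_path="falcon_edge_3b.mlmodelc",
    lm_head_path="falcon_edge_3b_lm_head.mlmodelc",
    cache_length=1024,  # must match one of the compiled cache lengths
    hf_model="tiiuae/Falcon-E-1B-Instruct",  # tokenizer repo hardcoded in main()
    temp=0.7,
    min_p=0.1,
)
container.loop()  # interactive: type a prompt at ">>> ", tokens stream back
```
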