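# Generate text with a Core ML Qwen2-0.5B model on the Apple Neural Engine
# (CPU_AND_NE), using min-p sampling.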
import time
import numpy as np
from argparse import ArgumentParser
from transformers import AutoTokenizer
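# Tokenizer matching the converted model (hard-coded to Qwen2-0.5B).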
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
parser = ArgumentParser()
parser.add_argument("--model_path", "--model-path", required=True)
parser.add_argument("--prompt", "-p", required=True)
parser.add_argument("--max-tokens", "--max_tokens", type=int, default=100)
parser.add_argument("--min_p", "--min-p", type=float, default=0.3)
parser.add_argument("--temp", type=float, default=1.0)
args = parser.parse_args()
import coremltools as ct
print("Loading model...")
if args.model_path.rstrip("/").endswith(".mlpackage"):
    mf_model_1 = ct.models.MLModel(
        args.model_path,
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        function_name="length_1",
    )
    mf_model_64 = ct.models.MLModel(
        args.model_path,
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        function_name="length_64",
    )
else:
    mf_model_1 = ct.models.CompiledMLModel(
        args.model_path,
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        function_name="length_1",
    )
    mf_model_64 = ct.models.CompiledMLModel(
        args.model_path,
        compute_units=ct.ComputeUnit.CPU_AND_NE,
        function_name="length_64",
    )
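# Min-p sampling: keep only tokens whose probability is at least `min_p`
# times that of the most likely token, then sample from the remaining mass.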
def min_p_sample(logits, min_p, temp):
    # logits = logits.astype(np.float16)
    logits = logits * (1 / temp)  # apply temperature to the logits
    max_ = np.max(logits, axis=1, keepdims=True)
    logits = logits - max_
    logits = np.exp(logits)  # unnormalized probabilities; the top token is 1.0
    logits[logits < min_p] = 0  # drop tokens below min_p relative to the top token
    # logits = logits.astype(np.float32)
    logits = np.cumsum(logits, axis=1)
    sample = np.random.uniform(high=logits[:, -1:])  # draw within the kept mass
    sample = np.argmax(logits > sample, axis=1).astype(np.int32)
    return sample
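# Tokenize the prompt; keep the unpadded length, then pad to the fixed
# 64-token window expected by the prompt function.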
length = len(tokenizer(args.prompt)["input_ids"])
input_ids = tokenizer(
    args.prompt, return_tensors="np", padding="max_length", max_length=64
)["input_ids"].astype(np.int32)
print("Prompt:", args.prompt)
state = mf_model_64.make_state()
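# Prefill: run the whole padded prompt through the 64-token function.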
start = time.time()
pred = mf_model_64.predict(
{"input_ids": input_ids, "query_pos1": np.array([0], dtype=np.int32)}, state
)
prompt_time = time.time() - start
# input_ids = pred["logits"][..., length - 1].argmax(1, keepdims=True).astype(np.int32)
logits = pred["logits"][..., [length - 1]]
input_ids = min_p_sample(logits, args.min_p, args.temp)
print("Generated:")
print(tokenizer.decode(input_ids[0]), end="", flush=True)
start = time.time()
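# Decode loop: feed the sampled token back in; query_pos1 is its absolute
# position in the sequence.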
for i in range(args.max_tokens):
    pred = mf_model_1.predict(
        {"input_ids": input_ids, "query_pos1": np.array([i + length], dtype=np.int32)},
        state,
    )
    input_ids = min_p_sample(pred["logits"], args.min_p, args.temp)
    # input_ids = pred["logits"].argmax(1).astype(np.int32)
    print(tokenizer.decode(input_ids[0]), end="", flush=True)
print("", "=" * 10)
generation_time = time.time() - start
print(
"Prompt:",
length / prompt_time,
"tokens-per-sec",
f"({64 / prompt_time} considering the processed padding)",
)
print("Generation:", args.max_tokens / generation_time, "tokens-per-sec")