| | |
| | """Test the full-unary popcount engine.""" |
| | import ctypes, numpy as np, os, time, sys |
# Pin the OpenMP thread count before the native engine is loaded further down.
# NOTE(review): presumably unary_full.so is OpenMP-parallel and reads this
# variable when its runtime starts — confirm against the C build flags.
os.environ.update({"OMP_NUM_THREADS": "16"})
| |
|
# Optional positional command-line arguments: [model_dir] [n_planes].
_cli_args = sys.argv[1:]
MODEL_DIR = _cli_args[0] if len(_cli_args) >= 1 else "deepseek-r1-1.5b-unary4"
HF_DIR = "deepseek-r1-1.5b-hf"
N_PLANES = int(_cli_args[1]) if len(_cli_args) >= 2 else 4

# Native engine, resolved relative to the current working directory.
lib = ctypes.CDLL("./unary_full.so")

# Short aliases keep the signature table below readable.
_PTR = ctypes.c_void_p
_INT = ctypes.c_int
_F32 = ctypes.c_float

# One linear layer is passed as three buffers followed by two ints; when the
# layers are loaded these are (sign, planes, scales, out_dim, in_dim).
_LINEAR_SPEC = [_PTR, _PTR, _PTR, _INT, _INT]

# Only model_alloc and generate get an explicit restype; every other function
# keeps the ctypes default return type.
lib.model_alloc.restype = _PTR
lib.model_alloc.argtypes = [_INT]
lib.model_set_embed.argtypes = [_PTR, _PTR]
lib.model_set_final_norm.argtypes = [_PTR, _PTR]
lib.model_set_lm_head.argtypes = [_PTR, _PTR, _INT, _INT]
lib.layer_set_norms.argtypes = [_PTR, _INT, _PTR, _PTR]
lib.layer_set_bias.argtypes = [_PTR, _INT, _PTR, _PTR, _PTR]
# (model handle, layer index), seven linear specs, then one trailing int.
lib.layer_set_linears.argtypes = [_PTR, _INT] + _LINEAR_SPEC * 7 + [_INT]
lib.generate.restype = _INT
lib.generate.argtypes = [_PTR, _PTR, _INT, _PTR, _INT, _F32, _F32, _INT]
lib.model_reset_cache.argtypes = [_PTR]
lib.model_free.argtypes = [_PTR]
| |
|
# Every array whose address has been handed to the native library. Holding a
# reference here stops NumPy from freeing the buffer while the raw pointer may
# still be in use (presumably the library stores pointers rather than copying).
_refs = []


def keep(a):
    """Pin *a* for the lifetime of the process and return its data address.

    The array is first normalised to C-contiguous layout. For an array that is
    already contiguous this is a no-op and the returned address is that of
    *a* itself; for a strided view or transpose a contiguous copy is made and
    pinned instead, so code that walks the buffer linearly sees the elements
    in logical order.

    Args:
        a: NumPy array (or array-like) to expose to native code.

    Returns:
        Integer address of the first element of the pinned buffer.
    """
    a = np.ascontiguousarray(a)
    _refs.append(a)
    return a.ctypes.data
| |
|
print(f"Loading model from {MODEL_DIR} (w_planes={N_PLANES})...")
m = lib.model_alloc(N_PLANES)


def _read(name, dtype):
    """Read the raw file *name* inside MODEL_DIR as a flat array of *dtype*."""
    return np.fromfile(os.path.join(MODEL_DIR, name), dtype=dtype)


def _read_f32(name):
    """Read an fp16 file from MODEL_DIR and widen it to float32."""
    return _read(name, np.float16).astype(np.float32)


# Embedding and LM head are passed as raw 16-bit words (no conversion); the
# final norm is widened to float32 before it is handed over.
embed_bits = _read("model_embed_tokens_weight.fp16", np.uint16)
lib.model_set_embed(m, keep(embed_bits))
final_norm = _read_f32("model_norm_weight.fp16")
lib.model_set_final_norm(m, keep(final_norm))
lm_head_bits = _read("lm_head_weight.fp16", np.uint16)
lib.model_set_lm_head(m, keep(lm_head_bits), 151936, 1536)

# Projection order expected by layer_set_linears, and the two ints passed with
# each projection's buffers.
PROJS = ["self_attn_q_proj", "self_attn_k_proj", "self_attn_v_proj",
         "self_attn_o_proj", "mlp_gate_proj", "mlp_up_proj", "mlp_down_proj"]
DIMS = {
    "self_attn_q_proj": (1536, 1536), "self_attn_k_proj": (256, 1536),
    "self_attn_v_proj": (256, 1536), "self_attn_o_proj": (1536, 1536),
    "mlp_gate_proj": (8960, 1536), "mlp_up_proj": (8960, 1536),
    "mlp_down_proj": (1536, 8960),
}

for l in range(28):
    # Layer norms (input and post-attention).
    in_n = _read_f32(f"model_layers_{l}_input_layernorm_weight.fp16")
    po_n = _read_f32(f"model_layers_{l}_post_attention_layernorm_weight.fp16")
    lib.layer_set_norms(m, l, keep(in_n), keep(po_n))

    # Only the q/k/v projections carry a bias.
    qb = _read_f32(f"model_layers_{l}_self_attn_q_proj_bias.fp16")
    kb = _read_f32(f"model_layers_{l}_self_attn_k_proj_bias.fp16")
    vb = _read_f32(f"model_layers_{l}_self_attn_v_proj_bias.fp16")
    lib.layer_set_bias(m, l, keep(qb), keep(kb), keep(vb))

    # Five arguments per projection, flattened in PROJS order.
    linear_args = []
    for p in PROJS:
        base = os.path.join(MODEL_DIR, f"model_layers_{l}_{p}_weight")
        sign_words = np.fromfile(base + ".sign", dtype=np.uint64)
        plane_words = np.fromfile(base + ".planes", dtype=np.uint64)
        scales = np.fromfile(base + ".scales", dtype=np.float32)
        out_dim, in_dim = DIMS[p]
        linear_args += [keep(sign_words), keep(plane_words), keep(scales),
                        out_dim, in_dim]
    lib.layer_set_linears(m, l, *linear_args, N_PLANES)

    # Progress line after every seventh layer.
    if l % 7 == 6:
        print(f" Layer {l+1}/28")

print("Model loaded!")
| |
|
# Deferred import: the tokenizer is only needed once the weights are in place.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(HF_DIR, trust_remote_code=True)

# Single-turn chat prompt rendered to token ids through the chat template.
chat = [{"role": "user", "content": "What is 2+2?"}]
ids = tok.apply_chat_template(chat, add_generation_prompt=True)
prompt_buf = np.array(ids, dtype=np.int32)
out_buf = np.zeros(30, dtype=np.int32)

lib.model_reset_cache(m)
print(f"Prompt: {len(ids)} tokens, generating 30...")
started = time.time()
n = lib.generate(
    m,
    prompt_buf.ctypes.data, len(ids),
    out_buf.ctypes.data, 30,
    # NOTE(review): presumably temperature and top-p — confirm against the
    # C signature of generate().
    ctypes.c_float(0.6), ctypes.c_float(0.9),
    tok.eos_token_id,
)
dt = time.time() - started

# generate() returns an int that is used as the number of valid output ids.
generated = out_buf[:n].tolist()
text = tok.decode(generated, skip_special_tokens=False)
print(f"\n=== {n} tokens, {dt:.1f}s, {n/dt:.1f} tok/s ===")
print(text)
print("===")
lib.model_free(m)
| |
|