Instructions to use yujiepan/deepseek-v4-tiny-random with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use yujiepan/deepseek-v4-tiny-random with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="yujiepan/deepseek-v4-tiny-random")
```

```python
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("yujiepan/deepseek-v4-tiny-random")
model = AutoModelForCausalLM.from_pretrained("yujiepan/deepseek-v4-tiny-random")
```
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use yujiepan/deepseek-v4-tiny-random with vLLM:
Install from pip and serve model
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "yujiepan/deepseek-v4-tiny-random"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "yujiepan/deepseek-v4-tiny-random",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker
docker model run hf.co/yujiepan/deepseek-v4-tiny-random
- SGLang
How to use yujiepan/deepseek-v4-tiny-random with SGLang:
Install from pip and serve model
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "yujiepan/deepseek-v4-tiny-random" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "yujiepan/deepseek-v4-tiny-random",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
Use Docker images
```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "yujiepan/deepseek-v4-tiny-random" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "yujiepan/deepseek-v4-tiny-random",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
- Docker Model Runner
How to use yujiepan/deepseek-v4-tiny-random with Docker Model Runner:
docker model run hf.co/yujiepan/deepseek-v4-tiny-random
| #!/usr/bin/env python3 | |
| """ | |
| 从 HuggingFace 风格的 config.json 构建 inference/model.py 中的 Transformer, | |
| 导出 state_dict,键名与仓库内 model.safetensors.index.json / info.html 中的 Metadata 列一致 | |
| (例如 embed.weight、layers.0.attn.wq_a.weight、layers.0.ffn.shared_experts.w1.weight, | |
| 无 transformers 的 model. 前缀)。 | |
| 分片保存:指定 --output-dir 时,按 --index-json(默认仓库根目录 model.safetensors.index.json) | |
| 的 weight_map 将张量写入对应 model-XXXX-of-YYYY.safetensors,并生成同目录下的 | |
| model.safetensors.index.json(键名与参考索引一致,分片文件名与参考索引一致)。 | |
| 用法示例: | |
| python export_state_dict_from_config.py --config config.json --output /tmp/out.safetensors | |
| python export_state_dict_from_config.py --config config.json --output-dir /tmp/shards | |
| 完整 max_position_embeddings 会在每层分配 RoPE 缓冲,内存很大;默认将 max_seq_len 限制为 | |
| 65536,可用 --max-seq-len 覆盖。state_dict 不包含 persistent=False 的 buffer,与官方分片键集合对齐。 | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| from collections import defaultdict | |
| from pathlib import Path | |
| from typing import Any | |
| REPO_ROOT = Path(__file__).resolve().parent | |
| INFERENCE_DIR = REPO_ROOT / "inference" | |
| sys.path.insert(0, str(INFERENCE_DIR)) | |
| from model import ModelArgs, Transformer # noqa: E402 | |
def without_mtp_tied_aliases(sd: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``sd`` without the MTP embed/head alias entries.

    Per the original author's note, the MTP embed/head share parameters with
    the main module and safetensors cannot store such aliases twice.

    A key is dropped when it has at least four dot-separated parts, the first
    part is ``"mtp"`` and the third part is ``"embed"`` or ``"head"``
    (for example ``mtp.0.embed.weight``). All other entries are kept in
    their original order; the input mapping is not modified.
    """
    tied_names = {"embed", "head"}

    def _is_tied_alias(key: str) -> bool:
        segments = key.split(".")
        return (
            len(segments) >= 4
            and segments[0] == "mtp"
            and segments[2] in tied_names
        )

    return {
        key: tensor for key, tensor in sd.items() if not _is_tied_alias(key)
    }
def _load_index_weight_map(index_path: Path) -> dict[str, str]:
    """Read the ``weight_map`` table from a safetensors index JSON file.

    Args:
        index_path: Path of the index JSON file, read as UTF-8.

    Returns:
        A new dict copied from the file's ``weight_map`` entry.

    Raises:
        SystemExit: If ``weight_map`` is absent or is not a JSON object.
    """
    with open(index_path, encoding="utf-8") as fh:
        index_doc = json.load(fh)

    weight_map = index_doc.get("weight_map")
    if isinstance(weight_map, dict):
        # Hand back a copy so callers cannot mutate the parsed document.
        return dict(weight_map)
    raise SystemExit(f"{index_path} 缺少 weight_map")
def save_sharded_index_style(
    sd_inference: dict[str, Any],
    weight_map: dict[str, str],
    out_dir: Path,
) -> None:
    """Write tensors into shard files named by ``weight_map`` plus an index.

    Only keys present in both ``weight_map`` and ``sd_inference`` are written;
    entries found in just one of them are skipped. Each shard is saved with
    ``safetensors.torch.save_file`` under ``out_dir`` and a
    ``model.safetensors.index.json`` listing the written keys and the summed
    tensor byte size is created next to them.

    Args:
        sd_inference: Mapping of tensor name to tensor.
        weight_map: Mapping of tensor name to shard file name.
        out_dir: Destination directory; created if it does not exist.

    Raises:
        SystemExit: If no key of ``weight_map`` exists in ``sd_inference``.
    """
    from safetensors.torch import save_file

    out_dir.mkdir(parents=True, exist_ok=True)

    # Keep only index entries we actually have a tensor for, in index order.
    kept_map: dict[str, str] = {
        name: shard
        for name, shard in weight_map.items()
        if name in sd_inference
    }
    if not kept_map:
        raise SystemExit("weight_map 与当前 state_dict 无交集,无法分片写出")

    # Group the surviving tensors by the shard file they belong to.
    by_shard: dict[str, dict[str, Any]] = {}
    for name, shard in kept_map.items():
        by_shard.setdefault(shard, {})[name] = sd_inference[name]

    total_bytes = 0
    for shard, tensors in by_shard.items():
        total_bytes += sum(
            int(t.numel()) * int(t.element_size()) for t in tensors.values()
        )
        save_file(tensors, str(out_dir / shard))

    payload = {
        "metadata": {"total_size": total_bytes},
        "weight_map": kept_map,
    }
    index_file = out_dir / "model.safetensors.index.json"
    with open(index_file, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, indent=2)
        fh.write("\n")
def _is_hf_deepseek_config(cfg: dict[str, Any]) -> bool:
    """Tell whether ``cfg`` should be treated as a HuggingFace-style config.

    True when the dict has a ``num_hidden_layers`` key or when its
    ``model_type`` equals ``"deepseek_v4"``; False otherwise.
    """
    if "num_hidden_layers" in cfg:
        return True
    return cfg.get("model_type") == "deepseek_v4"
def hf_config_to_model_args(
    cfg: dict[str, Any],
    *,
    max_batch_size: int,
    max_seq_len: int,
) -> ModelArgs:
    """Convert a HuggingFace-style ``config.json`` dict into ``ModelArgs``.

    Keys read with ``cfg[...]`` are mandatory (a missing one raises
    ``KeyError``); keys read with ``.get`` fall back to the defaults spelled
    out below. RoPE settings are taken from ``cfg["rope_scaling"]`` first and
    from the same-named top-level key second. FP8 is selected when
    ``quantization_config.quant_method`` is ``"fp8"`` or its ``fmt`` is one of
    ``e4m3`` / ``e4m3fn`` / ``fp8`` (case-insensitive).

    Args:
        cfg: Parsed config dictionary.
        max_batch_size: Passed through to ``ModelArgs.max_batch_size``.
        max_seq_len: Passed through to ``ModelArgs.max_seq_len``.

    Returns:
        The populated ``ModelArgs`` instance.
    """
    rope_cfg = cfg.get("rope_scaling") or {}
    quant_cfg = cfg.get("quantization_config") or {}

    fp8_format_names = ("e4m3", "e4m3fn", "fp8")
    if quant_cfg.get("quant_method") == "fp8":
        use_fp8 = True
    else:
        use_fp8 = str(quant_cfg.get("fmt", "")).lower() in fp8_format_names

    layer_count = int(cfg["num_hidden_layers"])
    ratio_values = tuple(
        int(r) for r in list(cfg.get("compress_ratios") or ())
    )

    # Built as a dict in the same order the keyword arguments are consumed,
    # so a missing mandatory key surfaces in the same order as before.
    kwargs: dict[str, Any] = {
        "max_batch_size": max_batch_size,
        "max_seq_len": max_seq_len,
        "dtype": "fp8" if use_fp8 else "bf16",
        "scale_fmt": (
            quant_cfg.get("scale_fmt") or cfg.get("scale_fmt") or "ue8m0"
        ),
        "expert_dtype": cfg.get("expert_dtype"),
        "scale_dtype": (
            cfg.get("scale_dtype") or ("fp8" if use_fp8 else "fp32")
        ),
        "vocab_size": int(cfg["vocab_size"]),
        "dim": int(cfg["hidden_size"]),
        "moe_inter_dim": int(cfg["moe_intermediate_size"]),
        "n_layers": layer_count,
        "n_hash_layers": int(cfg.get("num_hash_layers", 0)),
        "n_mtp_layers": int(cfg.get("num_nextn_predict_layers", 0)),
        "n_heads": int(cfg["num_attention_heads"]),
        "n_routed_experts": int(cfg["n_routed_experts"]),
        "n_shared_experts": int(cfg.get("n_shared_experts", 1)),
        "n_activated_experts": int(cfg["num_experts_per_tok"]),
        "score_func": str(
            cfg.get("scoring_func", cfg.get("score_func", "sqrtsoftplus"))
        ),
        "route_scale": float(
            cfg.get("routed_scaling_factor", cfg.get("route_scale", 1.0))
        ),
        "swiglu_limit": float(cfg.get("swiglu_limit", 0.0)),
        "q_lora_rank": int(cfg["q_lora_rank"]),
        "head_dim": int(cfg["head_dim"]),
        "rope_head_dim": int(cfg["qk_rope_head_dim"]),
        "norm_eps": float(
            cfg.get("rms_norm_eps", cfg.get("norm_eps", 1e-6))
        ),
        "o_groups": int(cfg["o_groups"]),
        "o_lora_rank": int(cfg["o_lora_rank"]),
        "window_size": int(cfg["sliding_window"]),
        "compress_ratios": ratio_values,
        "compress_rope_theta": float(
            cfg.get("compress_rope_theta", 160000.0)
        ),
        "original_seq_len": int(
            rope_cfg.get(
                "original_max_position_embeddings",
                cfg.get("original_seq_len", 0),
            )
        ),
        "rope_theta": float(cfg.get("rope_theta", 10000.0)),
        "rope_factor": float(
            rope_cfg.get("factor", cfg.get("rope_factor", 1.0))
        ),
        "beta_fast": int(
            rope_cfg.get("beta_fast", cfg.get("beta_fast", 32))
        ),
        "beta_slow": int(
            rope_cfg.get("beta_slow", cfg.get("beta_slow", 1))
        ),
        "index_n_heads": int(cfg["index_n_heads"]),
        "index_head_dim": int(cfg["index_head_dim"]),
        "index_topk": int(cfg["index_topk"]),
        "hc_mult": int(cfg["hc_mult"]),
        "hc_sinkhorn_iters": int(cfg["hc_sinkhorn_iters"]),
        "hc_eps": float(cfg.get("hc_eps", 1e-6)),
    }
    return ModelArgs(**kwargs)
def load_model_args(
    config_path: Path,
    *,
    max_batch_size: int,
    max_seq_len: int | None,
    cap_seq_len: int,
) -> ModelArgs:
    """Load a JSON config file and build ``ModelArgs`` from it.

    A config recognised by ``_is_hf_deepseek_config`` is converted with
    ``hf_config_to_model_args``. Its sequence length is ``max_seq_len`` when
    given, otherwise the smaller of the config's ``max_position_embeddings``
    (defaulting to ``cap_seq_len``) and ``cap_seq_len``.

    Any other config is passed to ``ModelArgs(**raw)`` unchanged; on that
    path ``max_batch_size``, ``max_seq_len`` and ``cap_seq_len`` are not
    applied.

    Args:
        config_path: JSON file to read (UTF-8).
        max_batch_size: Batch size forwarded for HF-style configs.
        max_seq_len: Explicit sequence length, or ``None`` to derive it.
        cap_seq_len: Upper bound used when ``max_seq_len`` is ``None``.
    """
    with open(config_path, encoding="utf-8") as fh:
        config = json.load(fh)

    if not _is_hf_deepseek_config(config):
        return ModelArgs(**config)

    position_limit = int(config.get("max_position_embeddings", cap_seq_len))
    if max_seq_len is None:
        effective_seq_len = min(position_limit, cap_seq_len)
    else:
        effective_seq_len = max_seq_len

    return hf_config_to_model_args(
        config,
        max_batch_size=max_batch_size,
        max_seq_len=effective_seq_len,
    )
def optional_prefix_keys(
    sd: dict[str, Any], prefix: str | None
) -> dict[str, Any]:
    """Return ``sd`` with every key prefixed by ``prefix`` and a single dot.

    Trailing dots on ``prefix`` are stripped before exactly one ``"."`` is
    appended, so ``"model"`` and ``"model."`` give the same result. When
    ``prefix`` is ``None`` or empty the original mapping object is returned
    as is; otherwise a new dict is built.
    """
    if prefix:
        dotted = f"{prefix.rstrip('.')}."
        return {dotted + name: value for name, value in sd.items()}
    return sd
def validate_against_index(
    keys: set[str], index_path: Path
) -> tuple[set[str], set[str]]:
    """Compare ``keys`` with the ``weight_map`` keys of an index JSON file.

    Args:
        keys: Tensor names to check.
        index_path: Index JSON file (UTF-8). A file without ``weight_map``
            is treated as having no reference keys.

    Returns:
        ``(missing, extra)``: names in the index but not in ``keys``, and
        names in ``keys`` but not in the index.
    """
    with open(index_path, encoding="utf-8") as fh:
        index_doc = json.load(fh)

    expected = {*index_doc.get("weight_map", {}).keys()}
    return expected - keys, keys - expected
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the export script."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument(
        "--config",
        type=Path,
        default=REPO_ROOT / "config.json",
        help="config.json(HF 或 inference 格式)",
    )
    # Exactly one of --output / --output-dir must be supplied.
    out_group = ap.add_mutually_exclusive_group(required=True)
    out_group.add_argument(
        "--output",
        type=Path,
        help="输出单个 .safetensors 或 .pt / .pth",
    )
    out_group.add_argument(
        "--output-dir",
        type=Path,
        default=None,
        help="按 index 的 weight_map 分片写入该目录(仅 safetensors + model.safetensors.index.json)",
    )
    ap.add_argument("--device", type=str, default="cpu",
                    help="cpu / cuda / meta")
    ap.add_argument("--max-batch-size", type=int, default=4)
    ap.add_argument(
        "--max-seq-len",
        type=int,
        default=None,
        help="覆盖 ModelArgs.max_seq_len;默认 min(max_position_embeddings, --cap-seq-len)",
    )
    ap.add_argument(
        "--cap-seq-len",
        type=int,
        default=65536,
        help="HF 配置下默认 max_seq_len 上限,避免每层分配过大 RoPE 缓冲",
    )
    ap.add_argument(
        "--prefix",
        type=str,
        default="",
        help='为输出键名加此前缀(如 "model.");默认与 index 一致,不加前缀',
    )
    ap.add_argument(
        "--index-json",
        type=Path,
        default=None,
        help="校验或分片布局:默认在 --output-dir 时为仓库根 model.safetensors.index.json",
    )
    ap.add_argument("--strict-index", action="store_true",
                    help="与 --index-json 联用,要求键完全一致")
    return ap


def _format_index_diff(missing: set[str], extra: set[str]) -> str:
    """Build the report for keys that differ from the reference index."""
    msg = f"index 对比: missing={len(missing)} extra={len(extra)}"
    for label, names in (("missing", missing), ("extra", extra)):
        if not names:
            continue
        ordered = sorted(names)
        if len(names) <= 20:
            msg += f"\n {label} 样例: {ordered[:20]}"
        else:
            # Too many to list: show a short sample only.
            msg += f"\n {label} 样例: {ordered[:5]} ..."
    return msg


def main() -> None:
    """Build the model from a config file and export its state dict.

    Steps, in order:

    1. Parse the command line and load ``ModelArgs`` from ``--config``.
    2. Dump the resolved arguments to ``ds_config.json`` in ``REPO_ROOT``.
    3. Seed with 42, construct ``Transformer`` on ``--device`` and print
       parameter counts for the whole model and for modules whose name has
       at most three dots.
    4. Drop the MTP tied aliases from the state dict and, when an index file
       applies, compare key sets against it (printed to stderr, or fatal
       with ``--strict-index``).
    5. Write either index-style shards (``--output-dir``) or a single file
       (``--output``: ``.safetensors`` via safetensors, anything else via
       ``torch.save``).

    Raises:
        SystemExit: On a strict index mismatch, or when safetensors output is
            requested with ``--device meta``.
    """
    args = _build_arg_parser().parse_args()

    import torch

    torch.set_default_dtype(torch.bfloat16)
    margs = load_model_args(
        args.config,
        max_batch_size=args.max_batch_size,
        max_seq_len=args.max_seq_len,
        cap_seq_len=args.cap_seq_len,
    )
    with open(REPO_ROOT / "ds_config.json", "w", encoding="utf-8") as f:
        json.dump(margs.__dict__, f, indent=2)

    dev = torch.device(args.device)
    with torch.device(dev):
        from transformers import set_seed

        # Fixed seed so repeated runs construct the same model.
        set_seed(42)
        model = Transformer(margs)

    n_params = sum(p.numel() for p in model.parameters())
    print(f"Number of parameters: {n_params}")
    for k, v in model.named_modules():
        if k.count('.') <= 3:
            n_params_k = sum(p.numel() for p in v.parameters())
            print(k, f"{n_params_k} {n_params_k / n_params:.2%}")

    if dev.type != "meta":
        model = model.to(dev)
    # NOTE(review): the source's indentation was lost, so it is unclear
    # whether this cast was nested under the non-meta branch above; it is
    # applied for every device here. Confirm against the upstream file.
    for k, v in model.named_parameters():
        if 'norm.weight' in k or k == "head.weight":
            v.data = v.data.to(torch.bfloat16)

    sd_inference = without_mtp_tied_aliases(model.state_dict())

    index_path = args.index_json
    if args.output_dir is not None and index_path is None:
        index_path = REPO_ROOT / "model.safetensors.index.json"

    if index_path is not None:
        missing, extra = validate_against_index(
            set(sd_inference.keys()), index_path)
        if missing or extra:
            msg = _format_index_diff(missing, extra)
            if args.strict_index:
                raise SystemExit(msg)
            print(msg, file=sys.stderr)

    if args.output_dir is not None:
        if dev.type == "meta":
            raise SystemExit("device=meta 时无法写 safetensors,请改用 cpu/cuda")
        if args.prefix:
            # Sharded output keeps the index's key names, so the prefix is
            # never applied on this path; say so instead of dropping it
            # silently.
            print(
                "warning: --prefix is ignored with --output-dir; "
                "shard keys follow the index",
                file=sys.stderr,
            )
        wm = _load_index_weight_map(index_path)  # type: ignore[arg-type]
        save_sharded_index_style(
            sd_inference,
            wm,
            args.output_dir,
        )
        written = sum(1 for k in wm if k in sd_inference)
        n_shards = len({wm[k] for k in wm if k in sd_inference})
        print(
            f"Wrote {written} tensors in {n_shards} shard files under {args.output_dir}"
        )
        return

    sd = optional_prefix_keys(sd_inference, args.prefix or None)
    out = args.output
    out.parent.mkdir(parents=True, exist_ok=True)
    suffix = out.suffix.lower()
    for k, v in sd.items():
        print(k, v.shape, v.dtype)

    if suffix == ".safetensors":
        from safetensors.torch import save_file

        # Meta tensors cannot be written to safetensors.
        if dev.type == "meta":
            raise SystemExit(
                "device=meta 时无法写 safetensors,请改用 cpu/cuda 或输出 .pt")
        save_file(sd, str(out))
    else:
        # The original branched on dev.type here but both branches made this
        # same call, so the conditional is removed.
        torch.save(sd, out)
    print(f"Wrote {len(sd)} tensors to {out}")
# Run the exporter only when this file is executed as a script, so that
# importing the module does not trigger argument parsing or model building.
if __name__ == "__main__":
    main()