#!/usr/bin/env python3
"""
Smoke test: load Qwen (or any causal LM) from a local folder and run one generation.

Usage:
    python3 training/local_model_inference_check.py --model-dir ./model
    python3 training/local_model_inference_check.py --model-dir /path/to/model --device cpu
"""
from __future__ import annotations

import argparse
import sys


def main() -> int:
    ap = argparse.ArgumentParser()
    ap.add_argument(
        "--model-dir",
        default="model",
        help="Path to local folder with config + weights (e.g. ./model)",
    )
    ap.add_argument("--device", default="auto", help="auto | cuda | cpu")
    ap.add_argument("--max-new-tokens", type=int, default=128)
    args = ap.parse_args()
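    # torch/transformers are imported after arg parsing, so `--help` works without them installed.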
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
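    # Resolve the target device: an explicit --device wins, otherwise prefer CUDA when available.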
    if args.device == "auto":
        dev = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        dev = args.device

    print(f"[load] {args.model_dir!r} | device={dev}", flush=True)
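    # The tokenizer is loaded strictly from the local folder (local_files_only), never the Hub.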
    tok = AutoTokenizer.from_pretrained(
        args.model_dir, local_files_only=True, trust_remote_code=True
    )
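    # Some checkpoints ship without a pad token; reuse EOS so generate() can pad.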
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token

    # fp16 on GPU is enough for 1.5B; CPU can stay fp32.
    dtype = torch.float16 if dev == "cuda" else torch.float32
    model = AutoModelForCausalLM.from_pretrained(
        args.model_dir,
        local_files_only=True,
        trust_remote_code=True,
        torch_dtype=dtype,
    )
    model = model.to(dev)
    model.eval()
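    # A tiny two-message chat is enough to confirm the checkpoint loads and decodes.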
    messages = [
        {"role": "system", "content": "You reply briefly."},
        {"role": "user", "content": "Say the capital of France in one line."},
    ]
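    # Prefer the tokenizer's chat template (instruct/chat checkpoints); fall back to
    # plain concatenation for base models that don't define one.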
    if hasattr(tok, "apply_chat_template") and tok.chat_template is not None:
        prompt = tok.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    else:
        prompt = messages[0]["content"] + "\n" + messages[1]["content"]
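    # Tokenize the prompt and move the input tensors onto the same device as the model.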
    inputs = tok(prompt, return_tensors="pt")
    if dev == "cuda":
        inputs = {k: v.cuda() for k, v in inputs.items()}
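    # Greedy decoding (do_sample=False) keeps the smoke test deterministic.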
    with torch.inference_mode():
        out = model.generate(
            **inputs,
            max_new_tokens=args.max_new_tokens,
            do_sample=False,
            pad_token_id=tok.pad_token_id,
        )
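    # Keep only the newly generated tokens; `out` starts with the echoed prompt.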
    new_tokens = out[0, inputs["input_ids"].shape[1] :]
    text = tok.decode(new_tokens, skip_special_tokens=True)
    print("\n[ok] generation:\n" + text.strip() + "\n", flush=True)
    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except OSError as e:
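        # Most common failure: the script is run from the wrong directory, so the
        # local model folder is not found. Print a hint, then re-raise.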
        if "local_files_only" in str(e) or e.errno in (2, 20):
            print(
                "Hint: run from the parent of `model/`, e.g.:\n"
                "  cd autodatalab-plus && python3 training/local_model_inference_check.py --model-dir model",
                file=sys.stderr,
            )
        raise