from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from modeling_qwen import QWenLMHeadModel as QWEN
# Note: The default behavior now has injection attack prevention off.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
# use bf16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, bf16=True).eval()
# use fp16
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True, fp16=True).eval()
# use cpu only
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="cpu", trust_remote_code=True).eval()
# use auto mode, automatically select precision based on the device.
# model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen-7B", device_map="auto", trust_remote_code=True).eval()
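# As an alternative to the Qwen-specific bf16=True/fp16=True flags above, the
# standard `torch_dtype` argument of from_pretrained selects precision explicitly.
# A minimal sketch, assuming a CUDA device with bfloat16 support:
# import torch
# model = AutoModelForCausalLM.from_pretrained(
#     "Qwen/Qwen-7B", device_map="auto", trust_remote_code=True,
#     torch_dtype=torch.bfloat16,
# ).eval()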
# Load the pruned Qwen checkpoint produced by LLM-Pruner.
model = QWEN.from_pretrained('/data3/user23215411/SYF/LLM-Pruner/prune_log/qwen_prune/pretrain-save', device_map="cuda").eval()
# To use the tokenizer saved alongside the pruned checkpoint instead:
# tokenizer = AutoTokenizer.from_pretrained('/data3/user23215411/SYF/LLM-Pruner/prune_log/qwen_prune/pretrain-save')
# Specify hyperparameters for generation. With transformers>=4.32.0 this step is unnecessary.
# model.generation_config = GenerationConfig.from_pretrained("Qwen/Qwen-7B", trust_remote_code=True)
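# Alternatively, a GenerationConfig can be built explicitly. A minimal sketch;
# the values below are illustrative assumptions, not tuned defaults:
# model.generation_config = GenerationConfig(
#     max_new_tokens=128,  # assumption: cap newly generated tokens at 128
#     do_sample=True,      # sample rather than decode greedily
#     top_p=0.8,           # nucleus sampling threshold (assumed value)
# )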
# Few-shot prompt (Chinese): "The capital of Mongolia is Ulaanbaatar / The capital
# of Iceland is Reykjavik / The capital of Ethiopia is"
inputs = tokenizer('蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是', return_tensors='pt')
inputs = inputs.to(model.device)
pred = model.generate(**inputs)
print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))
# 蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是亚的斯亚贝巴(Addis Ababa)...
# (English: the model completes the pattern with "The capital of Ethiopia is Addis Ababa".)
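# Generation arguments can also be passed per call rather than via the config.
# A minimal sketch; max_new_tokens=64 is an illustrative assumption:
# pred = model.generate(**inputs, max_new_tokens=64)
# print(tokenizer.decode(pred.cpu()[0], skip_special_tokens=True))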