# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from pathlib import Path

import torch
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoConfig, AutoTokenizer, GenerationConfig

device_id = 0
device = torch.device(f"cuda:{device_id}")  # change to torch.device("cpu") if running on CPU
ep = "CUDAExecutionProvider"  # change to "CPUExecutionProvider" if running on CPU
ep_options = {"device_id": device_id}

model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_path = "./Olive/examples/llama2/models/qlora/qlora-conversion-transformers_optimization-bnb_quantization/gpu-cuda_model"
model_path = Path(model_path)
# The Olive output folder may be missing the Hugging Face config files;
# fetch them from the Hub on first run and cache them next to the ONNX model.
if not (model_path / "config.json").exists():
    config = AutoConfig.from_pretrained(model_id)
    config.save_pretrained(model_path)
else:
    config = AutoConfig.from_pretrained(model_path)

if not (model_path / "generation_config.json").exists():
    gen_config = GenerationConfig.from_pretrained(model_id)
    gen_config.save_pretrained(model_path)
else:
    gen_config = GenerationConfig.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained(model_id)

model = ORTModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    generation_config=gen_config,
    use_io_binding=True,
    provider=ep,
    # For multi-GPU runs, pass each process's rank as the device_id instead.
    provider_options=ep_options,
)
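
# A minimal smoke test of the loaded model; this block is an addition and the
# prompt and generation settings below are illustrative assumptions, not part
# of the original script.
prompt = "What is ONNX Runtime?"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))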