# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# -------------------------------------------------------------------------
from pathlib import Path
|
import torch
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoConfig, AutoTokenizer, GenerationConfig
|
device_id = 0
device = torch.device(f"cuda:{device_id}")  # Change to torch.device("cpu") if running on CPU

ep = "CUDAExecutionProvider"  # Change to "CPUExecutionProvider" if running on CPU
ep_options = {"device_id": device_id}
|
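# `model_id` is the original Hub checkpoint; `model_path` is the Olive-optimized
# ONNX output folder produced by the example workflow.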
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_path = Path("./Olive/examples/llama2/models/qlora/qlora-conversion-transformers_optimization-bnb_quantization/gpu-cuda_model")
|
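# The Olive output folder may not include config.json; if it is missing, fetch
# the config from the Hub and save it alongside the exported model for reuse.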
if not (model_path / "config.json").exists():
    config = AutoConfig.from_pretrained(model_id)
    config.save_pretrained(model_path)
else:
    config = AutoConfig.from_pretrained(model_path)
|
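# generation_config.json gets the same fallback treatment.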
if not (model_path / "generation_config.json").exists():
    gen_config = GenerationConfig.from_pretrained(model_id)
    gen_config.save_pretrained(model_path)
else:
    gen_config = GenerationConfig.from_pretrained(model_path)
|
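# The tokenizer is loaded from the Hub checkpoint, since the export folder may
# not contain tokenizer files.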
tokenizer = AutoTokenizer.from_pretrained(model_id) |
|
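# Load the exported ONNX model with ONNX Runtime. IO binding keeps inputs and
# outputs in device memory, avoiding host/device copies with the CUDA provider.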
model = ORTModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    generation_config=gen_config,
    use_io_binding=True,
    provider=ep,
    provider_options=ep_options,
)
|
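# A minimal end-to-end generation sketch: the prompt is illustrative, and
# `generate` follows the standard transformers API, which ORTModelForCausalLM
# implements. Inputs are moved to `device` so they match the execution provider.
prompt = "What is the capital of France?"
inputs = tokenizer(prompt, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])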
|