# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
# --------------------------------------------------------------------------
from pathlib import Path

import torch
import torch.distributed as dist  # only needed for the commented-out multi-GPU variant below (rank = dist.get_rank())
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoConfig, AutoTokenizer, GenerationConfig

# Target a single CUDA device. Switch all three lines below to their CPU
# equivalents if no GPU is available.
device_id = 0
device = torch.device(f"cuda:{device_id}")  # change to torch.device("cpu") if running on CPU
ep = "CUDAExecutionProvider"  # change to "CPUExecutionProvider" if running on CPU
ep_options = {"device_id": device_id}

# Hugging Face model ID (source of config/tokenizer) and the local Olive output
# directory containing the optimized ONNX model.
model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_path = "./Olive/examples/llama2/models/qlora/qlora-conversion-transformers_optimization-bnb_quantization/gpu-cuda_model"
model_path = Path(model_path)

# The Olive output may not include config.json; fetch it from the hub once and
# save it next to the ONNX model so later runs load everything locally.
if not (model_path / "config.json").exists():
    config = AutoConfig.from_pretrained(model_id)
    config.save_pretrained(model_path)
else:
    config = AutoConfig.from_pretrained(model_path)

# Same fallback for generation_config.json.
if not (model_path / "generation_config.json").exists():
    gen_config = GenerationConfig.from_pretrained(model_id)
    gen_config.save_pretrained(model_path)
else:
    gen_config = GenerationConfig.from_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the ONNX model with ONNX Runtime. IO binding keeps inputs/outputs on the
# GPU to avoid host/device copies during generation.
model = ORTModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    generation_config=gen_config,
    use_io_binding=True,
    # provider="CUDAExecutionProvider",
    provider=ep,
    provider_options=ep_options,
    # provider_options={"device_id": str(rank)},
)
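
# --- Usage sketch (not part of the original script) ---
# A minimal example of running generation with the loaded model; the prompt and
# max_new_tokens value are illustrative assumptions, not values from the source.
prompt = "Explain what an ONNX Runtime execution provider is."
inputs = tokenizer(prompt, return_tensors="pt").to(device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))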