from typing import Any, Dict, List

import transformers
from torch import cuda
from transformers import BitsAndBytesConfig


class PreTrainedPipeline:
    def __init__(self, path: str = ""):
        # The model id is hard-coded; the `path` the endpoint runtime passes in
        # is deliberately ignored.
        path = "oleksandrfluxon/mpt-7b-instruct-evaluate"
        print("===> path", path)

        device = f"cuda:{cuda.current_device()}" if cuda.is_available() else "cpu"
        print("===> device", device)

        model = transformers.AutoModelForCausalLM.from_pretrained(
            path,
            trust_remote_code=True,
            # 8-bit quantization; requires the `bitsandbytes` library
            quantization_config=BitsAndBytesConfig(load_in_8bit=True),
            device_map="auto",
            max_seq_len=8192,  # forwarded to the MPT config via trust_remote_code
        )
        model.eval()
        # Note: `.to(device)` must not be called on an 8-bit model;
        # device placement is handled by `device_map` instead.
        print(f"===> Model loaded on {device}")

        tokenizer = transformers.AutoTokenizer.from_pretrained("mosaicml/mpt-7b")
        self.pipeline = transformers.pipeline(
            "text-generation", model=model, tokenizer=tokenizer
        )
        print("===> init finished")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str`): the prompt to generate from
            parameters (:obj:`dict`, optional): generation kwargs forwarded to the pipeline
        Return:
            A :obj:`list` of :obj:`dict` entries with a `generated_text` key,
            as returned by the text-generation pipeline
        """
        # get inputs
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", {})
        date = data.pop("date", None)  # accepted but currently unused
        print("===> inputs", inputs)
        print("===> parameters", parameters)

        result = self.pipeline(inputs, **parameters)
        print("===> result", result)
        return result
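
# --- Usage sketch ---
# A minimal local smoke test, assuming a CUDA GPU with enough memory for the
# 8-bit model and that `bitsandbytes` and `accelerate` are installed. The
# prompt and the `max_new_tokens`/`do_sample` values are illustrative, not
# prescribed by the handler.
if __name__ == "__main__":
    handler = PreTrainedPipeline()
    output = handler(
        {
            "inputs": "Explain what MPT-7B is in one sentence.",
            "parameters": {"max_new_tokens": 64, "do_sample": False},
        }
    )
    # The text-generation pipeline returns a list of dicts.
    print(output[0]["generated_text"])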