from typing import Any, Dict

import guidance
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer


class EndpointHandler():
    def __init__(self, path=""):
        # Preload everything needed at inference time.
        name = "mosaicml/mpt-30b-instruct"
        config = AutoConfig.from_pretrained(name, trust_remote_code=True)
        config.attn_config["attn_impl"] = "triton"
        config.init_device = "cuda:0"  # For fast initialization directly on GPU!
        model = AutoModelForCausalLM.from_pretrained(
            name,
            config=config,
            torch_dtype=torch.bfloat16,  # Load model weights in bfloat16
            trust_remote_code=True,
        )
        # MPT was trained with the GPT-NeoX tokenizer.
        tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
        guidance.llm = guidance.llms.Transformers(model=model, tokenizer=tokenizer)

    def __call__(self, data: Dict[str, Any]) -> str:
        """
        Args:
            data: dict with a "prompt" key holding a guidance template string.
        Return:
            The generated text, which will be serialized and returned.
        """
        prompt = data.pop("prompt", data)
        guidance_prompt = guidance(prompt)
        out = guidance_prompt()
        return out.text
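
# Usage sketch, not part of the handler contract: the Inference Endpoints
# runtime instantiates EndpointHandler once and calls it per request, but the
# handler can also be exercised locally as below. The guidance template here
# is a hypothetical example; any template string with {{gen ...}} slots works.
if __name__ == "__main__":
    handler = EndpointHandler()
    payload = {
        "prompt": "Q: What is the capital of France?\nA: {{gen 'answer' max_tokens=16}}"
    }
    # Returns the completed template text, with the 'answer' slot filled in.
    print(handler(payload))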