smangrul's picture
Create handler.py
a56b530
raw
history blame
1.64 kB
from typing import Any, Dict
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftConfig, PeftModel
from transformers import pipeline
class EndpointHandler:
    """Inference handler that serves a PEFT adapter via a text-generation pipeline.

    On construction the base causal-LM named in the adapter config is loaded,
    the adapter is applied and merged into it, and the result is wrapped in a
    ``transformers`` ``text-generation`` pipeline. Instances are callable with
    a request payload dict.
    """

    def __init__(self, path=""):
        """Load base model + adapter from *path* and build the pipeline.

        Args:
            path: Location handed to ``PeftConfig.from_pretrained`` and
                ``PeftModel.from_pretrained`` (the adapter checkpoint).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # The adapter config records which base model the adapter belongs to.
        config = PeftConfig.from_pretrained(path)
        base_name = config.base_model_name_or_path

        # NOTE(review): trust_remote_code=True allows code shipped with the
        # base model repository to run; only use with repositories you trust.
        # NOTE(review): float16 is requested even when the device is "cpu";
        # confirm the deployed PyTorch build supports half precision there.
        model = AutoModelForCausalLM.from_pretrained(
            base_name,
            return_dict=True,
            torch_dtype=torch.float16,
            trust_remote_code=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(
            base_name, trust_remote_code=True
        )

        # Attach the adapter weights to the base model.
        self.model = PeftModel.from_pretrained(model, path)
        # Cast the wrapped model (adapter included) to float16 -- presumably
        # because adapter weights may load in a different dtype; confirm.
        self.model.to(torch.float16)
        self.model.to(self.device)
        # Fold the adapter into the base model and drop the PEFT wrapper.
        self.model = self.model.merge_and_unload()
        self.model.eval()

        self.pipeline = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
            device=self.device,
            torch_dtype=torch.float16,
        )

    def __call__(self, data: Dict[str, Any]) -> Any:
        """Run the pipeline on a request payload.

        Args:
            data: Request payload. ``data["inputs"]`` is the pipeline input;
                when the key is absent, the remaining payload (minus
                ``"parameters"``) is used as the input. ``data["parameters"]``
                is an optional dict of extra keyword arguments forwarded to
                the pipeline.

        Returns:
            Whatever ``self.pipeline`` returns, unmodified.

        A dict input is expanded into keyword arguments (the original
        behaviour); any other input (e.g. a prompt string or a list of
        prompts) is passed positionally, since ``**`` unpacking would raise
        ``TypeError`` for it.
        """
        # Work on a shallow copy so the caller's payload is not mutated.
        payload = dict(data)
        parameters = payload.pop("parameters", None) or {}
        inputs = payload.pop("inputs", payload)

        if isinstance(inputs, dict):
            return self.pipeline(**inputs, **parameters)
        return self.pipeline(inputs, **parameters)