from typing import Dict, List, Any

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

import deepspeed
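
# This file follows the custom handler contract used by Hugging Face Inference
# Endpoints: the toolkit imports EndpointHandler from handler.py, instantiates it
# once with the local model path, and invokes it for every incoming request.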
class EndpointHandler:
    def __init__(self, path=""):
        # load model and tokenizer from the checkpoint at `path`
        tokenizer = AutoTokenizer.from_pretrained(path)
        model = AutoModelForSequenceClassification.from_pretrained(path)
        # init DeepSpeed inference engine
        ds_model = deepspeed.init_inference(
            model=model,                      # the Transformers model to accelerate
            mp_size=1,                        # number of GPUs used for model parallelism
            dtype=torch.half,                 # dtype of the weights (fp16)
            # injection_policy={"BertLayer": HFBertLayerPolicy},  # replace BertLayer with the DeepSpeed HFBertLayerPolicy
            replace_method="auto",            # let DeepSpeed automatically identify the layers to replace
            replace_with_kernel_inject=True,  # swap matched modules for DeepSpeed's optimized inference kernels
        )
        # create the accelerated pipeline on GPU 0
        self.pipeline = pipeline("text-classification", model=ds_model, tokenizer=tokenizer, device=0)
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str`): the text to classify
            parameters (:obj:`dict`, optional): kwargs forwarded to the pipeline
        Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """
        inputs = data.pop("inputs", data)
        parameters = data.pop("parameters", None)
        # pass inputs with all kwargs in data
        if parameters is not None:
            prediction = self.pipeline(inputs, **parameters)
        else:
            prediction = self.pipeline(inputs)
        # postprocess the prediction
        return prediction
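
# A minimal local smoke test (a sketch, not part of the Inference Endpoints
# request flow). The checkpoint name below is only an example of a public
# text-classification model; running this requires a CUDA GPU with the
# deepspeed and transformers packages installed.
if __name__ == "__main__":
    handler = EndpointHandler(path="distilbert-base-uncased-finetuned-sst-2-english")
    # plain request: just an input string
    print(handler({"inputs": "DeepSpeed kernel injection makes this endpoint faster."}))
    # request with pipeline kwargs forwarded via "parameters"
    print(handler({
        "inputs": "DeepSpeed kernel injection makes this endpoint faster.",
        "parameters": {"top_k": 2},  # return the two highest-scoring labels
    }))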