from collections.abc import Mapping
from typing import Any, Dict, List, Union

from optimum.onnxruntime import ORTModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline


class EndpointHandler:
    """Question-answering request handler backed by an ONNX Runtime model.

    On construction the handler loads the ONNX model file
    ``model_optimized_quantized.onnx`` and the matching tokenizer from
    ``path`` and wraps both in a ``question-answering`` pipeline. Calling
    the instance runs that pipeline on a request payload.
    """

    def __init__(self, path: str = "") -> None:
        """Load the model and tokenizer and build the inference pipeline.

        Args:
            path: Location passed to ``from_pretrained`` for both the model
                and the tokenizer.
        """
        # Load the ONNX export named explicitly rather than the default file.
        self.model = ORTModelForQuestionAnswering.from_pretrained(
            path, file_name="model_optimized_quantized.onnx"
        )
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # Build the pipeline once so every request reuses the loaded model.
        self.pipeline = pipeline(
            "question-answering", model=self.model, tokenizer=self.tokenizer
        )

    def __call__(self, data: Any) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
        """Run question answering on a request payload.

        Args:
            data: Request payload. If it has an ``"inputs"`` key, that value
                is used as the pipeline's keyword arguments; otherwise the
                payload itself is used. Any other keys next to ``"inputs"``
                are ignored.

        Returns:
            Whatever the question-answering pipeline returns for the given
            keyword arguments, unchanged.

        Raises:
            TypeError: If the resolved inputs are not a mapping and therefore
                cannot be expanded into keyword arguments.
        """
        inputs = data.get("inputs", data)
        # Fail with an actionable message instead of the interpreter's
        # generic "argument after ** must be a mapping" error.
        if not isinstance(inputs, Mapping):
            raise TypeError(
                "'inputs' must be a mapping of pipeline keyword arguments "
                "(e.g. {'question': ..., 'context': ...}), "
                f"got {type(inputs).__name__}"
            )
        return self.pipeline(**inputs)