File size: 6,733 Bytes

0ea9ef5

import os
import torch
import numpy as np
import triton_python_backend_utils as pb_utils
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """`auto_complete_config` is called only once when loading the model
        assuming the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional. No implementation of `auto_complete_config` will do nothing.
        This function can be used to set `max_batch_size`, `input` and `output`
        properties of the model using `set_max_batch_size`, `add_input`, and
        `add_output`. These properties will allow Triton to load the model with
        minimal model configuration in absence of a configuration file. This
        function returns the `pb_utils.ModelConfig` object with these
        properties. You can use the `as_dict` function to gain read-only access
        to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
        being returned from here will be used as the final configuration for
        the model.

        Note: The Python interpreter used to invoke this function will be
        destroyed upon returning from this function and as a result none of the
        objects created here will be available in the `initialize`, `execute`,
        or `finalize` functions.

        Parameters
        ----------
        auto_complete_model_config : pb_utils.ModelConfig
          An object containing the existing model configuration. You can build
          upon the configuration given by this object when setting the
          properties for this model.

        Returns
        -------
        pb_utils.ModelConfig
          An object containing the auto-completed model configuration
        """
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',  # Changed from TYPE_FP16 to TYPE_STRING
            'dims': [-1]  # Changed to indicate a variable-length array of strings
        }]

        outputs = [{
            'name': 'RESULTS',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        config = auto_complete_model_config.as_dict()
        input_names = []
        output_names = []
        for input in config['input']:
            input_names.append(input['name'])
        for output in config['output']:
            output_names.append(output['name'])

        for input in inputs:
            if input['name'] not in input_names:
                auto_complete_model_config.add_input(input)
        for output in outputs:
            if output['name'] not in output_names:
                auto_complete_model_config.add_output(output)

        auto_complete_model_config.set_dynamic_batching()

        return auto_complete_model_config

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device
            ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        chat_handler = Llava15ChatHandler(clip_model_path="/weights/mmproj-model-f16.gguf", verbose=True)
        self.model = Llama(model_path="/weights/ggml-model-q4_0.gguf", chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=-1)
        print('Initialized...')

    def run_inference(self, prompt, image):
        image_data = f"data:image/png;base64,{image}"
        messages = [
            {"role": "system", "content": "You are an assistant who perfectly describes images."},
             {
                 "role": "user",
                 "content": [
                     {"type": "image_url", "image_url": {"url": image_data}},
                      {"type" : "text", "text": prompt}
                     ]
                 }
            ]
        result = self.model.create_chat_completion(messages=messages)
        output_string = result["choices"][0]["message"]["content"]
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        for request in requests:
            # Perform inference on the request and append it to responses
            # list...
            prompt = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
                .as_numpy()
                .tolist()
            ][0]
            image = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
                .as_numpy()
                .tolist()
            ][0]
            results = self.run_inference(prompt, image)

            # Sending results
            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor(
                    "RESULTS",
                    results,
                )
            ])

            responses.append(inference_response)

        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')