salma-remyx committed on
Commit
0ea9ef5
1 Parent(s): 161f871

Add SpaceLLaVA Triton Server

docker/Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM nvcr.io/nvidia/tritonserver:22.11-py3
+
+ WORKDIR /workspace
+
+ RUN apt-get update && apt-get install cmake -y
+
+ RUN pip install --upgrade pip && pip install --upgrade tensorrt
+
+ RUN git clone https://github.com/NVIDIA/TensorRT.git -b main --single-branch \
+     && cd TensorRT \
+     && git submodule update --init --recursive
+
+ ENV TRT_OSSPATH=/workspace/TensorRT
+ WORKDIR ${TRT_OSSPATH}
+
+ RUN mkdir -p build \
+     && cd build \
+     && cmake .. -DTRT_OUT_DIR=$PWD/out \
+     && cd plugin \
+     && make -j$(nproc)
+
+ ENV PLUGIN_LIBS="${TRT_OSSPATH}/build/out/libnvinfer_plugin.so"
+
+ WORKDIR /weights
+ RUN wget https://huggingface.co/remyxai/SpaceLLaVA/resolve/main/ggml-model-q4_0.gguf
+ RUN wget https://huggingface.co/remyxai/SpaceLLaVA/resolve/main/mmproj-model-f16.gguf
+
+ RUN python3 -m pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.45 --force-reinstall --no-cache-dir
+
+ WORKDIR /models
+ COPY ./models/ .
+
+ WORKDIR /workspace
+ CMD ["tritonserver", "--model-store=/models"]
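
For reference, a minimal sketch of building and launching this image (the tag name and host commands are illustrative, not part of the commit; 8000/8001/8002 are Triton's default HTTP, gRPC, and metrics ports, and the GPU flag is needed since llama-cpp-python is built with cuBLAS and loads all layers on GPU):

    docker build -t spacellava-triton -f docker/Dockerfile docker/
    docker run --rm --gpus all -p 8000:8000 -p 8001:8001 -p 8002:8002 spacellava-triton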
docker/client.py ADDED
@@ -0,0 +1,62 @@
+ import argparse
+ import time
+ import base64
+ import numpy as np
+ import requests
+ import os
+ from urllib.parse import urlparse
+ from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput
+
+ def download_image(image_url):
+     parsed_url = urlparse(image_url)
+     filename = os.path.basename(parsed_url.path)
+     response = requests.get(image_url)
+     if response.status_code == 200:
+         with open(filename, 'wb') as img_file:
+             img_file.write(response.content)
+         return filename
+     else:
+         raise Exception("Failed to download image")
+
+ def image_to_base64_data_uri(image_input):
+     with open(image_input, "rb") as img_file:
+         base64_data = base64.b64encode(img_file.read()).decode('utf-8')
+         return base64_data
+
+ def setup_argparse():
+     parser = argparse.ArgumentParser(description="Client for Triton Inference Server")
+     parser.add_argument("--image_path", type=str, required=True, help="Path to the image or URL of the image to process")
+     parser.add_argument("--prompt", type=str, required=True, help="Prompt to be used for the inference")
+     return parser.parse_args()
+
+ if __name__ == "__main__":
+     args = setup_argparse()
+
+     triton_client = InferenceServerClient(url="localhost:8000", verbose=False)
+
+     if args.image_path.startswith('http://') or args.image_path.startswith('https://'):
+         image_path = download_image(args.image_path)
+     else:
+         image_path = args.image_path
+
+     image_data = image_to_base64_data_uri(image_path).encode('utf-8')
+     image_data_np = np.array([image_data], dtype=object)
+     prompt_np = np.array([args.prompt.encode('utf-8')], dtype=object)
+
+     images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")
+     images_in.set_data_from_numpy(image_data_np, binary_data=True)
+     prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")
+     prompt_in.set_data_from_numpy(prompt_np, binary_data=True)
+
+     results_out = InferRequestedOutput(name="RESULTS", binary_data=False)
+
+     start_time = time.time()
+     response = triton_client.infer(model_name="spacellava",
+                                    model_version="1",
+                                    inputs=[prompt_in, images_in],
+                                    outputs=[results_out])
+
+     results = response.get_response()["outputs"][0]["data"][0]
+     print("--- %s seconds ---" % (time.time() - start_time))
+     print(results)
+
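
Assuming the server above is running and reachable on localhost:8000, a sketch of invoking this client from the host (the image path and prompt are placeholders; the Triton HTTP client and its dependencies must be installed locally):

    pip install "tritonclient[http]" requests numpy
    python3 docker/client.py --image_path /path/to/image.png --prompt "Describe the spatial relationships between the objects in this image."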
docker/models/spacellava/1/model.py ADDED
@@ -0,0 +1,174 @@
+ import os
+ import torch
+ import numpy as np
+ import triton_python_backend_utils as pb_utils
+ from llama_cpp import Llama
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
+
+
+ class TritonPythonModel:
+     """Your Python model must use the same class name. Every Python model
+     that is created must have "TritonPythonModel" as the class name.
+     """
+
+     @staticmethod
+     def auto_complete_config(auto_complete_model_config):
+         """`auto_complete_config` is called only once when loading the model
+         assuming the server was not started with
+         `--disable-auto-complete-config`. Implementing this function is
+         optional. No implementation of `auto_complete_config` will do nothing.
+         This function can be used to set `max_batch_size`, `input` and `output`
+         properties of the model using `set_max_batch_size`, `add_input`, and
+         `add_output`. These properties will allow Triton to load the model with
+         minimal model configuration in absence of a configuration file. This
+         function returns the `pb_utils.ModelConfig` object with these
+         properties. You can use the `as_dict` function to gain read-only access
+         to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
+         being returned from here will be used as the final configuration for
+         the model.
+
+         Note: The Python interpreter used to invoke this function will be
+         destroyed upon returning from this function and as a result none of the
+         objects created here will be available in the `initialize`, `execute`,
+         or `finalize` functions.
+
+         Parameters
+         ----------
+         auto_complete_model_config : pb_utils.ModelConfig
+           An object containing the existing model configuration. You can build
+           upon the configuration given by this object when setting the
+           properties for this model.
+
+         Returns
+         -------
+         pb_utils.ModelConfig
+           An object containing the auto-completed model configuration
+         """
+         inputs = [{
+             'name': 'PROMPT',
+             'data_type': 'TYPE_STRING',
+             'dims': [-1]
+         }, {
+             'name': 'IMAGES',
+             'data_type': 'TYPE_STRING',  # Changed from TYPE_FP16 to TYPE_STRING
+             'dims': [-1]  # Changed to indicate a variable-length array of strings
+         }]
+
+         outputs = [{
+             'name': 'RESULTS',
+             'data_type': 'TYPE_STRING',
+             'dims': [-1]
+         }]
+
+         config = auto_complete_model_config.as_dict()
+         input_names = []
+         output_names = []
+         for input in config['input']:
+             input_names.append(input['name'])
+         for output in config['output']:
+             output_names.append(output['name'])
+
+         for input in inputs:
+             if input['name'] not in input_names:
+                 auto_complete_model_config.add_input(input)
+         for output in outputs:
+             if output['name'] not in output_names:
+                 auto_complete_model_config.add_output(output)
+
+         auto_complete_model_config.set_dynamic_batching()
+
+         return auto_complete_model_config
+
+     def initialize(self, args):
+         """`initialize` is called only once when the model is being loaded.
+         Implementing `initialize` function is optional. This function allows
+         the model to initialize any state associated with this model.
+
+         Parameters
+         ----------
+         args : dict
+           Both keys and values are strings. The dictionary keys and values are:
+           * model_config: A JSON string containing the model configuration
+           * model_instance_kind: A string containing model instance kind
+           * model_instance_device_id: A string containing model instance device
+             ID
+           * model_repository: Model repository path
+           * model_version: Model version
+           * model_name: Model name
+         """
+         chat_handler = Llava15ChatHandler(clip_model_path="/weights/mmproj-model-f16.gguf", verbose=True)
+         self.model = Llama(model_path="/weights/ggml-model-q4_0.gguf", chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=-1)
+         print('Initialized...')
+
+     def run_inference(self, prompt, image):
+         image_data = f"data:image/png;base64,{image}"
+         messages = [
+             {"role": "system", "content": "You are an assistant who perfectly describes images."},
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image_url", "image_url": {"url": image_data}},
+                     {"type": "text", "text": prompt}
+                 ]
+             }
+         ]
+         result = self.model.create_chat_completion(messages=messages)
+         output_string = result["choices"][0]["message"]["content"]
+         output_data = np.array([output_string.encode('utf-8')], dtype=object)
+         return output_data
+
+     def execute(self, requests):
+         """`execute` must be implemented in every Python model. `execute`
+         function receives a list of pb_utils.InferenceRequest as the only
+         argument. This function is called when an inference is requested
+         for this model.
+
+         Parameters
+         ----------
+         requests : list
+           A list of pb_utils.InferenceRequest
+
+         Returns
+         -------
+         list
+           A list of pb_utils.InferenceResponse. The length of this list must
+           be the same as `requests`
+         """
+
+         responses = []
+
+         for request in requests:
+             # Perform inference on the request and append it to responses
+             # list...
+             prompt = [
+                 t.decode("UTF-8")
+                 for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
+                 .as_numpy()
+                 .tolist()
+             ][0]
+             image = [
+                 t.decode("UTF-8")
+                 for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
+                 .as_numpy()
+                 .tolist()
+             ][0]
+             results = self.run_inference(prompt, image)
+
+             # Sending results
+             inference_response = pb_utils.InferenceResponse(output_tensors=[
+                 pb_utils.Tensor(
+                     "RESULTS",
+                     results,
+                 )
+             ])
+
+             responses.append(inference_response)
+
+         return responses
+
+     def finalize(self):
+         """`finalize` is called only once when the model is being unloaded.
+         Implementing `finalize` function is optional. This function allows
+         the model to perform any necessary clean ups before exit.
+         """
+         print('Cleaning up...')
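
Once the container is up, Triton's standard HTTP endpoints can confirm that this Python backend loaded; the host and port below assume the default mapping shown earlier:

    curl -s localhost:8000/v2/health/ready
    curl -s localhost:8000/v2/models/spacellava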
docker/models/spacellava/config.pbtxt ADDED
@@ -0,0 +1,24 @@
+ name: "spacellava"
+ max_batch_size: 0
+ backend: "python"
+
+ input [
+   {
+     name: "PROMPT"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   },
+   {
+     name: "IMAGES"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   }
+ ]
+
+ output [
+   {
+     name: "RESULTS"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   }
+ ]
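
Together with the Dockerfile's COPY ./models/ . step, these files form the model repository Triton reads at startup:

    /models
    └── spacellava
        ├── config.pbtxt
        └── 1
            └── model.py

The config declares variable-length string tensors for PROMPT, IMAGES, and RESULTS, matching what client.py sends and what the Python backend in model.py returns.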