salma-remyx committed
Commit 0ea9ef5
Parent(s): 161f871

Add SpaceLLaVA Triton Server

Files changed:
- docker/Dockerfile                      +35 -0
- docker/client.py                       +62 -0
- docker/models/spacellava/1/model.py    +174 -0
- docker/models/spacellava/config.pbtxt  +24 -0
docker/Dockerfile
ADDED
@@ -0,0 +1,35 @@
FROM nvcr.io/nvidia/tritonserver:22.11-py3

WORKDIR /workspace

RUN apt-get update && apt-get install cmake -y

RUN pip install --upgrade pip && pip install --upgrade tensorrt

RUN git clone https://github.com/NVIDIA/TensorRT.git -b main --single-branch \
    && cd TensorRT \
    && git submodule update --init --recursive

ENV TRT_OSSPATH=/workspace/TensorRT
WORKDIR ${TRT_OSSPATH}

RUN mkdir -p build \
    && cd build \
    && cmake .. -DTRT_OUT_DIR=$PWD/out \
    && cd plugin \
    && make -j$(nproc)

ENV PLUGIN_LIBS="${TRT_OSSPATH}/build/out/libnvinfer_plugin.so"

WORKDIR /weights
RUN wget https://huggingface.co/remyxai/SpaceLLaVA/resolve/main/ggml-model-q4_0.gguf
RUN wget https://huggingface.co/remyxai/SpaceLLaVA/resolve/main/mmproj-model-f16.gguf

RUN python3 -m pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.45 --force-reinstall --no-cache-dir

WORKDIR /models
COPY ./models/ .

WORKDIR /workspace
CMD ["tritonserver", "--model-store=/models"]
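
Once this image has been built and a container started with Triton's default HTTP port 8000 published (for example, docker build -t spacellava-triton docker/ followed by docker run --gpus all -p 8000:8000 spacellava-triton, where the image tag and port mapping are assumptions rather than part of this commit), a quick way to confirm the server and model are up before using client.py is a minimal tritonclient check, sketched below.

# Minimal readiness check for the Triton server started from this image.
# Assumes the container publishes Triton's default HTTP port 8000 on localhost
# and that the model name matches config.pbtxt ("spacellava").
from tritonclient.http import InferenceServerClient

client = InferenceServerClient(url="localhost:8000", verbose=False)
print("server live:", client.is_server_live())
print("spacellava ready:", client.is_model_ready("spacellava", "1"))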
docker/client.py
ADDED
@@ -0,0 +1,62 @@
import argparse
import time
import base64
import numpy as np
import requests
import os
from urllib.parse import urlparse
from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput

def download_image(image_url):
    parsed_url = urlparse(image_url)
    filename = os.path.basename(parsed_url.path)
    response = requests.get(image_url)
    if response.status_code == 200:
        with open(filename, 'wb') as img_file:
            img_file.write(response.content)
        return filename
    else:
        raise Exception("Failed to download image")

def image_to_base64_data_uri(image_input):
    with open(image_input, "rb") as img_file:
        base64_data = base64.b64encode(img_file.read()).decode('utf-8')
        return base64_data

def setup_argparse():
    parser = argparse.ArgumentParser(description="Client for Triton Inference Server")
    parser.add_argument("--image_path", type=str, required=True, help="Path to the image or URL of the image to process")
    parser.add_argument("--prompt", type=str, required=True, help="Prompt to be used for the inference")
    return parser.parse_args()

if __name__ == "__main__":
    args = setup_argparse()

    triton_client = InferenceServerClient(url="localhost:8000", verbose=False)

    if args.image_path.startswith('http://') or args.image_path.startswith('https://'):
        image_path = download_image(args.image_path)
    else:
        image_path = args.image_path

    image_data = image_to_base64_data_uri(image_path).encode('utf-8')
    image_data_np = np.array([image_data], dtype=object)
    prompt_np = np.array([args.prompt.encode('utf-8')], dtype=object)

    images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")
    images_in.set_data_from_numpy(image_data_np, binary_data=True)
    prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")
    prompt_in.set_data_from_numpy(prompt_np, binary_data=True)

    results_out = InferRequestedOutput(name="RESULTS", binary_data=False)

    start_time = time.time()
    response = triton_client.infer(model_name="spacellava",
                                   model_version="1",
                                   inputs=[prompt_in, images_in],
                                   outputs=[results_out])

    results = response.get_response()["outputs"][0]["data"][0]
    print("--- %s seconds ---" % (time.time() - start_time))
    print(results)
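
With the server running, the client can be invoked from the host with, for example, python docker/client.py --image_path <local file or http(s) URL> --prompt "Describe the scene" (the prompt here is purely illustrative). Note that the PROMPT and IMAGES inputs are declared as TYPE_STRING in config.pbtxt, which corresponds to the BYTES datatype on the client side; that is why both values are sent as single-element numpy object arrays of UTF-8-encoded bytes.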
docker/models/spacellava/1/model.py
ADDED
@@ -0,0 +1,174 @@
import os
import torch
import numpy as np
import triton_python_backend_utils as pb_utils
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler


class TritonPythonModel:
    """Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """`auto_complete_config` is called only once when loading the model
        assuming the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional. No implementation of `auto_complete_config` will do nothing.
        This function can be used to set `max_batch_size`, `input` and `output`
        properties of the model using `set_max_batch_size`, `add_input`, and
        `add_output`. These properties will allow Triton to load the model with
        minimal model configuration in absence of a configuration file. This
        function returns the `pb_utils.ModelConfig` object with these
        properties. You can use the `as_dict` function to gain read-only access
        to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
        being returned from here will be used as the final configuration for
        the model.

        Note: The Python interpreter used to invoke this function will be
        destroyed upon returning from this function and as a result none of the
        objects created here will be available in the `initialize`, `execute`,
        or `finalize` functions.

        Parameters
        ----------
        auto_complete_model_config : pb_utils.ModelConfig
          An object containing the existing model configuration. You can build
          upon the configuration given by this object when setting the
          properties for this model.

        Returns
        -------
        pb_utils.ModelConfig
          An object containing the auto-completed model configuration
        """
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',  # Changed from TYPE_FP16 to TYPE_STRING
            'dims': [-1]  # Changed to indicate a variable-length array of strings
        }]

        outputs = [{
            'name': 'RESULTS',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        config = auto_complete_model_config.as_dict()
        input_names = []
        output_names = []
        for input in config['input']:
            input_names.append(input['name'])
        for output in config['output']:
            output_names.append(output['name'])

        for input in inputs:
            if input['name'] not in input_names:
                auto_complete_model_config.add_input(input)
        for output in outputs:
            if output['name'] not in output_names:
                auto_complete_model_config.add_output(output)

        auto_complete_model_config.set_dynamic_batching()

        return auto_complete_model_config

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing `initialize` function is optional. This function allows
        the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device
            ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        chat_handler = Llava15ChatHandler(clip_model_path="/weights/mmproj-model-f16.gguf", verbose=True)
        self.model = Llama(model_path="/weights/ggml-model-q4_0.gguf", chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=-1)
        print('Initialized...')

    def run_inference(self, prompt, image):
        image_data = f"data:image/png;base64,{image}"
        messages = [
            {"role": "system", "content": "You are an assistant who perfectly describes images."},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_data}},
                    {"type": "text", "text": prompt}
                ]
            }
        ]
        result = self.model.create_chat_completion(messages=messages)
        output_string = result["choices"][0]["message"]["content"]
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """

        responses = []

        for request in requests:
            # Perform inference on the request and append it to responses
            # list...
            prompt = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
                .as_numpy()
                .tolist()
            ][0]
            image = [
                t.decode("UTF-8")
                for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
                .as_numpy()
                .tolist()
            ][0]
            results = self.run_inference(prompt, image)

            # Sending results
            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor(
                    "RESULTS",
                    results,
                )
            ])

            responses.append(inference_response)

        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print('Cleaning up...')
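
The initialize()/run_inference() path above can also be exercised outside of Triton to verify the downloaded GGUF weights before deploying. The sketch below mirrors the same llama-cpp-python calls used in model.py; the weight paths match the Dockerfile's /weights downloads, while test.png and the prompt are hypothetical placeholders.

# Standalone smoke test of the llama.cpp path used by this backend (a sketch,
# not part of the commit). Run inside the container or anywhere the GGUF
# weights are available at the paths below.
import base64
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

chat_handler = Llava15ChatHandler(clip_model_path="/weights/mmproj-model-f16.gguf")
llm = Llama(model_path="/weights/ggml-model-q4_0.gguf", chat_handler=chat_handler,
            n_ctx=2048, logits_all=True, n_gpu_layers=-1)

# test.png is a hypothetical local image used only for this check.
with open("test.png", "rb") as f:
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

result = llm.create_chat_completion(messages=[
    {"role": "system", "content": "You are an assistant who perfectly describes images."},
    {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
        {"type": "text", "text": "Describe the spatial layout of this scene."},
    ]},
])
print(result["choices"][0]["message"]["content"])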
docker/models/spacellava/config.pbtxt
ADDED
@@ -0,0 +1,24 @@
name: "spacellava"
max_batch_size: 0
backend: "python"

input [
  {
    name: "PROMPT"
    data_type: TYPE_STRING
    dims: [ -1 ]
  },
  {
    name: "IMAGES"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]

output [
  {
    name: "RESULTS"
    data_type: TYPE_STRING
    dims: [ -1 ]
  }
]
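
After the server loads the model, the configuration it actually serves can be read back through the client library to confirm it matches this file. A minimal sketch, assuming the server is reachable on localhost:8000:

# Optional check that the served model configuration matches config.pbtxt.
from tritonclient.http import InferenceServerClient

client = InferenceServerClient(url="localhost:8000")
config = client.get_model_config("spacellava", "1")
print([i["name"] for i in config["input"]])   # expected: ['PROMPT', 'IMAGES']
print([o["name"] for o in config["output"]])  # expected: ['RESULTS']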