salma-remyx committed
Commit
0ea9ef5
1 Parent(s): 161f871

Add SpaceLLaVA Triton Server

docker/Dockerfile ADDED
@@ -0,0 +1,35 @@
+ FROM nvcr.io/nvidia/tritonserver:22.11-py3
+
+ WORKDIR /workspace
+
+ RUN apt-get update && apt-get install cmake -y
+
+ RUN pip install --upgrade pip && pip install --upgrade tensorrt
+
+ RUN git clone https://github.com/NVIDIA/TensorRT.git -b main --single-branch \
+     && cd TensorRT \
+     && git submodule update --init --recursive
+
+ ENV TRT_OSSPATH=/workspace/TensorRT
+ WORKDIR ${TRT_OSSPATH}
+
+ RUN mkdir -p build \
+     && cd build \
+     && cmake .. -DTRT_OUT_DIR=$PWD/out \
+     && cd plugin \
+     && make -j$(nproc)
+
+ ENV PLUGIN_LIBS="${TRT_OSSPATH}/build/out/libnvinfer_plugin.so"
+
+ WORKDIR /weights
+ RUN wget https://huggingface.co/remyxai/SpaceLLaVA/resolve/main/ggml-model-q4_0.gguf
+ RUN wget https://huggingface.co/remyxai/SpaceLLaVA/resolve/main/mmproj-model-f16.gguf
+
+ RUN python3 -m pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
+ RUN CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.45 --force-reinstall --no-cache-dir
+
+ WORKDIR /models
+ COPY ./models/ .
+
+ WORKDIR /workspace
+ CMD ["tritonserver", "--model-store=/models"]
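
A minimal sketch of building and running this image, assuming the docker/ directory above is the build context (so the COPY ./models/ step picks up the model repository) and that the NVIDIA Container Toolkit is available; the spacellava-triton tag is an illustrative name, not part of the commit. Ports 8000/8001/8002 are Triton's default HTTP, gRPC, and metrics endpoints:

  # build the server image from the docker/ directory
  docker build -t spacellava-triton docker/
  # run Triton with GPU access, exposing HTTP (8000), gRPC (8001), and metrics (8002)
  docker run --gpus all --rm -p 8000:8000 -p 8001:8001 -p 8002:8002 spacellava-triton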
docker/client.py ADDED
@@ -0,0 +1,62 @@
+ import argparse
+ import time
+ import base64
+ import numpy as np
+ import requests
+ import os
+ from urllib.parse import urlparse
+ from tritonclient.http import InferenceServerClient, InferInput, InferRequestedOutput
+
+ def download_image(image_url):
+     parsed_url = urlparse(image_url)
+     filename = os.path.basename(parsed_url.path)
+     response = requests.get(image_url)
+     if response.status_code == 200:
+         with open(filename, 'wb') as img_file:
+             img_file.write(response.content)
+         return filename
+     else:
+         raise Exception("Failed to download image")
+
+ def image_to_base64_data_uri(image_input):
+     with open(image_input, "rb") as img_file:
+         base64_data = base64.b64encode(img_file.read()).decode('utf-8')
+         return base64_data
+
+ def setup_argparse():
+     parser = argparse.ArgumentParser(description="Client for Triton Inference Server")
+     parser.add_argument("--image_path", type=str, required=True, help="Path to the image or URL of the image to process")
+     parser.add_argument("--prompt", type=str, required=True, help="Prompt to be used for the inference")
+     return parser.parse_args()
+
+ if __name__ == "__main__":
+     args = setup_argparse()
+
+     triton_client = InferenceServerClient(url="localhost:8000", verbose=False)
+
+     if args.image_path.startswith('http://') or args.image_path.startswith('https://'):
+         image_path = download_image(args.image_path)
+     else:
+         image_path = args.image_path
+
+     image_data = image_to_base64_data_uri(image_path).encode('utf-8')
+     image_data_np = np.array([image_data], dtype=object)
+     prompt_np = np.array([args.prompt.encode('utf-8')], dtype=object)
+
+     images_in = InferInput(name="IMAGES", shape=[1], datatype="BYTES")
+     images_in.set_data_from_numpy(image_data_np, binary_data=True)
+     prompt_in = InferInput(name="PROMPT", shape=[1], datatype="BYTES")
+     prompt_in.set_data_from_numpy(prompt_np, binary_data=True)
+
+     results_out = InferRequestedOutput(name="RESULTS", binary_data=False)
+
+     start_time = time.time()
+     response = triton_client.infer(model_name="spacellava",
+                                    model_version="1",
+                                    inputs=[prompt_in, images_in],
+                                    outputs=[results_out])
+
+     results = response.get_response()["outputs"][0]["data"][0]
+     print("--- %s seconds ---" % (time.time() - start_time))
+     print(results)
+
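
Once the server is up, the client above can be invoked from the command line. A usage sketch, assuming the standard tritonclient HTTP extra is installed on the client machine; the image URL and prompt below are placeholders:

  # client-side dependencies (not pinned by this commit)
  pip install "tritonclient[http]" requests numpy
  # query the spacellava model; --image_path accepts a local path or an http(s) URL
  python3 docker/client.py --image_path https://example.com/room.png --prompt "How far is the chair from the couch?"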
docker/models/spacellava/1/model.py ADDED
@@ -0,0 +1,174 @@
+ import os
+ import torch
+ import numpy as np
+ import triton_python_backend_utils as pb_utils
+ from llama_cpp import Llama
+ from llama_cpp.llama_chat_format import Llava15ChatHandler
+
+
+ class TritonPythonModel:
+     """Your Python model must use the same class name. Every Python model
+     that is created must have "TritonPythonModel" as the class name.
+     """
+
+     @staticmethod
+     def auto_complete_config(auto_complete_model_config):
+         """`auto_complete_config` is called only once when loading the model
+         assuming the server was not started with
+         `--disable-auto-complete-config`. Implementing this function is
+         optional. No implementation of `auto_complete_config` will do nothing.
+         This function can be used to set `max_batch_size`, `input` and `output`
+         properties of the model using `set_max_batch_size`, `add_input`, and
+         `add_output`. These properties will allow Triton to load the model with
+         minimal model configuration in absence of a configuration file. This
+         function returns the `pb_utils.ModelConfig` object with these
+         properties. You can use the `as_dict` function to gain read-only access
+         to the `pb_utils.ModelConfig` object. The `pb_utils.ModelConfig` object
+         being returned from here will be used as the final configuration for
+         the model.
+
+         Note: The Python interpreter used to invoke this function will be
+         destroyed upon returning from this function and as a result none of the
+         objects created here will be available in the `initialize`, `execute`,
+         or `finalize` functions.
+
+         Parameters
+         ----------
+         auto_complete_model_config : pb_utils.ModelConfig
+           An object containing the existing model configuration. You can build
+           upon the configuration given by this object when setting the
+           properties for this model.
+
+         Returns
+         -------
+         pb_utils.ModelConfig
+           An object containing the auto-completed model configuration
+         """
+         inputs = [{
+             'name': 'PROMPT',
+             'data_type': 'TYPE_STRING',
+             'dims': [-1]
+         }, {
+             'name': 'IMAGES',
+             'data_type': 'TYPE_STRING',  # Changed from TYPE_FP16 to TYPE_STRING
+             'dims': [-1]  # Changed to indicate a variable-length array of strings
+         }]
+
+         outputs = [{
+             'name': 'RESULTS',
+             'data_type': 'TYPE_STRING',
+             'dims': [-1]
+         }]
+
+         config = auto_complete_model_config.as_dict()
+         input_names = []
+         output_names = []
+         for input in config['input']:
+             input_names.append(input['name'])
+         for output in config['output']:
+             output_names.append(output['name'])
+
+         for input in inputs:
+             if input['name'] not in input_names:
+                 auto_complete_model_config.add_input(input)
+         for output in outputs:
+             if output['name'] not in output_names:
+                 auto_complete_model_config.add_output(output)
+
+         auto_complete_model_config.set_dynamic_batching()
+
+         return auto_complete_model_config
+
+     def initialize(self, args):
+         """`initialize` is called only once when the model is being loaded.
+         Implementing `initialize` function is optional. This function allows
+         the model to initialize any state associated with this model.
+
+         Parameters
+         ----------
+         args : dict
+           Both keys and values are strings. The dictionary keys and values are:
+           * model_config: A JSON string containing the model configuration
+           * model_instance_kind: A string containing model instance kind
+           * model_instance_device_id: A string containing model instance device
+             ID
+           * model_repository: Model repository path
+           * model_version: Model version
+           * model_name: Model name
+         """
+         chat_handler = Llava15ChatHandler(clip_model_path="/weights/mmproj-model-f16.gguf", verbose=True)
+         self.model = Llama(model_path="/weights/ggml-model-q4_0.gguf", chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=-1)
+         print('Initialized...')
+
+     def run_inference(self, prompt, image):
+         image_data = f"data:image/png;base64,{image}"
+         messages = [
+             {"role": "system", "content": "You are an assistant who perfectly describes images."},
+             {
+                 "role": "user",
+                 "content": [
+                     {"type": "image_url", "image_url": {"url": image_data}},
+                     {"type": "text", "text": prompt}
+                 ]
+             }
+         ]
+         result = self.model.create_chat_completion(messages=messages)
+         output_string = result["choices"][0]["message"]["content"]
+         output_data = np.array([output_string.encode('utf-8')], dtype=object)
+         return output_data
+
+     def execute(self, requests):
+         """`execute` must be implemented in every Python model. `execute`
+         function receives a list of pb_utils.InferenceRequest as the only
+         argument. This function is called when an inference is requested
+         for this model.
+
+         Parameters
+         ----------
+         requests : list
+           A list of pb_utils.InferenceRequest
+
+         Returns
+         -------
+         list
+           A list of pb_utils.InferenceResponse. The length of this list must
+           be the same as `requests`
+         """
+
+         responses = []
+
+         for request in requests:
+             # Perform inference on the request and append it to responses
+             # list...
+             prompt = [
+                 t.decode("UTF-8")
+                 for t in pb_utils.get_input_tensor_by_name(request, "PROMPT")
+                 .as_numpy()
+                 .tolist()
+             ][0]
+             image = [
+                 t.decode("UTF-8")
+                 for t in pb_utils.get_input_tensor_by_name(request, "IMAGES")
+                 .as_numpy()
+                 .tolist()
+             ][0]
+             results = self.run_inference(prompt, image)
+
+             # Sending results
+             inference_response = pb_utils.InferenceResponse(output_tensors=[
+                 pb_utils.Tensor(
+                     "RESULTS",
+                     results,
+                 )
+             ])
+
+             responses.append(inference_response)
+
+         return responses
+
+     def finalize(self):
+         """`finalize` is called only once when the model is being unloaded.
+         Implementing `finalize` function is optional. This function allows
+         the model to perform any necessary clean ups before exit.
+         """
+         print('Cleaning up...')
docker/models/spacellava/config.pbtxt ADDED
@@ -0,0 +1,24 @@
+ name: "spacellava"
+ max_batch_size: 0
+ backend: "python"
+
+ input [
+   {
+     name: "PROMPT"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   },
+   {
+     name: "IMAGES"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   }
+ ]
+
+ output [
+   {
+     name: "RESULTS"
+     data_type: TYPE_STRING
+     dims: [ -1 ]
+   }
+ ]
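
Taken together, these files form the Triton model repository that the Dockerfile copies into /models and serves with --model-store=/models; the PROMPT/IMAGES inputs and RESULTS output declared here match the tensor names used by client.py and model.py above:

  models/
  └── spacellava/
      ├── config.pbtxt
      └── 1/
          └── model.py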