import numpy as np
import triton_python_backend_utils as pb_utils
from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler


class TritonPythonModel:
    """Every Python model must use "TritonPythonModel" as its class name;
    Triton locates the model's entry point by this exact name.
    """
    @staticmethod
    def auto_complete_config(auto_complete_model_config):
        """`auto_complete_config` is called only once when the model is being
        loaded, provided the server was not started with
        `--disable-auto-complete-config`. Implementing this function is
        optional; leaving it unimplemented simply leaves the configuration
        unchanged. This function can be used to set `max_batch_size`, `input`,
        and `output` properties of the model using `set_max_batch_size`,
        `add_input`, and `add_output`. These properties allow Triton to load
        the model with a minimal configuration file, or with none at all. The
        function returns the `pb_utils.ModelConfig` object with these
        properties; you can use the `as_dict` function to gain read-only
        access to it. The `pb_utils.ModelConfig` object returned from here is
        used as the final configuration for the model.

        Note: The Python interpreter used to invoke this function will be
        destroyed upon returning from this function, and as a result none of
        the objects created here will be available in the `initialize`,
        `execute`, or `finalize` functions.

        Parameters
        ----------
        auto_complete_model_config : pb_utils.ModelConfig
          An object containing the existing model configuration. You can build
          upon the configuration given by this object when setting the
          properties for this model.

        Returns
        -------
        pb_utils.ModelConfig
          An object containing the auto-completed model configuration
        """
        inputs = [{
            'name': 'PROMPT',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }, {
            'name': 'IMAGES',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        outputs = [{
            'name': 'RESULTS',
            'data_type': 'TYPE_STRING',
            'dims': [-1]
        }]

        # Only add the inputs/outputs that the existing configuration does
        # not already define, so an explicit config.pbtxt always wins.
        config = auto_complete_model_config.as_dict()
        input_names = [inp['name'] for inp in config['input']]
        output_names = [out['name'] for out in config['output']]

        for inp in inputs:
            if inp['name'] not in input_names:
                auto_complete_model_config.add_input(inp)
        for out in outputs:
            if out['name'] not in output_names:
                auto_complete_model_config.add_output(out)

        auto_complete_model_config.set_dynamic_batching()

        return auto_complete_model_config
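
    # For reference, the auto-completed configuration above is roughly
    # equivalent to this config.pbtxt sketch (note that clients see
    # TYPE_STRING tensors as the "BYTES" datatype on the wire):
    #
    #   input [
    #     { name: "PROMPT", data_type: TYPE_STRING, dims: [ -1 ] },
    #     { name: "IMAGES", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   output [
    #     { name: "RESULTS", data_type: TYPE_STRING, dims: [ -1 ] }
    #   ]
    #   dynamic_batching { }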

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.
        Implementing the `initialize` function is optional. This function
        allows the model to initialize any state associated with this model.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device
            ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        # Load the CLIP projector and the quantized LLaVA weights baked into
        # the container at /weights.
        chat_handler = Llava15ChatHandler(
            clip_model_path="/weights/mmproj-model-f16.gguf", verbose=True)
        self.model = Llama(
            model_path="/weights/ggml-model-q4_0.gguf",
            chat_handler=chat_handler,
            n_ctx=2048,        # context window, in tokens
            logits_all=True,   # needed for the LLaVA chat handler
            n_gpu_layers=-1,   # offload all layers to the GPU
        )
        print('Initialized...')
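
    # A possible refinement (sketch only, not wired in): the hard-coded weight
    # paths above could instead come from a `parameters` block in config.pbtxt,
    # parsed from the model configuration passed to `initialize`:
    #
    #   import json
    #   cfg = json.loads(args["model_config"])
    #   clip_path = cfg["parameters"]["clip_model_path"]["string_value"]
    #
    # `clip_model_path` is a hypothetical parameter name, not one the existing
    # config defines.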

    def run_inference(self, prompt, image):
        # The chat handler expects images as data URIs; `image` is assumed to
        # be a base64-encoded PNG.
        image_data = f"data:image/png;base64,{image}"
        messages = [
            {"role": "system",
             "content": "You are an assistant who perfectly describes images."},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_data}},
                    {"type": "text", "text": prompt},
                ],
            },
        ]
        result = self.model.create_chat_completion(messages=messages)
        output_string = result["choices"][0]["message"]["content"]
        # Triton represents TYPE_STRING tensors as object arrays of bytes.
        output_data = np.array([output_string.encode('utf-8')], dtype=object)
        return output_data
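
    # Standalone usage sketch (outside Triton), assuming `image.png` exists
    # and `model` is an initialized TritonPythonModel instance:
    #
    #   import base64
    #   with open("image.png", "rb") as f:
    #       b64 = base64.b64encode(f.read()).decode("utf-8")
    #   answer = model.run_inference("Describe this image.", b64)[0].decode()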

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        responses = []

        for request in requests:
            # TYPE_STRING tensors arrive as numpy object arrays of UTF-8
            # bytes; each request carries a single prompt and a single image.
            prompt = pb_utils.get_input_tensor_by_name(
                request, "PROMPT").as_numpy()[0].decode("UTF-8")
            image = pb_utils.get_input_tensor_by_name(
                request, "IMAGES").as_numpy()[0].decode("UTF-8")

            results = self.run_inference(prompt, image)

            inference_response = pb_utils.InferenceResponse(output_tensors=[
                pb_utils.Tensor("RESULTS", results)
            ])
            responses.append(inference_response)

        return responses
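
    # Client-side sketch using tritonclient (assumes
    # `pip install tritonclient[http]`; "llava" is a placeholder model name
    # and `b64_png` a base64-encoded PNG string):
    #
    #   import numpy as np
    #   import tritonclient.http as httpclient
    #
    #   client = httpclient.InferenceServerClient(url="localhost:8000")
    #   prompt = httpclient.InferInput("PROMPT", [1], "BYTES")
    #   prompt.set_data_from_numpy(
    #       np.array([b"Describe this image."], dtype=object))
    #   images = httpclient.InferInput("IMAGES", [1], "BYTES")
    #   images.set_data_from_numpy(np.array([b64_png.encode()], dtype=object))
    #   result = client.infer("llava", inputs=[prompt, images])
    #   print(result.as_numpy("RESULTS")[0].decode())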

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing the `finalize` function is optional. This function allows
        the model to perform any necessary cleanup before exit.
        """
        print('Cleaning up...')
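        # Dropping the model handle lets llama.cpp free its memory when this
        # instance is unloaded (a minimal cleanup sketch; assumes no other
        # references to the model are held).
        self.model = None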