from typing import Dict, List, Any

from llama_cpp import Llama


class EndpointHandler():
    def __init__(self, path="", vision_model="obsidian3b"):
        # Load the Q8_0-quantized Gemma 2B GGUF weights from the Hugging Face Hub.
        self.model = Llama.from_pretrained(
            "MrOvkill/gemma-2-inference-endpoint-GGUF",
            filename="gemma-2b.q8_0.gguf",
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str`)
            image (:obj:`Image`)
        Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """
        # get inputs
        inputs = data.pop("inputs", "")

        # Accept "temperature" or the shorthand "temp"; default to 0.33.
        # Compare against None so an explicit temperature of 0 is not discarded.
        temperature = data.pop("temperature", None)
        if temperature is None:
            temperature = data.pop("temp", 0.33)
        if temperature > 3 or temperature < 0:
            # Return a plain dict; per the docstring contract, the return
            # value is serialized downstream.
            return {"status": "error", "reason": "invalid temperature ( 0.0 - 3.0 )"}

        # top_p is a probability mass, so it must lie in [0, 1].
        top_p = data.pop("top-p", 0.85)
        if top_p > 1 or top_p < 0:
            return {"status": "error", "reason": "invalid top percentage ( 0.01 - 1.00 )"}

        top_k = data.pop("top-k", 42)
        if top_k > 99 or top_k < 1:
            return {"status": "error", "reason": "invalid top k ( 1 - 99 )"}

        # image = data.pop("image", None)

        # Run text-only completion, passing the validated sampling parameters
        # (top_k was previously hardcoded to 42 instead of using the variable).
        res = self.model(inputs, temperature=temperature, top_p=top_p, top_k=top_k)
        return res

        # --- Planned vision path (Obsidian 3B), not yet wired up ---
        # inputs = self.processor(inputs, image, return_tensors="pt")
        # res = self.model.generate(**inputs, do_sample=False, max_new_tokens=4096)
        # return self.processor.decode(res[0], skip_special_tokens=True)
        # if image:
        #     # perform image classification using Obsidian 3b vision
        #     image_features = self.vision.encode_image(image)
        #     image_embedding = self.vision.extract_feature(image_features)
        #     image_caption = self.vision.generate_caption(image_embedding)
        #     # combine text and image captions
        #     combined_captions = [inputs, image_caption]
        #     # run text classification on combined captions
        #     prediction = self.pipeline(combined_captions, temperature=0.33, num_beams=5, stop=[], do_sample=True)
        #     return prediction
        # else:
        #     # run text classification on plain text input
        #     prediction = self.pipeline(inputs, temperature=0.33, num_beams=5, stop=[], do_sample=True)
        #     return prediction
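

# A minimal local smoke test, as a sketch: constructing EndpointHandler
# downloads the GGUF weights from the Hub, so this assumes network access
# and enough RAM for the 2B Q8_0 model. The payload keys mirror the ones
# parsed in __call__ above; the prompt string itself is illustrative.
if __name__ == "__main__":
    handler = EndpointHandler()
    out = handler({
        "inputs": "Q: What is a GGUF file? A:",
        "temperature": 0.33,
        "top-p": 0.85,
        "top-k": 42,
    })
    # llama-cpp-python completions return a dict with a "choices" list.
    print(out)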