MrOvkill committed
Commit 96aa541 · 1 Parent(s): 3541e3b

Inference endpoint for Gemma 2b: it is almost at release 0.1!

Files changed (1): handler.py (+14 -37)
handler.py CHANGED
@@ -1,19 +1,13 @@
 from typing import Dict, List, Any
 from llama_cpp import Llama
 
+MAX_TOKENS=8192
+
 class EndpointHandler():
-    def __init__(self, path="", vision_model="obsidian3b"):
+    def __init__(self):
         self.model = Llama.from_pretrained("MrOvkill/gemma-2-inference-endpoint-GGUF", filename="gemma-2b.q8_0.gguf")
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        """
-        data args:
-            inputs (:obj: `str`)
-            image (:obj: `Image`)
-        Return:
-            A :obj:`list` | `dict`: will be serialized and returned
-        """
-        # get inputs
         inputs = data.pop("inputs", "")
         temperature = data.pop("temperature", None)
         if not temperature:
@@ -35,33 +29,16 @@ class EndpointHandler():
                 "status": "error",
                 "reason": "invalid top k ( 1 - 99 )"
             })
-        #image = data.pop("image", None)
-
-        res = self.model(inputs, temperature=temperature, top_p=top_p, top_k=42)
-
-        return res
-
-        #inputs = self.processor(inputs, image, return_tensors="pt")
-        #res = self.model.generate(**inputs, do_sample=False, max_new_tokens=4096)
-        #return self.processor.decode(res[0], skip_special_tokens=True)
-
-        #if image:
-        # perform image classification using Obsidian 3b vision
-        #image_features = self.vision.encode_image(image)
-        #image_embedding = self.vision.extract_feature(image_features)
-        #image_caption = self.vision.generate_caption(image_embedding)
-
-        # combine text and image captions
-        #combined_captions = [inputs, image_caption]
-
-        # run text classification on combined captions
-        #prediction = self.pipeline(combined_captions, temperature=0.33, num_beams=5, stop=[], do_sample=True)
-
-        #return prediction
+        system_prompt = data.pop("system-prompt", "You are Gemma. Assist user with whatever they require, in a safe and moral manner.")
+        format = data.pop("format", "<startofturn>system\n{system_prompt} <endoftext>\n<startofturn>user\n{prompt} <endofturn>\n<startofturn>model")
+        try:
+            format = format.format(system_prompt = system_prompt, prompt = inputs)
+        except Exception as e:
+            return json.dumps({
+                "status": "error",
+                "reason": "invalid format"
+            })
 
+        res = self.model(format, temperature=temperature, top_p=top_p, top_k=42)
 
-        #else:
-        # run text classification on plain text input
-        # prediction = self.pipeline(inputs, temperature=0.33, num_beams=5, stop=[], do_sample=True)
-
-        # return prediction
+        return res
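
Review note: as committed, the new code path has a few loose ends. The format error branch returns json.dumps(...) but json is never imported; MAX_TOKENS is defined but never passed to the model; top_k is validated and then hard-coded to 42 in the final call; and the Inference Endpoints toolkit normally constructs the handler as EndpointHandler(path), so dropping the path parameter may raise a TypeError at startup. The default template also spells its control tokens <startofturn>/<endofturn>/<endoftext>, while Gemma's published chat format uses <start_of_turn> and <end_of_turn>, so those strings are likely tokenized as plain text; possibly intentional, but worth checking. The sketch below shows one way the handler could look with those points addressed. It is not part of this commit: the parameter-validation hunk (old lines 20-34) is elided from the diff, so the defaults and range checks are assumptions reconstructed from the visible error messages, and DEFAULT_SYSTEM_PROMPT / DEFAULT_FORMAT are names introduced here for readability.

    import json
    from typing import Any, Dict

    from llama_cpp import Llama

    MAX_TOKENS = 8192  # defined in the commit but unused there; wired into the call below

    # Defaults lifted verbatim from the commit; the constant names are editorial.
    DEFAULT_SYSTEM_PROMPT = "You are Gemma. Assist user with whatever they require, in a safe and moral manner."
    DEFAULT_FORMAT = (
        "<startofturn>system\n{system_prompt} <endoftext>\n"
        "<startofturn>user\n{prompt} <endofturn>\n"
        "<startofturn>model"
    )

    class EndpointHandler:
        def __init__(self, path: str = ""):
            # Fetch the GGUF weights from the Hub and load them with llama-cpp-python.
            self.model = Llama.from_pretrained(
                "MrOvkill/gemma-2-inference-endpoint-GGUF",
                filename="gemma-2b.q8_0.gguf",
            )

        def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
            inputs = data.pop("inputs", "")

            # The validation hunk is elided from the diff; these defaults and
            # ranges are assumptions based on the visible error messages.
            temperature = data.pop("temperature", 0.7)
            top_p = data.pop("top_p", 0.95)
            top_k = data.pop("top_k", 40)
            if not 0.0 < temperature <= 2.0:
                return {"status": "error", "reason": "invalid temperature ( 0 - 2 )"}
            if not 0.0 < top_p <= 1.0:
                return {"status": "error", "reason": "invalid top p ( 0 - 1 )"}
            if not 1 <= top_k <= 99:
                return {"status": "error", "reason": "invalid top k ( 1 - 99 )"}

            system_prompt = data.pop("system-prompt", DEFAULT_SYSTEM_PROMPT)
            template = data.pop("format", DEFAULT_FORMAT)
            try:
                prompt = template.format(system_prompt=system_prompt, prompt=inputs)
            except (KeyError, IndexError, ValueError):
                # Return a plain dict like the success path; the committed code
                # returned json.dumps(...) here without importing json.
                return {"status": "error", "reason": "invalid format"}

            # Pass the validated top_k through instead of the hard-coded 42,
            # and cap generation with the previously unused MAX_TOKENS.
            return self.model(
                prompt,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                max_tokens=MAX_TOKENS,
            )

Under those assumptions, a request would round-trip like this; the response is llama-cpp-python's standard completion dict:

    handler = EndpointHandler()
    res = handler({
        "inputs": "Explain GGUF quantization in one sentence.",
        "temperature": 0.33,
        "system-prompt": "You are Gemma. Be concise.",
    })
    print(res["choices"][0]["text"])

Returning a dict on both paths keeps serialization in one place for the serving layer, instead of mixing pre-serialized JSON strings with raw response objects.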