add handler
- .gitignore        +1 -0
- handler.py        +92 -0
- requirements.txt  +3 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+llama_env/
handler.py
ADDED
@@ -0,0 +1,92 @@
+from llama_cpp import Llama
+from typing import Dict, List, Any
+import os
+
+
+class EndpointHandler:
+    def __init__(self):
+        # Construct the model path assuming the model is in the same directory as the handler file
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        model_filename = "Phi-3-medium-128k-instruct-IQ2_XS.gguf"
+        self.model_path = os.path.join(script_dir, model_filename)
+
+        # Load the GGUF model using llama_cpp
+        self.llm = Llama(
+            model_path=self.model_path,
+            n_ctx=5000,      # Set context length to 5000 tokens
+            n_threads=12,    # Adjust the number of CPU threads as per your machine
+            n_gpu_layers=4   # Adjust based on GPU availability
+        )
+
+        # Define generation kwargs for the model
+        self.generation_kwargs = {
+            "max_tokens": 400,  # Respond with up to 400 tokens
+            "stop": ["<|end|>", "<|user|>", "<|assistant|>"],
+            "top_k": 1  # Greedy decoding
+        }
+
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Data args:
+            inputs (:obj:`dict`): The input prompts for the LLM including system instructions and user messages.
+
+        Return:
+            A :obj:`list` | `dict`: will be serialized and returned.
+        """
+        # Extract inputs
+        inputs = data.get("inputs", {})
+        system_instructions = inputs.get("system", "")
+        user_message = inputs.get("message", "")
+
+        if not user_message:
+            raise ValueError("No user message provided for the model.")
+
+        # Combine system instructions and user message
+        final_input = f"{system_instructions}\n{user_message}"
+
+        # Run inference with llama_cpp
+        response = self.llm.create_chat_completion(
+            messages=[
+                {"role": "system", "content": system_instructions},
+                {"role": "user", "content": user_message}
+            ],
+            **self.generation_kwargs
+        )
+
+        # Access generated text based on the response structure
+        try:
+            generated_text = response["choices"][0]["message"].get("content", "")
+        except (KeyError, IndexError):
+            raise ValueError("Unexpected response structure: missing 'content' in 'choices[0]['message']'")
+
+        # Return the generated text
+        return [{"generated_text": generated_text}]
+
+
+# Example usage:
+if __name__ == "__main__":
+    # Instantiate the handler ONCE
+    handler = EndpointHandler()
+
+    # Handlers can be called multiple times with different inputs and the model will remain in memory
+    data1 = {
+        "inputs": {
+            "system": "You are a helpful assistant.",
+            "message": "What is the meaning of life?"
+        }
+    }
+
+    data2 = {
+        "inputs": {
+            "system": "You are a knowledgeable assistant.",
+            "message": "Tell me about the history of the internet."
+        }
+    }
+
+    # First call - model already in memory
+    response1 = handler(data1)
+    print(response1)
+
+    # Second call - model still in memory
+    response2 = handler(data2)
+    print(response2)
requirements.txt
ADDED
@@ -0,0 +1,3 @@
+llama-cpp-python
+torch
+transformers
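
For reference, a minimal client-side sketch of calling a deployed copy of this handler. The request payload shape (inputs.system / inputs.message) and the response shape ([{"generated_text": ...}]) follow handler.py above; the endpoint URL, the token, and the use of the requests library are placeholders and assumptions, not part of this commit.

import requests

ENDPOINT_URL = "https://<your-endpoint>.example.com"  # placeholder, not a real endpoint
HF_TOKEN = "hf_..."  # placeholder token

# Payload shape expected by EndpointHandler.__call__ in handler.py
payload = {
    "inputs": {
        "system": "You are a helpful assistant.",
        "message": "Give me a one-sentence summary of llama.cpp."
    }
}

resp = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json=payload,
    timeout=120,
)
resp.raise_for_status()

# handler.py returns a list of the form [{"generated_text": "..."}]
print(resp.json()[0]["generated_text"])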