MrOvkill committed on
Commit
deb69ba
1 Parent(s): d82a4e5

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +13 -9
handler.py CHANGED
@@ -4,8 +4,11 @@ from typing import Dict, List, Any
4
  import torch
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
 
7
- MAX_TOKENS=8192
8
- GPU_LAYERS=99 if torch.cuda.is_available() else 0
 
 
 
9
 
10
  class EndpointHandler():
11
  def __init__(self, data):
@@ -17,19 +20,20 @@ class EndpointHandler():
17
 
18
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
19
  inputs = data.pop("inputs", "Q: What is the chemical composition of common concrete in 2024?\nA: ")
20
- max_length = data.pop("max_length", 1024)
21
  try:
22
- max_length = int(max_length)
23
  except Exception as e:
24
  return json.dumps({
25
  "status": "error",
26
  "reason": "max_length was passed as something that was absolutely not a plain old int"
27
  })
28
 
29
- res = self.model(f"""
30
- <|user|>
31
- {inputs} <|end|>
32
- <|assistant|>
33
- """, max_new_tokens=max_new_tokens, do_sample=False)
 
34
 
35
  return res
 
4
  import torch
5
  from transformers import AutoModelForCausalLM, AutoTokenizer
6
 
7
+ PROMPT_FORMAT= """
8
+ <|user|>
9
+ {inputs} <|end|>
10
+ <|assistant|>
11
+ """
12
 
13
  class EndpointHandler():
14
  def __init__(self, data):
 
20
 
21
  def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
22
  inputs = data.pop("inputs", "Q: What is the chemical composition of common concrete in 2024?\nA: ")
23
+ max_new_tokens = data.pop("max_length", 1024)
24
  try:
25
+ max_new_tokens = int(max_new_tokens)
26
  except Exception as e:
27
  return json.dumps({
28
  "status": "error",
29
  "reason": "max_length was passed as something that was absolutely not a plain old int"
30
  })
31
 
32
+ res = PROMPT_FORMAT.format(inputs=inputs)
33
+ return self.model(
34
+ res,
35
+ max_new_tokens=max_new_tokens,
36
+ do_sample=False
37
+ )
38
 
39
  return res