philschmid (HF staff) committed
Commit 792e040
1 Parent(s): f81a275

Create handler.py

Files changed (1)
  1. handler.py +38 -0
handler.py ADDED
@@ -0,0 +1,38 @@
+ from typing import Dict, List, Any
+ import torch
+ from transformers import AutoTokenizer
+ from auto_gptq import AutoGPTQForCausalLM
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Preload everything needed at inference time:
+         # the tokenizer and the GPTQ-quantized model.
+         self.tokenizer = AutoTokenizer.from_pretrained("philschmid/falcon-40b-instruct-GPTQ-inference-endpoints", use_fast=False)
+         self.model = AutoGPTQForCausalLM.from_quantized("philschmid/falcon-40b-instruct-GPTQ-inference-endpoints", device="cuda:0", use_triton=False, use_safetensors=True, torch_dtype=torch.float32, trust_remote_code=True)
+
+
+     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+         """
+         data args:
+             inputs (:obj:`str`): the prompt to generate from
+             parameters (:obj:`dict`, optional): kwargs forwarded to `model.generate`
+         Return:
+             A :obj:`list` of :obj:`dict` that will be serialized and returned
+         """
+         # process input
+         inputs = data.pop("inputs", data)
+         parameters = data.pop("parameters", None)
+
+         # preprocess: tokenize and move the input ids to the GPU the model was loaded on
+         input_ids = self.tokenizer(inputs, return_tensors="pt").input_ids.to("cuda:0")
+
+         # pass inputs with all generation kwargs from the request
+         if parameters is not None:
+             outputs = self.model.generate(input_ids, **parameters)
+         else:
+             outputs = self.model.generate(input_ids)
+
+         # postprocess the prediction
+         prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+         return [{"generated_text": prediction}]
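
For reference, a minimal local smoke test of this handler could look like the sketch below. It is not part of the commit; it assumes the file above is saved as handler.py, a CUDA GPU with enough memory for the quantized Falcon-40B weights, and the transformers and auto-gptq packages installed. The prompt and generation parameters are illustrative, not prescribed by the repository.

# hypothetical local test -- not part of this commit
from handler import EndpointHandler

handler = EndpointHandler()
payload = {
    "inputs": "What is GPTQ quantization?",
    "parameters": {"max_new_tokens": 64, "do_sample": True, "temperature": 0.7},
}
result = handler(payload)
print(result[0]["generated_text"])

The payload mirrors the shape Inference Endpoints sends to the handler: an "inputs" string plus an optional "parameters" dict that is forwarded directly to model.generate.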