adbrebs committed on
Commit
c9c4226
1 Parent(s): fa2a93b

Upload 2 files

Files changed (2)
  1. handler.py +53 -0
  2. requirements.txt +1 -0
handler.py ADDED
@@ -0,0 +1,53 @@
+ import re
+ from typing import Any, Dict
+ from unsloth import FastLanguageModel
+
+
+ class EndpointHandler:
+     def __init__(self, path=""):
+         # Load the fine-tuned model and tokenizer once, at endpoint startup.
+         max_seq_length = 2048
+         dtype = None  # auto-detect (float16 / bfloat16)
+         load_in_4bit = True  # 4-bit quantization to reduce memory use
+         self.model, self.tokenizer = FastLanguageModel.from_pretrained(
+             model_name=path,  # the model you used for training
+             max_seq_length=max_seq_length,
+             dtype=dtype,
+             load_in_4bit=load_in_4bit,
+         )
+         FastLanguageModel.for_inference(self.model)  # enable native 2x faster inference
+
+     def __call__(self, data: Dict[str, Any]) -> str:
+         """
+         data args:
+             inputs (:obj:`list`): chat messages, e.g.
+                 [{"from": "human", "value": "What is a famous tall tower in Paris?"}]
+         Return:
+             A :obj:`str`: the decoded model reply, which is serialized and returned.
+         """
+         messages = data.pop("inputs", data)
+
+         inputs = self.tokenizer.apply_chat_template(
+             messages,
+             tokenize=True,
+             add_generation_prompt=True,  # must be added for generation
+             return_tensors="pt",
+         ).to("cuda")
+
+         outputs = self.model.generate(input_ids=inputs, max_new_tokens=1000, use_cache=True)
+         # batch_decode returns a list of strings; there is a single sequence here.
+         content = self.tokenizer.batch_decode(outputs)[0]
+
+         # Strip the echoed prompt ([INST] ... [/INST]) and the special tokens.
+         pattern = r'\[INST\].*?\[/INST\]'
+         content = re.sub(pattern, '', content, flags=re.DOTALL)
+         content = content.replace('<s>', '').replace('</s>', '').strip()
+
+         return content
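
For reference, a minimal local smoke test of this handler might look like the sketch below. The model id is a placeholder (not part of this commit), and a CUDA GPU is assumed, since the handler moves its inputs to "cuda":

from handler import EndpointHandler

# "your-account/your-model" is a hypothetical placeholder model id.
handler = EndpointHandler(path="your-account/your-model")
messages = [
    {"from": "human", "value": "What is a famous tall tower in Paris?"},
]
# The handler pops "inputs" from the payload and returns the decoded reply.
print(handler({"inputs": messages}))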
requirements.txt ADDED
@@ -0,0 +1 @@
+ unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git
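
Once deployed as an Inference Endpoint, the handler receives the deserialized JSON request body as its data dict. A client-side sketch, assuming that setup (the endpoint URL and token below are placeholders, not values from this commit):

import requests

response = requests.post(
    "https://<endpoint-url>",  # placeholder for the deployed endpoint URL
    headers={
        "Authorization": "Bearer <hf-token>",  # placeholder access token
        "Content-Type": "application/json",
    },
    json={"inputs": [{"from": "human", "value": "What is a famous tall tower in Paris?"}]},
)
print(response.json())  # the cleaned generation returned by the handler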