oleksandrfluxon committed
Commit 5638f9e
1 Parent(s): c7b6a1c

Update handler.py

Files changed (1)
  1. handler.py +49 -17
handler.py CHANGED
@@ -1,16 +1,47 @@
 import torch
 
 from typing import Any, Dict
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+from accelerate import dispatch_model, infer_auto_device_map
+from accelerate.utils import get_balanced_memory
 
 
 class EndpointHandler:
     def __init__(self, path=""):
+        config = AutoConfig.from_pretrained(
+            path,
+            trust_remote_code=True
+        )
+        # config.attn_config['attn_impl'] = 'triton'
+        config.init_device = 'cuda:0'  # For fast initialization directly on GPU!
+        config.max_seq_len = 4096  # (input + output) tokens can now be up to 4096
+
+
         # load model and tokenizer from path
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
-        self.model = AutoModelForCausalLM.from_pretrained(
-            path, device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
+        self.tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b', padding_side="left")
+        model = AutoModelForCausalLM.from_pretrained(
+            path,
+            config=config,
+            device_map="auto",
+            torch_dtype=torch.float16,
+            trust_remote_code=True
+        )
+
+        max_memory = get_balanced_memory(
+            model,
+            max_memory=None,
+            no_split_module_classes=["MPTBlock"],
+            dtype='float16',
+            low_zero=False
+        )
+        device_map = infer_auto_device_map(
+            model,
+            max_memory=max_memory,
+            no_split_module_classes=["MPTBlock"],
+            dtype='float16'
         )
+        self.model = dispatch_model(model, device_map=device_map)
+
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
@@ -18,16 +49,17 @@ class EndpointHandler:
         inputs = data.pop("inputs", data)
         parameters = data.pop("parameters", None)
 
-        # preprocess
-        inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
-
-        # pass inputs with all kwargs in data
-        if parameters is not None:
-            outputs = self.model.generate(**inputs, **parameters)
-        else:
-            outputs = self.model.generate(**inputs)
-
-        # postprocess the prediction
-        prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-        return [{"generated_text": prediction}]
+        with torch.autocast('cuda'):
+            # preprocess
+            inputs = self.tokenizer(inputs, return_tensors="pt").to(self.device)
+
+            # pass inputs with all kwargs in data
+            if parameters is not None:
+                outputs = self.model.generate(**inputs, **parameters)
+            else:
+                outputs = self.model.generate(**inputs)
+
+            # postprocess the prediction
+            prediction = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+            return [{"generated_text": prediction}]
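
For reference, a minimal local smoke test of the updated handler might look like the sketch below. The checkpoint path, prompt, and generation parameters are placeholders rather than part of this commit, and a GPU with enough memory for the MPT weights is assumed.

# Hypothetical smoke test for EndpointHandler; the path and prompt are placeholders.
from handler import EndpointHandler

handler = EndpointHandler(path="path/to/mpt-checkpoint")

payload = {
    "inputs": "Explain what a device map does in one sentence.",
    "parameters": {"max_new_tokens": 64, "do_sample": False},
}

# __call__ returns a list with a single {"generated_text": ...} entry.
result = handler(payload)
print(result[0]["generated_text"])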