Marlon Wiprud committed
Commit 3740af9
Parent: 693f1b3

sketch: multigpu

Files changed (1):
  1. handler.py +42 -31
handler.py CHANGED
@@ -4,11 +4,12 @@ from PIL import Image
 import requests
 from transformers import AutoModelForCausalLM, LlamaTokenizer
 import torch
-from accelerate import (
-    init_empty_weights,
-    infer_auto_device_map,
-    load_checkpoint_and_dispatch,
-)
+
+# from accelerate import (
+#     init_empty_weights,
+#     infer_auto_device_map,
+#     load_checkpoint_and_dispatch,
+# )
 
 
 class EndpointHandler:
@@ -25,35 +26,45 @@ class EndpointHandler:
 
         self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
 
+        self.model = (
+            AutoModelForCausalLM.from_pretrained(
+                "THUDM/cogvlm-chat-hf",
+                torch_dtype=torch.bfloat16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            )
+            .to("cuda")
+            .eval()
+        )
+
+        # DISTRIBUTED GPUS
         # with init_empty_weights():
-        # self.model = (
-        #     AutoModelForCausalLM.from_pretrained(
-        #         "THUDM/cogvlm-chat-hf",
-        #         torch_dtype=torch.bfloat16,
-        #         low_cpu_mem_usage=True,
-        #         trust_remote_code=True,
-        #     )
-        #     .to("cuda")
-        #     .eval()
+        #     self.model = AutoModelForCausalLM.from_pretrained(
+        #         "THUDM/cogvlm-chat-hf",
+        #         torch_dtype=torch.bfloat16,
+        #         low_cpu_mem_usage=True,
+        #         trust_remote_code=True,
         # )
 
-        device_map = infer_auto_device_map(
-            model,
-            max_memory={
-                0: "16GiB",
-                1: "16GiB",
-                2: "16GiB",
-                3: "16GiB",
-                "cpu": "180GiB",
-            },
-            no_split_module_classes="CogVLMDecoderLayer",
-        )
-        self.model = load_checkpoint_and_dispatch(
-            model,
-            "~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots",  # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
-            device_map=device_map,
-        )
-        model = model.eval()
+        # device_map = infer_auto_device_map(
+        #     self.model,
+        #     max_memory={
+        #         0: "16GiB",
+        #         1: "16GiB",
+        #         2: "16GiB",
+        #         3: "16GiB",
+        #         "cpu": "180GiB",
+        #     },
+        #     no_split_module_classes="CogVLMDecoderLayer",
+        # )
+        # self.model = load_checkpoint_and_dispatch(
+        #     self.model,
+        #     "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730",  # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
+        #     device_map=device_map,
+        #     no_split_module_classes=["CogVLMDecoderLayer"],
+        # )
+        # self.model = self.model.eval()
+        ## DISTRIBUTED GPUS
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
         """