Marlon Wiprud committed on
Commit
a33ae41
1 Parent(s): afb44cc
Files changed (2)
  1. handler.py +133 -0
  2. requirements.txt +11 -0
handler.py ADDED
@@ -0,0 +1,133 @@
+import os
+from typing import Any, Dict, List
+
+import requests
+import torch
+from PIL import Image
+from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline
+from accelerate import (
+    init_empty_weights,
+    infer_auto_device_map,
+    load_checkpoint_and_dispatch,
+)
+
+
+class EndpointHandler:
+    def __init__(self, path=""):
+        # Preload all the elements you are going to need at inference.
+
+        # self.pipeline = pipeline(
+        #     "text-generation", model="THUDM/cogvlm-chat-hf", trust_remote_code=True
+        # )
+
+        # self.model = AutoModelForCausalLM.from_pretrained(
+        #     "THUDM/cogvlm-chat-hf", trust_remote_code=True
+        # )
+
+        self.tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
+
+        # self.model = (
+        #     AutoModelForCausalLM.from_pretrained(
+        #         "THUDM/cogvlm-chat-hf",
+        #         torch_dtype=torch.bfloat16,
+        #         low_cpu_mem_usage=True,
+        #         trust_remote_code=True,
+        #     )
+        #     .to("cuda")
+        #     .eval()
+        # )
+
+        # DISTRIBUTED GPUS
+        # Build the model skeleton on the meta device so no real weights are allocated yet.
+        with init_empty_weights():
+            self.model = AutoModelForCausalLM.from_pretrained(
+                "THUDM/cogvlm-chat-hf",
+                torch_dtype=torch.bfloat16,
+                low_cpu_mem_usage=True,
+                trust_remote_code=True,
+            )
+
+        # Plan a split across four 16 GiB GPUs (overflow to CPU), never splitting
+        # a CogVLMDecoderLayer across devices.
+        device_map = infer_auto_device_map(
+            self.model,
+            max_memory={
+                0: "16GiB",
+                1: "16GiB",
+                2: "16GiB",
+                3: "16GiB",
+                "cpu": "180GiB",
+            },
+            no_split_module_classes=["CogVLMDecoderLayer"],
+        )
+        # "~" must be expanded explicitly; load_checkpoint_and_dispatch does not do it.
+        snapshot_dir = os.path.expanduser(
+            "~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730"
+        )  # typical, '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
+        # "~/.cache/huggingface/modules/transformers_modules/THUDM/cogvlm-chat-hf/8abca878c4257412c4c38eeafaed3fe27a036730"
+        # "/home/ec2-user/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/8abca878c4257412c4c38eeafaed3fe27a036730"
+        self.model = load_checkpoint_and_dispatch(
+            self.model,
+            snapshot_dir,
+            device_map=device_map,
+            no_split_module_classes=["CogVLMDecoderLayer"],
+        )
+        self.model = self.model.eval()
+        ## DISTRIBUTED GPUS
+
+    def __call__(self, data: Dict[str, Any]) -> str:
+        """
+        data args:
+            query (:obj:`str`): question to ask about the image
+            img_uri (:obj:`str`): URL of the image to run VQA on
+        Return:
+            A :obj:`str`: the generated answer; it will be serialized and returned
+        """
+
+        query = data["query"]
+        img_uri = data["img_uri"]
+
+        image = Image.open(
+            requests.get(
+                img_uri,
+                stream=True,
+            ).raw
+        ).convert("RGB")
+
+        inputs = self.model.build_conversation_input_ids(
+            self.tokenizer,
+            query=query,
+            history=[],
+            images=[image],
+            template_version="vqa",
+        )  # vqa mode
+
+        inputs = {
+            "input_ids": inputs["input_ids"].unsqueeze(0).to("cuda"),
+            "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to("cuda"),
+            "attention_mask": inputs["attention_mask"].unsqueeze(0).to("cuda"),
+            "images": [[inputs["images"][0].to("cuda").to(torch.bfloat16)]],
+        }
+
+        gen_kwargs = {"max_length": 2048, "do_sample": False}
+
+        with torch.no_grad():
+            outputs = self.model.generate(**inputs, **gen_kwargs)
+            outputs = outputs[:, inputs["input_ids"].shape[1] :]
+            response = self.tokenizer.decode(outputs[0])
+            return response
+
+
+# query = "How many houses are there in this cartoon?"
+# image = Image.open(
+#     requests.get(
+#         "https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true", stream=True
+#     ).raw
+# ).convert("RGB")
+# inputs = model.build_conversation_input_ids(
+#     tokenizer, query=query, history=[], images=[image], template_version="vqa"
+# )  # vqa mode
+# inputs = {
+#     "input_ids": inputs["input_ids"].unsqueeze(0).to("cuda"),
+#     "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to("cuda"),
+#     "attention_mask": inputs["attention_mask"].unsqueeze(0).to("cuda"),
+#     "images": [[inputs["images"][0].to("cuda").to(torch.bfloat16)]],
+# }
+# gen_kwargs = {"max_length": 2048, "do_sample": False}
+
+# with torch.no_grad():
+#     outputs = model.generate(**inputs, **gen_kwargs)
+#     outputs = outputs[:, inputs["input_ids"].shape[1] :]
+#     print(tokenizer.decode(outputs[0]))
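For quick local smoke-testing of the handler above, a minimal sketch (not part of the commit; it assumes the CogVLM snapshot is already in the local cache, and the question and image URL are only placeholders):

# Local test sketch: a separate script that imports the handler.
from handler import EndpointHandler

handler = EndpointHandler()
# The payload mirrors the two keys read in __call__: "query" and "img_uri".
answer = handler(
    {
        "query": "How many houses are there in this cartoon?",
        "img_uri": "https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true",
    }
)
print(answer)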
requirements.txt ADDED
@@ -0,0 +1,11 @@
+einops
+Pillow==10.1.0
+# torch==2.1.0
+torch==1.13.1
+# transformers==4.35.0
+accelerate==0.24.1
+sentencepiece==0.1.99
+einops==0.7.0
+# xformers==0.0.22.post7
+xformers
+triton==2.1.0
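Once this is deployed as a Hugging Face Inference Endpoint, a client sends the two JSON keys the handler reads. A rough sketch, assuming the default JSON deserialization hands the request body to EndpointHandler.__call__ unchanged; the endpoint URL and token below are placeholders:

import requests

ENDPOINT_URL = "https://<your-endpoint>.endpoints.huggingface.cloud"  # placeholder
HF_TOKEN = "hf_..."  # placeholder token with access to the endpoint

payload = {
    "query": "How many houses are there in this cartoon?",
    "img_uri": "https://github.com/THUDM/CogVLM/blob/main/examples/3.jpg?raw=true",
}
response = requests.post(
    ENDPOINT_URL,
    headers={"Authorization": f"Bearer {HF_TOKEN}", "Content-Type": "application/json"},
    json=payload,
)
print(response.json())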