artistypl committed on
Commit
05ebb09
1 Parent(s): db7289c

Create chatllm.py

Files changed (1)
chatllm.py +158 -0
chatllm.py ADDED
@@ -0,0 +1,158 @@
import os
from typing import Dict, List, Optional

import torch
from langchain.llms.base import LLM
from langchain.llms.utils import enforce_stop_tokens
from transformers import AutoModel, AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE


def torch_gc():
    # Release cached GPU memory after a generation call.
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    # transformer.word_embeddings takes 1 slot,
    # transformer.final_layernorm and lm_head together take 1 slot,
    # and transformer.layers takes 28 slots:
    # 30 slots in total, spread across num_gpus cards.
    num_trans_layers = 28
    per_gpu_layers = 30 / num_gpus

    # bugfix: on Linux, torch.embedding can receive weight and input on
    # different devices, raising a RuntimeError.
    # On Windows, model.device is set to transformer.word_embeddings.device;
    # on Linux, model.device is set to lm_head.device.
    # chat and stream_chat place input_ids on model.device, so if
    # transformer.word_embeddings.device differs from model.device a
    # RuntimeError follows. Therefore transformer.word_embeddings,
    # transformer.final_layernorm, and lm_head are all pinned to the first card.
    device_map = {'transformer.word_embeddings': 0,
                  'transformer.final_layernorm': 0, 'lm_head': 0}

    used = 2
    gpu_target = 0
    for i in range(num_trans_layers):
        if used >= per_gpu_layers:
            gpu_target += 1
            used = 0
        assert gpu_target < num_gpus
        device_map[f'transformer.layers.{i}'] = gpu_target
        used += 1

    return device_map
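
# A worked example (illustrative, not part of the committed file): with
# num_gpus=2, per_gpu_layers is 15, so the loop pins transformer.layers
# 0-12 to card 0 (which already holds the two embedding/head slots) and
# transformer.layers 13-27 to card 1:
#
#   auto_configure_device_map(2)
#   # {'transformer.word_embeddings': 0, 'transformer.final_layernorm': 0,
#   #  'lm_head': 0, 'transformer.layers.0': 0, ..., 'transformer.layers.12': 0,
#   #  'transformer.layers.13': 1, ..., 'transformer.layers.27': 1}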


class ChatLLM(LLM):
    # Annotations added so pydantic treats top_p and history as per-instance
    # fields rather than shared class attributes.
    max_token: int = 10000
    temperature: float = 0.1
    top_p: float = 0.9
    history: List = []
    tokenizer: object = None
    model: object = None

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        return "ChatLLM"

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:

        if self.model == 'Minimax':
            import requests

            group_id = os.getenv('group_id')
            api_key = os.getenv('api_key')

            url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
            headers = {
                "Authorization": f"Bearer {api_key}",
                "Content-Type": "application/json"
            }
            request_body = {
                "model": "abab5-chat",
                "tokens_to_generate": 512,
                'messages': []
            }

            # Replay the stored history as alternating USER/BOT messages.
            for h_input, h_reply in self.history:
                request_body['messages'].append({
                    "sender_type": "USER",
                    "text": h_input
                })
                request_body['messages'].append({"sender_type": "BOT", "text": h_reply})

            request_body['messages'].append({"sender_type": "USER", "text": prompt})
            resp = requests.post(url, headers=headers, json=request_body)
            response = resp.json()['reply']
            # Append this turn's AI reply to messages.
            request_body['messages'].append({"sender_type": "BOT", "text": response})

        else:
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()

        if stop is not None:
            response = enforce_stop_tokens(response, stop)
        # Record the exchange once; the original appended the Minimax reply a
        # second time and stored None as the user turn, which would send a
        # null message on the next Minimax round trip.
        self.history = self.history + [[prompt, response]]
        return response
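
    # Note: the Minimax branch in _call reads `group_id` and `api_key` from
    # the environment; both must be set before that path can authenticate,
    # e.g. `export group_id=...` and `export api_key=...` in the shell.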

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Choose single- or multi-card deployment from the visible GPU count.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=True, **kwargs).half()
                # A custom device_map may be passed in to control per-card placement.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)

                self.model = dispatch_model(model, device_map=device_map)
        else:
            # No usable GPU: run in float32 on the requested device (e.g. CPU).
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True)
                .float()
                .to(llm_device)
            )
        self.model = self.model.eval()
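
A minimal usage sketch (not part of the commit; the prompt is illustrative, and the call goes through LangChain's LLM.__call__, which dispatches to _call):

    from chatllm import ChatLLM

    llm = ChatLLM()
    llm.load_model(model_name_or_path="THUDM/chatglm-6b-int4")  # downloads weights on first use
    print(llm("Hello, what can you do?"))

    # Setting llm.model = 'Minimax' instead routes _call to the Minimax HTTP
    # API (no local weights needed; requires the group_id/api_key env vars).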