Spaces:
Sleeping
Sleeping
import os | |
from typing import Dict, List, Optional, Tuple, Union | |
import torch | |
from langchain.llms.base import LLM | |
from langchain.llms.utils import enforce_stop_tokens | |
from transformers import AutoModel, AutoTokenizer | |
# Turn off the `tokenizers` library's internal parallelism for this process.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Default device type and device index used by the helpers in this module.
DEVICE = "cuda"
DEVICE_ID = "0"

# Fully qualified device string ("<type>:<index>"); falls back to the bare
# device type when no index is configured.
if DEVICE_ID:
    CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}"
else:
    CUDA_DEVICE = DEVICE
def torch_gc():
    """Free cached CUDA memory on ``CUDA_DEVICE``.

    Does nothing when ``torch.cuda.is_available()`` reports no usable CUDA.
    """
    if not torch.cuda.is_available():
        return
    with torch.cuda.device(CUDA_DEVICE):
        # Release unused blocks held by PyTorch's caching allocator.
        torch.cuda.empty_cache()
        # Collect GPU memory that was released through CUDA IPC.
        torch.cuda.ipc_collect()
def auto_configure_device_map(num_gpus: int) -> Dict[str, int]:
    """Build a module-name -> GPU-index map spreading the model over GPUs.

    The model is treated as 30 placement units shared between ``num_gpus``
    cards:

    * ``transformer.word_embeddings`` counts as 1 unit,
    * ``transformer.final_layernorm`` together with ``lm_head`` counts as 1,
    * ``transformer.layers`` contributes 28 units (one per layer).

    Args:
        num_gpus: Number of GPUs to distribute the modules over.

    Returns:
        A dict mapping each module name to the index of the GPU it should be
        placed on.
    """
    layer_count = 28
    units_per_gpu = 30 / num_gpus

    # Bugfix carried over from the original comments: on Linux the ``weight``
    # and ``input`` passed to ``torch.embedding`` could end up on different
    # devices, which raises a RuntimeError.
    #   * On Windows, ``model.device`` is set to
    #     ``transformer.word_embeddings.device``.
    #   * On Linux, ``model.device`` is set to ``lm_head.device``.
    # ``chat`` / ``stream_chat`` move ``input_ids`` to ``model.device``, so if
    # ``transformer.word_embeddings.device`` differs from ``model.device`` the
    # call fails. Therefore the word embeddings, the final layer norm and
    # ``lm_head`` are all pinned to the first card.
    placement = {
        'transformer.word_embeddings': 0,
        'transformer.final_layernorm': 0,
        'lm_head': 0,
    }

    # The first card starts with the 2 units pinned above already counted.
    current_gpu, filled = 0, 2
    for layer_idx in range(layer_count):
        if filled >= units_per_gpu:
            # Current card has reached its share; continue on the next one.
            current_gpu, filled = current_gpu + 1, 0
        assert current_gpu < num_gpus
        placement[f'transformer.layers.{layer_idx}'] = current_gpu
        filled += 1
    return placement
class ChatLLM(LLM):
    """LangChain ``LLM`` backed by a local chat model or the MiniMax HTTP API.

    The backend is selected by ``model``:

    * the literal string ``'Minimax'`` routes calls to the MiniMax
      chat-completion endpoint (credentials are read from the ``group_id``
      and ``api_key`` environment variables);
    * any other value is treated as a loaded model object exposing
      ``chat(tokenizer, prompt, history=..., max_length=..., temperature=...)``
      (see :meth:`load_model`).
    """

    # Forwarded to ``model.chat`` as ``max_length`` (local backend only).
    max_token: int = 10000
    # Forwarded to ``model.chat`` as ``temperature`` (local backend only).
    temperature: float = 0.1
    # NOTE(review): declared but not forwarded to either backend in this class.
    top_p: float = 0.9
    # Conversation turns recorded by ``_call``. The MiniMax path stores
    # ``(prompt, response)``; the local path stores ``[None, response]``.
    history: list = []
    tokenizer: object = None
    # Either a loaded model object or the string 'Minimax'.
    model: object = None
    # Seconds to wait for the MiniMax API before giving up; ``None`` waits
    # indefinitely (the behaviour before this field existed).
    request_timeout: Optional[float] = 60.0

    def __init__(self):
        super().__init__()

    @property
    def _llm_type(self) -> str:
        """Identifier of this LLM implementation.

        Declared as a property because the LangChain ``LLM`` base class
        defines ``_llm_type`` as one and reads it as an attribute.
        """
        return "ChatLLM"

    def _minimax_chat(self, prompt: str) -> str:
        """Send the stored history plus ``prompt`` to MiniMax; return the reply.

        Raises:
            ValueError: if the ``group_id`` or ``api_key`` environment
                variable is not set.
            requests.HTTPError: if the API answers with an HTTP error status.
            RuntimeError: if the response payload has no ``reply`` field.
        """
        import requests

        group_id = os.getenv('group_id')
        api_key = os.getenv('api_key')
        if not group_id or not api_key:
            # Fail early instead of sending "None" as credentials.
            raise ValueError(
                "Environment variables 'group_id' and 'api_key' must be set "
                "to use the Minimax backend."
            )

        url = f'https://api.minimax.chat/v1/text/chatcompletion?GroupId={group_id}'
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json"
        }

        # Replay earlier turns as alternating USER/BOT messages, then add the
        # new user prompt.
        messages = []
        for turn in self.history:
            messages.append({"sender_type": "USER", "text": turn[0]})
            messages.append({"sender_type": "BOT", "text": turn[1]})
        messages.append({"sender_type": "USER", "text": prompt})

        request_body = {
            "model": "abab5-chat",
            "tokens_to_generate": 512,
            'messages': messages
        }
        resp = requests.post(url, headers=headers, json=request_body,
                             timeout=self.request_timeout)
        resp.raise_for_status()
        payload = resp.json()
        if 'reply' not in payload:
            raise RuntimeError(
                f"Minimax API response contains no 'reply' field: {payload!r}"
            )
        return payload['reply']

    def _call(self,
              prompt: str,
              stop: Optional[List[str]] = None) -> str:
        """Generate a reply to ``prompt`` and record the turn in ``history``.

        Args:
            prompt: The user input for this turn.
            stop: Optional stop sequences; when given, the reply is truncated
                with ``enforce_stop_tokens`` before it is stored and returned.

        Returns:
            The (possibly truncated) reply text.
        """
        use_minimax = self.model == 'Minimax'
        if use_minimax:
            response = self._minimax_chat(prompt)
        else:
            response, _ = self.model.chat(
                self.tokenizer,
                prompt,
                history=self.history,
                max_length=self.max_token,
                temperature=self.temperature,
            )
            torch_gc()

        if stop is not None:
            response = enforce_stop_tokens(response, stop)

        # Record exactly one history entry per call. Recording both a
        # (prompt, response) pair and a [None, response] pair would make the
        # next MiniMax request carry a null user message.
        if use_minimax:
            self.history.append((prompt, response))
        else:
            self.history = self.history + [[None, response]]
        return response

    def load_model(self,
                   model_name_or_path: str = "THUDM/chatglm-6b-int4",
                   llm_device=DEVICE,
                   device_map: Optional[Dict[str, int]] = None,
                   **kwargs):
        """Load the tokenizer and model into ``self.tokenizer`` / ``self.model``.

        Args:
            model_name_or_path: Model identifier or local path passed to
                ``from_pretrained``.
            llm_device: Target device string. A value starting with "cuda"
                selects the GPU path when CUDA is available; otherwise the
                model is loaded in float32 and moved to this device.
            device_map: Optional module-name -> GPU-index map for multi-GPU
                dispatch. When omitted and more than one GPU is visible, it is
                built with ``auto_configure_device_map``.
            **kwargs: Extra keyword arguments forwarded to
                ``AutoModel.from_pretrained`` on every path.
        """
        # NOTE(security): trust_remote_code=True runs Python code shipped with
        # the model repository; only load models from sources you trust.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            trust_remote_code=True
        )
        if torch.cuda.is_available() and llm_device.lower().startswith("cuda"):
            # Decide between single-card and multi-card deployment based on
            # the number of visible GPUs.
            num_gpus = torch.cuda.device_count()
            if num_gpus < 2 and device_map is None:
                self.model = (
                    AutoModel.from_pretrained(
                        model_name_or_path,
                        trust_remote_code=True,
                        **kwargs)
                    .half()
                    .cuda()
                )
            else:
                from accelerate import dispatch_model

                model = AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs).half()
                # A caller-supplied device_map customises per-card placement.
                if device_map is None:
                    device_map = auto_configure_device_map(num_gpus)
                self.model = dispatch_model(model, device_map=device_map)
        else:
            # Forward **kwargs here too, so CPU loading honours the same
            # options as the GPU paths.
            self.model = (
                AutoModel.from_pretrained(
                    model_name_or_path,
                    trust_remote_code=True,
                    **kwargs)
                .float()
                .to(llm_device)
            )
        self.model = self.model.eval()