How to run inference using multiple GPUs #9
by NancyQingQing - opened
I followed the guide in the section "dispatch the model into multiple GPUs with smaller VRAM" ("This is an example for when you have two 24GB GPUs and 16GB of CPU memory; you can change the arguments of infer_auto_device_map to your own setting.").
But I encountered an issue:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!
Here is the whole error:
Traceback (most recent call last):
File "localpath/CogVLM/main.py", line 45, in <module>
outputs = model.generate(**inputs, **gen_kwargs)
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/transformers/generation/utils.py", line 1673, in generate
return self.greedy_search(
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/transformers/generation/utils.py", line 2521, in greedy_search
outputs = self(
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
output = old_forward(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/modeling_cogvlm.py", line 610, in forward
outputs = self.model(
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/modeling_cogvlm.py", line 378, in forward
images_features = self.encode_images(images)
File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/modeling_cogvlm.py", line 350, in encode_images
images_features = self.vision(images)
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/visual.py", line 129, in forward
x = self.transformer(x)
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/visual.py", line 94, in forward
hidden_states = layer_module(hidden_states)
File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/visual.py", line 83, in forward
output = mlp_input + mlp_output
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!
Here is my code (main.py):
import torch
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch, load_checkpoint_in_model

tokenizer = LlamaTokenizer.from_pretrained('localpath/vicuna-7b-v1.5')

with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        'localpath/cogvlm-chat-hf',
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
# model.tie_weights()

device_map = infer_auto_device_map(model, max_memory={0: '25GiB', 1: '25GiB', 'cpu': '100GiB'}, no_split_module_classes='CogVLMDecoderLayer')
model = load_checkpoint_and_dispatch(
    model,
    'localpath/cogvlm-chat-hf',  # typically '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
    device_map=device_map,
    offload_state_dict=True,
)
model = model.eval()

# check the device for each weight if you want to
# for n, p in model.named_parameters():
#     print(f"{n}: {p.device}")

# chat example
query = 'Describe this image'
image = Image.open("localpath/CogVLM/examples/1.png").convert('RGB')
inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image])  # chat mode
inputs = {
    'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
    'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
    'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
    'images': [[inputs['images'][0].to('cuda').to(torch.float16)]],
}
gen_kwargs = {"max_length": 2048, "do_sample": False}

with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0]))
Try modifying no_split_module_classes=['CogVLMDecoderLayer']
to no_split_module_classes=['CogVLMDecoderLayer', 'TransformerLayer'].
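For reference, here is a minimal sketch of the corrected dispatch step, reusing the same local paths and memory budget as in main.py above; the only change is passing both class names as a list to no_split_module_classes, so the vision TransformerLayer blocks (whose residual addition at visual.py line 83 raised the error) are never split across cuda:0 and cuda:1:

import torch
from transformers import AutoModelForCausalLM
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch

with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        'localpath/cogvlm-chat-hf',
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

# Keep both the language-model decoder layers and the vision transformer
# blocks intact on a single GPU when inferring the device map.
device_map = infer_auto_device_map(
    model,
    max_memory={0: '25GiB', 1: '25GiB', 'cpu': '100GiB'},
    no_split_module_classes=['CogVLMDecoderLayer', 'TransformerLayer'],
)

model = load_checkpoint_and_dispatch(
    model,
    'localpath/cogvlm-chat-hf',
    device_map=device_map,
    offload_state_dict=True,
).eval()

The rest of main.py (building the conversation inputs and calling generate) stays unchanged.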
chenkq changed discussion status to closed