How to run inference using multiple GPUs

#9, opened by NancyQingQing

I followed the section of the guide on dispatching the model across multiple GPUs with smaller VRAM. That section gives an example for two 24GB GPUs and 16GB of CPU memory and notes that you can change the arguments of infer_auto_device_map to match your own setup.
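For reference, the guide's example setting as described above would look roughly like this (just a sketch; model is the CogVLM model loaded as in my code further down, and the max_memory values are per-device caps you adjust to your own hardware):

from accelerate import infer_auto_device_map

# Guide's example setup: two 24GB GPUs plus 16GB of CPU memory for offload.
device_map = infer_auto_device_map(
    model,
    max_memory={0: '24GiB', 1: '24GiB', 'cpu': '16GiB'},
    no_split_module_classes=['CogVLMDecoderLayer'],
)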

But I encountered an issue:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

Here is the whole error:

Traceback (most recent call last):
  File "localpath/CogVLM/main.py", line 45, in <module>
    outputs = model.generate(**inputs, **gen_kwargs)
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
    return func(*args, **kwargs)
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/transformers/generation/utils.py", line 1673, in generate
    return self.greedy_search(
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/transformers/generation/utils.py", line 2521, in greedy_search
    outputs = self(
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/accelerate/hooks.py", line 165, in new_forward
    output = old_forward(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/modeling_cogvlm.py", line 610, in forward
    outputs = self.model(
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/modeling_cogvlm.py", line 378, in forward
    images_features = self.encode_images(images)
  File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/modeling_cogvlm.py", line 350, in encode_images
    images_features = self.vision(images)
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/visual.py", line 129, in forward
    x = self.transformer(x)
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/visual.py", line 94, in forward
    hidden_states = layer_module(hidden_states)
  File "localpath/miniconda3/envs/llama/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/root/.cache/huggingface/modules/transformers_modules/cogvlm-chat-hf/visual.py", line 83, in forward
    output = mlp_input + mlp_output
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

Here is my code (main.py):

import torch
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch, load_checkpoint_in_model

tokenizer = LlamaTokenizer.from_pretrained('localpath/vicuna-7b-v1.5')
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        'localpath/cogvlm-chat-hf',
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
#model.tie_weights()
device_map = infer_auto_device_map(model, max_memory={0: '25GiB', 1: '25GiB', 'cpu': '100GiB'}, no_split_module_classes=['CogVLMDecoderLayer'])

model = load_checkpoint_and_dispatch(
    model,
    'localpath/cogvlm-chat-hf',   # typically '~/.cache/huggingface/hub/models--THUDM--cogvlm-chat-hf/snapshots/balabala'
    device_map=device_map,
    offload_state_dict=True
)
model = model.eval()

# check the device of each weight if you want to
#for n, p in model.named_parameters():
#    print(f"{n}: {p.device}")

# chat example
query = 'Describe this image'
image = Image.open("localpath/CogVLM/examples/1.png").convert('RGB')
inputs = model.build_conversation_input_ids(tokenizer, query=query, history=[], images=[image])  # chat mode
inputs = {
    'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'),
    'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'),
    'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'),
    'images': [[inputs['images'][0].to('cuda').to(torch.float16)]],
}
gen_kwargs = {"max_length": 2048, "do_sample": False}

with torch.no_grad():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs['input_ids'].shape[1]:]
    print(tokenizer.decode(outputs[0]))
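To narrow down where the split happens, a rough check like this (just matching on 'vision' in the module names of the device_map computed above) prints which device each dispatched vision submodule landed on:

# Rough diagnostic: show the device assigned to every vision-tower entry
# in the device_map. A vision layer spanning cuda:0 and cuda:1 would
# explain the cross-device residual add in visual.py.
for name, device in device_map.items():
    if 'vision' in name:
        print(name, device)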
chenkq (Knowledge Engineering Group (KEG) & Data Mining at Tsinghua University org) replied:

Try changing no_split_module_classes=['CogVLMDecoderLayer'] to no_split_module_classes=['CogVLMDecoderLayer', 'TransformerLayer'].
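This matches the traceback: the failure is the residual add inside visual.py's TransformerLayer, so the vision transformer layers also need to be kept on a single device, not just the language-model decoder layers. A minimal sketch of the adjusted call, reusing the settings from the original post:

# Keep both the LLM decoder layers and the vision transformer layers whole,
# so no layer's internal residual add is split across cuda:0 and cuda:1.
device_map = infer_auto_device_map(
    model,
    max_memory={0: '25GiB', 1: '25GiB', 'cpu': '100GiB'},
    no_split_module_classes=['CogVLMDecoderLayer', 'TransformerLayer'],
)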

chenkq changed discussion status to closed
