How to load the model on a multi-GPU instance?

#19
by aastha6 - opened
import os, torch, gc
from mistral_inference.model import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest

# Release cached CUDA allocations and run the garbage collector up front.
torch.cuda.empty_cache()
gc.collect()


# Step 1: build the tokenizer from the file inside the model folder.
model_folder_path = "/home/ubuntu/mistral_models/codestral-22b-v0x1"
# model_folder_path = "/home/ubuntu/mistral_models/mistral-7b-instruct-v0x3"
tokenizer_file = f"{model_folder_path}/tokenizer.model.v3"
mistral_tokenizer = MistralTokenizer.from_file(tokenizer_file)

# Step 2: wrap the question in a single-message chat completion request.
user_question = "Explain Travelling Salesman Problem in a nutshell."
chat_messages = [UserMessage(content=user_question)]
completion_request = ChatCompletionRequest(messages=chat_messages)

# Step 3: encode the request and keep only its token ids.
encoded_request = mistral_tokenizer.encode_chat_completion(completion_request)
tokens = encoded_request.tokens

# Step 4: load the model from the same folder in bfloat16.
# Options currently left disabled:
#     num_pipeline_ranks=4,
#     device="cuda",
load_options = {"dtype": torch.bfloat16}
model = Transformer.from_folder(model_folder_path, **load_options)

# Step 5: generate at most 64 tokens for the one prompt at temperature 0.0,
# passing the underlying tokenizer's EOS id.
raw_tokenizer = mistral_tokenizer.instruct_tokenizer.tokenizer
out_tokens, _ = generate(
    [tokens],
    model,
    max_tokens=64,
    temperature=0.0,
    eos_id=raw_tokenizer.eos_id,
)

# Step 6: decode the first (only) generated sequence and print it.
result = raw_tokenizer.decode(out_tokens[0])
print(result)

I am using an instance with 4 A10G GPUs, 24 GB each.

Error: OutOfMemoryError: CUDA out of memory. Tried to allocate 192.00 MiB. GPU

I tried setting num_pipeline_ranks=4, but then ran into a different error: ValueError: Default process group has not been initialized, please make sure to call init_process_group.

Any suggestions would be really helpful!

I would recommend launching with vLLM: simply set --tensor-parallel-size 4 (short form: -tp 4) to use 4 GPUs, and set CUDA_VISIBLE_DEVICES=0,1,2,3. See https://docs.vllm.ai/en/stable/

Sign up or log in to comment