How much VRAM does this model need?

#21
by Ziizu

I have an Nvidia A10 (24GB of VRAM) but I'm getting out of memory errors.

model_name = "teknium/OpenHermes-2.5-Mistral-7B"


def load_model(model_name: str):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    with torch.device("cuda:0"):
        model = transformers.AutoModelForCausalLM.from_pretrained(model_name).eval()
    
    return tokenizer, model

tokenizer, model = load_model(model_name)

OutOfMemoryError: CUDA out of memory. Tried to allocate 64.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 13.00 MiB is free. Including non-PyTorch memory, this process has 21.96 GiB memory in use. Of the allocated memory 21.58 GiB is allocated by PyTorch, and 99.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF...

I assumed 24GB would be enough for a 7B model. How much VRAM do I need to run this model?

Owner

You're likely loading it in fp32. In fp32 it needs 28GB; in fp16/bf16 it needs 14GB; in 8-bit, 7GB; and in 4-bit, ~4GB. Add ~1GB to all of those for the CUDA kernels.
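
For the math: 7B parameters × 4 bytes each ≈ 28GB in fp32, × 2 bytes ≈ 14GB in bf16, and so on, before activations and the KV cache. As a minimal sketch (assuming accelerate and bitsandbytes are installed; these exact flags aren't taken from this thread), loading in bf16 or 4-bit could look like:

import torch
import transformers

model_name = "teknium/OpenHermes-2.5-Mistral-7B"

# bf16: ~14GB of weights, should fit on a 24GB card
model_bf16 = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # requires accelerate; places the weights on the GPU
).eval()

# 4-bit via bitsandbytes: ~4GB of weights
quant_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_4bit = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
).eval()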

@teknium Thanks for the response. This may be a naive question, but how do I load it in 16-bit or 8-bit?

I've tried loading in bf16:

model_name = "teknium/OpenHermes-2.5-Mistral-7B"

def load_model(model_name: str):
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    with torch.device("cuda:0"):
        model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
    
    return tokenizer, model

tokenizer, model = load_model(model_name)

which gave:

Loading checkpoint shards:   0%|          | 0/2 [00:49<?, ?it/s]
---------------------------------------------------------------------------
OutOfMemoryError                          Traceback (most recent call last)
Cell In[3], line 19
     15         model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
     17     return model
---> 19 model = load_model(model_name)

Cell In[3], line 15
     11 def load_model(model_name: str):
     12     #tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
     14     with torch.device("cuda:0"):
---> 15         model = transformers.AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).eval()
     17     return model

File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/models/auto/auto_factory.py:566, in _BaseAutoModelClass.from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs)
    564 elif type(config) in cls._model_mapping.keys():
    565     model_class = _get_model_class(config, cls._model_mapping)
--> 566     return model_class.from_pretrained(
    567         pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs
    568     )
    569 raise ValueError(
    570     f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n"
    571     f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}."
    572 )

File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/modeling_utils.py:3706, in PreTrainedModel.from_pretrained(cls, pretrained_model_name_or_path, config, cache_dir, ignore_mismatched_sizes, force_download, local_files_only, token, revision, use_safetensors, *model_args, **kwargs)
   3697     if dtype_orig is not None:
   3698         torch.set_default_dtype(dtype_orig)
   3699     (
   3700         model,
   3701         missing_keys,
   3702         unexpected_keys,
   3703         mismatched_keys,
   3704         offload_index,
   3705         error_msgs,
-> 3706     ) = cls._load_pretrained_model(
   3707         model,
   3708         state_dict,
   3709         loaded_state_dict_keys,  # XXX: rename?
   3710         resolved_archive_file,
   3711         pretrained_model_name_or_path,
   3712         ignore_mismatched_sizes=ignore_mismatched_sizes,
   3713         sharded_metadata=sharded_metadata,
   3714         _fast_init=_fast_init,
   3715         low_cpu_mem_usage=low_cpu_mem_usage,
   3716         device_map=device_map,
   3717         offload_folder=offload_folder,
   3718         offload_state_dict=offload_state_dict,
   3719         dtype=torch_dtype,
   3720         is_quantized=(getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES),
   3721         keep_in_fp32_modules=keep_in_fp32_modules,
   3722     )
   3724 model.is_loaded_in_4bit = load_in_4bit
   3725 model.is_loaded_in_8bit = load_in_8bit

File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/modeling_utils.py:4091, in PreTrainedModel._load_pretrained_model(cls, model, state_dict, loaded_keys, resolved_archive_file, pretrained_model_name_or_path, ignore_mismatched_sizes, sharded_metadata, _fast_init, low_cpu_mem_usage, device_map, offload_folder, offload_state_dict, dtype, is_quantized, keep_in_fp32_modules)
   4089 if shard_file in disk_only_shard_files:
   4090     continue
-> 4091 state_dict = load_state_dict(shard_file)
   4093 # Mistmatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not
   4094 # matching the weights in the model.
   4095 mismatched_keys += _find_mismatched_keys(
   4096     state_dict,
   4097     model_state_dict,
   (...)
   4101     ignore_mismatched_sizes,
   4102 )

File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/transformers/modeling_utils.py:510, in load_state_dict(checkpoint_file)
    505     if metadata.get("format") not in ["pt", "tf", "flax"]:
    506         raise OSError(
    507             f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure "
    508             "you save your model with the `save_pretrained` method."
    509         )
--> 510     return safe_load_file(checkpoint_file)
    511 try:
    512     if (
    513         is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0
    514     ) or (is_fsdp_enabled() and not is_local_dist_rank_0()):

File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/safetensors/torch.py:310, in load_file(filename, device)
    308 with safe_open(filename, framework="pt", device=device) as f:
    309     for k in f.keys():
--> 310         result[k] = f.get_tensor(k)
    311 return result

File ~/text-generation-webui/installer_files/env/lib/python3.11/site-packages/torch/utils/_device.py:77, in DeviceContext.__torch_function__(self, func, types, args, kwargs)
     75 if func in _device_constructors() and kwargs.get('device') is None:
     76     kwargs['device'] = self.device
---> 77 return func(*args, **kwargs)

OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacty of 21.99 GiB of which 59.00 MiB is free. Including non-PyTorch memory, this process has 21.92 GiB memory in use. Of the allocated memory 21.44 GiB is allocated by PyTorch, and 203.16 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

For reference/context, I can load the quantised GGUF version of this model without issue.
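
(By that I mean loading it through llama-cpp-python, roughly as sketched below; the filename is just a placeholder for whichever quantised GGUF file is used.)

from llama_cpp import Llama

# Placeholder filename; any quantised GGUF of the model loads the same way
llm = Llama(
    model_path="openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    n_gpu_layers=-1,  # offload all layers to the GPU
    n_ctx=4096,
)

out = llm("<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n", max_tokens=64)
print(out["choices"][0]["text"])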

Owner

Use this inference code from the repo

https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B/blob/main/transformers_inference.py

Let me know if it still has issues 🤗

I'm using the inference script above with the following prompt:

prompts = [
    """<|im_start|>system
You are a sentient, superintelligent artificial general intelligence, here to teach and assist me.<|im_end|>
<|im_start|>user
Explain how viruses cause diseases<|im_end|>
<|im_start|>assistant""",
]

for chat in prompts:
    print(chat)
    input_ids = tokenizer(chat, return_tensors="pt").input_ids.to("cuda")
    generated_ids = model.generate(input_ids, max_new_tokens=1000, repetition_penalty=1.1, do_sample=False, eos_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True, clean_up_tokenization_space=True)
    print(f"Response: {response}")

After about 13 minutes it generates this error: "IndexError: piece id is out of range."
I've tried 4-bit as well but am still not able to get a response. I have an RTX 3080 16GB.
Any suggestions on how to make it work? Thank you.
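
(One workaround I may try in the meantime, purely a guess on my part: the error is raised inside sentencepiece's IdToPiece, and the fast Rust-based tokenizer decodes without going through that call, so reloading the tokenizer like this might sidestep it.)

import transformers

model_name = "teknium/OpenHermes-2.5-Mistral-7B"

# Guess, not verified: the fast tokenizer avoids the sentencepiece IdToPiece path
fast_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_fast=True)
# ...then decode generated_ids with fast_tokenizer.decode(...) as before.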

Owner

Can you share the full log?

Thank you @teknium for responding so fast. Here's the traceback:
{
"name": "IndexError",
"message": "piece id is out of range.",
"stack": "---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Cell In[6], line 14
12 #generated_ids = model.generate(input_ids, max_new_tokens=1000, temperature=0.8, repetition_penalty=1.1, do_sample=True, eos_token_id=tokenizer.eos_token_id)
13 generated_ids = model.generate(input_ids, max_new_tokens=100,repetition_penalty=1.1, do_sample=False, eos_token_id=tokenizer.eos_token_id)
---> 14 response = tokenizer.decode(generated_ids[0][input_ids.shape[-1]:], skip_special_tokens=True, clean_up_tokenization_space=True)
15 print(f"Response: {response}")

File /usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils_base.py:3750, in PreTrainedTokenizerBase.decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, **kwargs)
3747 # Convert inputs to python lists
3748 token_ids = to_py_obj(token_ids)
-> 3750 return self._decode(
3751 token_ids=token_ids,
3752 skip_special_tokens=skip_special_tokens,
3753 clean_up_tokenization_spaces=clean_up_tokenization_spaces,
3754 **kwargs,
3755 )

File /usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils.py:1001, in PreTrainedTokenizer._decode(self, token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs)
991 def _decode(
992 self,
993 token_ids: List[int],
(...)
997 **kwargs,
998 ) -> str:
999 self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
-> 1001 filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
1002 legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | {
1003 token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size
1004 }
1005 # To avoid mixing byte-level and unicode for byte-level BPT
1006 # we need to build string separately for added tokens and byte-level tokens
1007 # cf. https://github.com/huggingface/transformers/issues/1133

File /usr/local/lib/python3.9/dist-packages/transformers/tokenization_utils.py:982, in PreTrainedTokenizer.convert_ids_to_tokens(self, ids, skip_special_tokens)
980 tokens.append(self._added_tokens_decoder[index].content)
981 else:
--> 982 tokens.append(self._convert_id_to_token(index))
983 return tokens

File /usr/local/lib/python3.9/dist-packages/transformers/models/llama/tokenization_llama.py:280, in LlamaTokenizer._convert_id_to_token(self, index)
278 def _convert_id_to_token(self, index):
279 """Converts an index (integer) in a token (str) using the vocab."""
--> 280 token = self.sp_model.IdToPiece(index)
281 return token

File /usr/local/lib/python3.9/dist-packages/sentencepiece/__init__.py:1045, in _batchnize.<locals>._batched_func(self, arg)
1043 return [_func(self, n) for n in arg]
1044 else:
-> 1045 return _func(self, arg)

File /usr/local/lib/python3.9/dist-packages/sentencepiece/__init__.py:1038, in _batchnize.<locals>._func(v, n)
1036 def _func(v, n):
1037 if type(n) is int and (n < 0 or n >= v.piece_size()):
-> 1038 raise IndexError('piece id is out of range.')
1039 return func(v, n)

IndexError: piece id is out of range."
}

Thank you.
