Text Generation
4-bit precision

Error running the code:

by Andyrasika - opened
quantized_model_dir = "/workspace/models/TheBloke_stable-vicuna-13B-GPTQ"

model_basename = "stable-vicuna-13B-GPTQ-4bit.compat.no-act-order"

use_strict = False

use_triton = False

tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)

quantize_config = BaseQuantizeConfig(

model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ

peft_config = PromptTuningConfig(
    prompt_tuning_init_text="Human Assistant chat",

model = get_peft_model(model, peft_config)

This gave the following error:

OutOfMemoryError                          Traceback (most recent call last)
Cell In[18], line 25
     17 tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)
     19 quantize_config = BaseQuantizeConfig(
     20         bits=4,
     21         group_size=128,
     22         desc_act=False
     23     )
---> 25 model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
     26         use_safetensors=True,
     27         strict=use_strict,
     28         device="cuda:0",
     29         model_basename=model_basename,
     30         use_triton=use_triton,
     31         quantize_config=quantize_config)
     33 # Prevent printing spurious transformers error when using pipeline with AutoGPTQ
     34 logging.set_verbosity(logging.CRITICAL)

File /kaggle/working/AutoGPTQ/auto_gptq/modeling/auto.py:108, in AutoGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
    102 # TODO: do we need this filtering of kwargs? @PanQiWei is there a reason we can't just pass all kwargs?
    103 keywords = {
    104     key: kwargs[key]
    105     for key in list(signature(quant_func).parameters.keys()) + huggingface_kwargs
    106     if key in kwargs
    107 }
--> 108 return quant_func(
    109     model_name_or_path=model_name_or_path,
    110     device_map=device_map,
    111     max_memory=max_memory,
    112     device=device,
    113     low_cpu_mem_usage=low_cpu_mem_usage,
    114     use_triton=use_triton,
    115     inject_fused_attention=inject_fused_attention,
    116     inject_fused_mlp=inject_fused_mlp,
    117     use_cuda_fp16=use_cuda_fp16,
    118     quantize_config=quantize_config,
    119     model_basename=model_basename,
    120     use_safetensors=use_safetensors,
    121     trust_remote_code=trust_remote_code,
    122     warmup_triton=warmup_triton,
    123     trainable=trainable,
    124     disable_exllama=disable_exllama,
    125     **keywords
    126 )

File /kaggle/working/AutoGPTQ/auto_gptq/modeling/_base.py:875, in BaseGPTQForCausalLM.from_quantized(cls, model_name_or_path, device_map, max_memory, device, low_cpu_mem_usage, use_triton, torch_dtype, inject_fused_attention, inject_fused_mlp, use_cuda_fp16, quantize_config, model_basename, use_safetensors, trust_remote_code, warmup_triton, trainable, disable_exllama, **kwargs)
    872 if low_cpu_mem_usage:
    873     make_sure_no_tensor_in_meta_device(model, use_triton, quantize_config.desc_act, quantize_config.group_size, bits=quantize_config.bits)
--> 875 accelerate.utils.modeling.load_checkpoint_in_model(
    876     model,
    877     checkpoint=model_save_name,
    878     device_map=device_map,
    879     offload_state_dict=True,
    880     offload_buffers=True
    881 )
    882 model = simple_dispatch_model(model, device_map)
    884 # == step4: set seqlen == #

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/modeling.py:1279, in load_checkpoint_in_model(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers, keep_in_fp32_modules, offload_8bit_bnb)
   1277 buffer_names = [name for name, _ in model.named_buffers()]
   1278 for checkpoint_file in checkpoint_files:
-> 1279     checkpoint = load_state_dict(checkpoint_file, device_map=device_map)
   1280     if device_map is None:
   1281         model.load_state_dict(checkpoint, strict=False)

File /opt/conda/lib/python3.10/site-packages/accelerate/utils/modeling.py:1111, in load_state_dict(checkpoint_file, device_map)
   1108 else:
   1109     # if we only have one device we can load everything directly
   1110     if len(set(device_map.values())) == 1:
-> 1111         return safe_load_file(checkpoint_file, device=list(device_map.values())[0])
   1113     devices = list(set(device_map.values()) - {"disk"})
   1114     # cpu device should always exist as fallback option

File /opt/conda/lib/python3.10/site-packages/safetensors/torch.py:261, in load_file(filename, device)
    259 with safe_open(filename, framework="pt", device=device) as f:
    260     for k in f.keys():
--> 261         result[k] = f.get_tensor(k)
    262 return result

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB (GPU 0; 15.90 GiB total capacity; 14.90 GiB already allocated; 13.75 MiB free; 15.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
I used to get this error when trying to load a big model on limited resources (e.g. a T4 on Colab). In that case, try loading a smaller model (TheBloke/vicuna-7B-1.1-GPTQ), which usually works on a free Colab instance.

