File not found in AutoGPTQ

#11
by Kurapika993 - opened

While running: model = AutoGPTQForCausalLM.from_quantized("TheBloke/OpenAssistant-SFT-7-Llama-30B-GPTQ", device="cuda:5", use_triton=False)

I am getting this strange error, even though the quantize_config.json file is present in the repo:

FileNotFoundError: [Errno 2] No such file or directory: 'TheBloke/OpenAssistant-SFT-7-Llama-30B-GPTQ/quantize_config.json'

After cloning the model and running the following, I am also getting an error:

model = AutoGPTQForCausalLM.from_quantized('/path/OpenAssistant-SFT-7-Llama-30B-GPTQ', model_basename= 'OpenAssistant-SFT-7-Llama-30B-GPTQ-4bit', use_safetensors=True, device="cuda:5")

SafetensorError Traceback (most recent call last)
Cell In[46], line 1
----> 1 model = AutoGPTQForCausalLM.from_quantized('/home/das/model/OpenAssistant-SFT-7-Llama-30B-GPTQ', model_basename= 'OpenAssistant-SFT-7-Llama-30B-GPTQ-4bit', use_safetensors=True, device="cuda:5")

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/auto_gptq/modeling/auto.py:63, in AutoGPTQForCausalLM.from_quantized(cls, save_dir, device, use_safetensors, use_triton, max_memory, device_map, quantize_config, model_basename, trust_remote_code)
49 @classmethod
50 def from_quantized(
51 cls,
(...)
60 trust_remote_code: bool = False
61 ) -> BaseGPTQForCausalLM:
62 model_type = check_and_get_model_type(save_dir)
---> 63 return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized(
64 save_dir=save_dir,
65 device=device,
66 use_safetensors=use_safetensors,
67 use_triton=use_triton,
68 max_memory=max_memory,
69 device_map=device_map,
70 quantize_config=quantize_config,
71 model_basename=model_basename,
72 trust_remote_code=trust_remote_code
73 )

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/auto_gptq/modeling/_base.py:544, in BaseGPTQForCausalLM.from_quantized(cls, save_dir, device, use_safetensors, use_triton, max_memory, device_map, quantize_config, model_basename, trust_remote_code)
541 if not max_memory and not device_map:
542 device_map = {"": device}
--> 544 model = accelerate.load_checkpoint_and_dispatch(
545 model, model_save_name, device_map, max_memory, no_split_module_classes=[cls.layer_type]
546 )
548 model_config = model.config.to_dict()
549 seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/accelerate/big_modeling.py:479, in load_checkpoint_and_dispatch(model, checkpoint, device_map, max_memory, no_split_module_classes, offload_folder, offload_buffers, dtype, offload_state_dict, preload_module_classes)
477 if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
478 offload_state_dict = True
--> 479 load_checkpoint_in_model(
480 model,
481 checkpoint,
482 device_map=device_map,
483 offload_folder=offload_folder,
484 dtype=dtype,
485 offload_state_dict=offload_state_dict,
486 offload_buffers=offload_buffers,
487 )
488 if device_map is None:
489 return model

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/accelerate/utils/modeling.py:971, in load_checkpoint_in_model(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers)
968 buffer_names = [name for name, _ in model.named_buffers()]
970 for checkpoint_file in checkpoint_files:
--> 971 checkpoint = load_state_dict(checkpoint_file, device_map=device_map)
972 if device_map is None:
973 model.load_state_dict(checkpoint, strict=False)

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/accelerate/utils/modeling.py:807, in load_state_dict(checkpoint_file, device_map)
803 if not is_safetensors_available():
804 raise ImportError(
805 f"To load {checkpoint_file}, the safetensors library is necessary pip install safetensors."
806 )
--> 807 with safe_open(checkpoint_file, framework="pt") as f:
808 metadata = f.metadata()
809 weight_names = f.keys()

SafetensorError: Error while deserializing header: HeaderTooLarge

AutoGPTQ doesn't currently support loading models directly from HF by repo id. You need to download the model first. The ability to load HF models directly is planned to be added in the near future.

Also, you will need to pass a model_basename parameter to tell it the name of the model file to use.
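
By the way, the HeaderTooLarge error you got from your local clone is usually a sign that the .safetensors file is a Git LFS pointer (a tiny text file) rather than the real weights, which happens when the repo is cloned without git-lfs installed. A minimal check, using the example path from your message (adjust it to your actual clone):

import os

path = "/path/OpenAssistant-SFT-7-Llama-30B-GPTQ/OpenAssistant-SFT-7-Llama-30B-GPTQ-4bit.safetensors"
print(os.path.getsize(path))  # the real file is around 17 GB; a Git LFS pointer is only ~130 bytes
with open(path, "rb") as f:
    print(f.read(60))         # an LFS pointer starts with b"version https://git-lfs.github.com/spec/v1"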

Here's an example of how to use it. I use the download-model.py script from text-generation-webui for quick and easy downloading of HF models.

root@1c6b80974469:/workspace/test# python ~/text-generation-webui/download-model.py TheBloke/OpenAssistant-SFT-7-Llama-30B-GPTQ --threads 2
Downloading the model to models/TheBloke_OpenAssistant-SFT-7-Llama-30B-GPTQ
...
root@1c6b80974469:/workspace# ll models/TheBloke_OpenAssistant-SFT-7-Llama-30B-GPTQ/
total 16551713
drwxrwxrwx  2 root root     3001577 May 15 21:37 ./
drwxrwxrwx 22 root root     3040539 May 15 21:37 ../
-rw-rw-rw-  1 root root 16940554392 May 15 21:43 OpenAssistant-SFT-7-Llama-30B-GPTQ-4bit.safetensors
-rw-rw-rw-  1 root root        7367 May 15 21:37 README.md
-rw-rw-rw-  1 root root         133 May 15 21:37 added_tokens.json
-rw-rw-rw-  1 root root         568 May 15 21:37 config.json
-rw-rw-rw-  1 root root         137 May 15 21:37 generation_config.json
-rw-rw-rw-  1 root root         337 May 15 21:37 huggingface-metadata.txt
-rw-rw-rw-  1 root root         122 May 15 21:37 quantize_config.json
-rw-rw-rw-  1 root root         477 May 15 21:37 special_tokens_map.json
-rw-rw-rw-  1 root root     1843612 May 15 21:37 tokenizer.json
-rw-rw-rw-  1 root root      499723 May 15 21:37 tokenizer.model
-rw-rw-rw-  1 root root         715 May 15 21:37 tokenizer_config.json
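
As an alternative to the webui download script, huggingface_hub's snapshot_download should do the same job (a minimal sketch; it downloads into the default HF cache and returns the local path to pass as quantized_model_dir):

from huggingface_hub import snapshot_download

# Downloads every file in the repo and returns the local directory path.
local_dir = snapshot_download(repo_id="TheBloke/OpenAssistant-SFT-7-Llama-30B-GPTQ")
print(local_dir)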

Now run this script:

from transformers import AutoTokenizer, pipeline, logging
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

quantized_model_dir = "/workspace/models/TheBloke_OpenAssistant-SFT-7-Llama-30B-GPTQ"

model_basename = "OpenAssistant-SFT-7-Llama-30B-GPTQ-4bit"

use_strict = False

use_triton = False

print("Loading tokenizer")
tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)


print("Loading model")
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
        use_safetensors=True,
        strict=use_strict,
        model_basename=model_basename,
        device="cuda:0",
        use_triton=use_triton,
        quantize_config=None) # quantize_config will be loaded from the supplied quantize_config.json

# Inference using model.pipeline()

# Prevent printing spurious transformers error when using pipeline with AutoGPTQ - known bug.
logging.set_verbosity(logging.CRITICAL)

prompt = "Tell me about AI"
prompt_template=f'''### Human: {prompt}
### Assistant:'''

print("*** Pipeline:")
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

print(pipe(prompt_template)[0]['generated_text'])

# Inference using model.generate()
print("\n\n*** Generate:")

input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=0.7, max_new_tokens=512)
print(tokenizer.decode(output[0]))

Output:

root@1c6b80974469:/workspace# python simple_gptq_openassistant.py
Loading tokenizer
Loading model
*** Pipeline:
### Human: Tell me about AI
### Assistant: Artificial Intelligence (AI) is a field of computer science that focuses on creating machines capable of performing tasks that would normally require human intelligence. These tasks include understanding natural language, recognizing objects and patterns, making decisions based on data, and learning from experience.

The goal of AI research is to create intelligent systems that can work alongside humans or replace them in certain tasks. This includes developing algorithms and models that can learn from large amounts of data and make predictions or take actions based on that information.

There are many different types of AI, including machine learning, deep learning, natural language processing, robotics, and more. Each type has its own strengths and weaknesses, and they are used for a wide range of applications, such as image recognition, speech recognition, autonomous vehicles, chatbots, and much more.


*** Generate:
<s> ### Human: Tell me about AI
### Assistant: AI stands for Artificial Intelligence. It refers to the development of computer systems that can perform tasks that would normally require human intelligence, such as visual perception, speech recognition, decision-making, and language translation. AI systems are designed to learn from experience and adapt to new data, making them increasingly effective at performing complex tasks over time. Some common applications of AI include natural language processing, robotics, and machine learning.</s>
root@1c6b80974469:/workspace#

Can you give your environments requirement.txt

Just follow the instructions in the AutoGPTQ README. You should be able to just do: pip install auto-gptq. Or for the latest code:

git clone https://github.com/PanQiWei/AutoGPTQ
cd AutoGPTQ
pip install .
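
If you just want to see which versions ended up installed in your environment, here is a quick check from Python (a sketch using only the standard library; the package names below are the PyPI names, so adjust if your lookup differs):

from importlib.metadata import version, PackageNotFoundError

for pkg in ("auto-gptq", "transformers", "accelerate", "safetensors"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")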

Can you tell me about this error while loading the tokenizer? Is there anything wrong with the model path?

Loading tokenizer

HFValidationError Traceback (most recent call last)
Cell In[3], line 14
11 use_triton = False
13 print("Loading tokenizer")
---> 14 tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir, use_fast=True)

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:642, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
639 return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
641 # Next, let's try to use the tokenizer_config file to get the tokenizer class.
--> 642 tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
643 if "_commit_hash" in tokenizer_config:
644 kwargs["_commit_hash"] = tokenizer_config["_commit_hash"]

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:486, in get_tokenizer_config(pretrained_model_name_or_path, cache_dir, force_download, resume_download, proxies, use_auth_token, revision, local_files_only, subfolder, **kwargs)
424 """
425 Loads the tokenizer configuration from a pretrained model tokenizer configuration.
426
(...)
483 tokenizer_config = get_tokenizer_config("tokenizer-test")
484 ```"""
485 commit_hash = kwargs.get("_commit_hash", None)
--> 486 resolved_config_file = cached_file(
487 pretrained_model_name_or_path,
488 TOKENIZER_CONFIG_FILE,
489 cache_dir=cache_dir,
490 force_download=force_download,
491 resume_download=resume_download,
492 proxies=proxies,
493 use_auth_token=use_auth_token,
494 revision=revision,
495 local_files_only=local_files_only,
496 subfolder=subfolder,
497 _raise_exceptions_for_missing_entries=False,
498 _raise_exceptions_for_connection_errors=False,
499 _commit_hash=commit_hash,
500 )
501 if resolved_config_file is None:
502 logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.")

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/transformers/utils/hub.py:409, in cached_file(path_or_repo_id, filename, cache_dir, force_download, resume_download, proxies, use_auth_token, revision, local_files_only, subfolder, user_agent, _raise_exceptions_for_missing_entries, _raise_exceptions_for_connection_errors, _commit_hash)
406 user_agent = http_user_agent(user_agent)
407 try:
408 # Load from URL or cache if already cached
--> 409 resolved_file = hf_hub_download(
410 path_or_repo_id,
411 filename,
412 subfolder=None if len(subfolder) == 0 else subfolder,
413 revision=revision,
414 cache_dir=cache_dir,
415 user_agent=user_agent,
416 force_download=force_download,
417 proxies=proxies,
418 resume_download=resume_download,
419 use_auth_token=use_auth_token,
420 local_files_only=local_files_only,
421 )
423 except RepositoryNotFoundError:
424 raise EnvironmentError(
425 f"{path_or_repo_id} is not a local folder and is not a valid model identifier "
426 "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to "
427 "pass a token having permission to this repo with use_auth_token or log in with "
428 "huggingface-cli login and pass use_auth_token=True."
429 )

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:112, in validate_hf_hub_args.<locals>._inner_fn(*args, **kwargs)
107 for arg_name, arg_value in chain(
108 zip(signature.parameters, args), # Args values
109 kwargs.items(), # Kwargs values
110 ):
111 if arg_name in ["repo_id", "from_id", "to_id"]:
--> 112 validate_repo_id(arg_value)
114 elif arg_name == "token" and arg_value is not None:
115 has_token = True

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:160, in validate_repo_id(repo_id)
157 raise HFValidationError(f"Repo id must be a string, not {type(repo_id)}: '{repo_id}'.")
159 if repo_id.count("/") > 1:
--> 160 raise HFValidationError(
161 "Repo id must be in the form 'repo_name' or 'namespace/repo_name':"
162 f" '{repo_id}'. Use repo_type argument if needed."
163 )
165 if not REPO_ID_REGEX.match(repo_id):
166 raise HFValidationError(
167 "Repo id must use alphanumeric chars or '-', '
', '.', '--' and '..' are"
168 " forbidden, '-' and '.' cannot start or end the name, max length is 96:"
169 f" '{repo_id}'."
170 )

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/workspace/models/TheBloke_OpenAssistant-SFT-7-Llama-30B-GPTQ'. Use repo_type argument if needed.

Looks like you either didn't download the model, or didn't update the model path correctly.
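
That HFValidationError appears because transformers could not find the directory on disk, so it fell back to treating the path as a Hub repo id, which cannot contain more than one '/'. A quick sanity check before loading (a sketch, assuming the same variable name as the script above):

import os

quantized_model_dir = "/workspace/models/TheBloke_OpenAssistant-SFT-7-Llama-30B-GPTQ"
print(os.path.isdir(quantized_model_dir))       # must be True, otherwise transformers treats the path as a repo id
print(sorted(os.listdir(quantized_model_dir)))  # should list config.json, tokenizer.model and the .safetensors file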

Oh sorry, it was a problem with the model path. I have fixed that.

Now I am getting this error while loading the model:

The safetensors archive passed at models/TheBloke_OpenAssistant-SFT-7-Llama-30B-GPTQ/OpenAssistant-SFT-7-Llama-30B-GPTQ-4bit.safetensors does not contain metadata. Make sure to save your model with the save_pretrained method. Defaulting to 'pt' metadata.

ValueError Traceback (most recent call last)
Cell In[8], line 2
1 print("Loading model")
----> 2 model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
3 use_safetensors=True,
4 model_basename=model_basename,
5 device="cuda:5",
6 quantize_config=None)

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/auto_gptq/modeling/auto.py:63, in AutoGPTQForCausalLM.from_quantized(cls, save_dir, device, use_safetensors, use_triton, max_memory, device_map, quantize_config, model_basename, trust_remote_code)
49 @classmethod
50 def from_quantized(
51 cls,
(...)
60 trust_remote_code: bool = False
61 ) -> BaseGPTQForCausalLM:
62 model_type = check_and_get_model_type(save_dir)
---> 63 return GPTQ_CAUSAL_LM_MODEL_MAP[model_type].from_quantized(
64 save_dir=save_dir,
65 device=device,
66 use_safetensors=use_safetensors,
67 use_triton=use_triton,
68 max_memory=max_memory,
69 device_map=device_map,
70 quantize_config=quantize_config,
71 model_basename=model_basename,
72 trust_remote_code=trust_remote_code
73 )

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/auto_gptq/modeling/_base.py:544, in BaseGPTQForCausalLM.from_quantized(cls, save_dir, device, use_safetensors, use_triton, max_memory, device_map, quantize_config, model_basename, trust_remote_code)
541 if not max_memory and not device_map:
542 device_map = {"": device}
--> 544 model = accelerate.load_checkpoint_and_dispatch(
545 model, model_save_name, device_map, max_memory, no_split_module_classes=[cls.layer_type]
546 )
548 model_config = model.config.to_dict()
549 seq_len_keys = ["max_position_embeddings", "seq_length", "n_positions"]

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/accelerate/big_modeling.py:479, in load_checkpoint_and_dispatch(model, checkpoint, device_map, max_memory, no_split_module_classes, offload_folder, offload_buffers, dtype, offload_state_dict, preload_module_classes)
477 if offload_state_dict is None and device_map is not None and "disk" in device_map.values():
478 offload_state_dict = True
--> 479 load_checkpoint_in_model(
480 model,
481 checkpoint,
482 device_map=device_map,
483 offload_folder=offload_folder,
484 dtype=dtype,
485 offload_state_dict=offload_state_dict,
486 offload_buffers=offload_buffers,
487 )
488 if device_map is None:
489 return model

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/accelerate/utils/modeling.py:993, in load_checkpoint_in_model(model, checkpoint, device_map, offload_folder, dtype, offload_state_dict, offload_buffers)
991 offload_weight(param, param_name, state_dict_folder, index=state_dict_index)
992 else:
--> 993 set_module_tensor_to_device(model, param_name, param_device, value=param, dtype=dtype)
995 # Force Python to clean up.
996 del checkpoint

File ~/miniconda3/envs/textgen/lib/python3.10/site-packages/accelerate/utils/modeling.py:135, in set_module_tensor_to_device(module, tensor_name, device, value, dtype)
132 tensor_name = splits[-1]
134 if tensor_name not in module._parameters and tensor_name not in module._buffers:
--> 135 raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.")
136 is_buffer = tensor_name in module._buffers
137 old_value = getattr(module, tensor_name)

ValueError: QuantLinear() does not have a parameter or a buffer named bias.

Are you changing the code I sent you? You need to pass strict=False

Yes, that's because I am getting an unexpected keyword argument error like this when running the following:

print("Loading model")
model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
use_safetensors=True,
strict=False,
model_basename=model_basename,
device="cuda:5",
use_triton=use_triton,
quantize_config=None)

Loading model

TypeError Traceback (most recent call last)
Cell In[15], line 2
1 print("Loading model")
----> 2 model = AutoGPTQForCausalLM.from_quantized(quantized_model_dir,
3 use_safetensors=True,
4 strict=False,
5 model_basename=model_basename,
6 device="cuda:5",
7 use_triton=use_triton,
8 quantize_config=None)

TypeError: AutoGPTQForCausalLM.from_quantized() got an unexpected keyword argument 'strict'

OK, please install the latest auto-gptq from source and then try again with the code I gave you, changing only the model path:

git clone https://github.com/PanQiWei/AutoGPTQ
cd AutoGPTQ
pip install .
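
After the source install, you can check whether your build will accept the strict keyword before re-running the script (a small sketch using only the standard library):

import inspect
from auto_gptq import AutoGPTQForCausalLM

sig = inspect.signature(AutoGPTQForCausalLM.from_quantized)
# True means from_quantized(..., strict=False) will not raise the TypeError above,
# either because strict is an explicit parameter or because the method accepts **kwargs.
print("strict" in sig.parameters or any(
    p.kind is inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values()
))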
Kurapika993 changed discussion status to closed
