import importlib.metadata import warnings from copy import deepcopy from packaging import version from ..utils import is_accelerate_available, is_bitsandbytes_available, logging if is_bitsandbytes_available(): import bitsandbytes as bnb import torch import torch.nn as nn from ..pytorch_utils import Conv1D if is_accelerate_available(): from accelerate import init_empty_weights from accelerate.utils import find_tied_parameters logger = logging.get_logger(__name__) def set_module_quantized_tensor_to_device(module, tensor_name, device, value=None, fp16_statistics=None): """ A helper function to set a given tensor (parameter of buffer) of a module on a specific device (note that doing `param.to(device)` creates a new tensor not linked to the parameter, which is why we need this function). The function is adapted from `set_module_tensor_to_device` function from accelerate that is adapted to support the class `Int8Params` from `bitsandbytes`. Args: module (`torch.nn.Module`): The module in which the tensor we want to move lives. tensor_name (`str`): The full name of the parameter/buffer. device (`int`, `str` or `torch.device`): The device on which to set the tensor. value (`torch.Tensor`, *optional*): The value of the tensor (useful when going from the meta device to any other device). fp16_statistics (`torch.HalfTensor`, *optional*): The list of fp16 statistics to set on the module, used for serialization. """ # Recurse if needed if "." in tensor_name: splits = tensor_name.split(".") for split in splits[:-1]: new_module = getattr(module, split) if new_module is None: raise ValueError(f"{module} has no attribute {split}.") module = new_module tensor_name = splits[-1] if tensor_name not in module._parameters and tensor_name not in module._buffers: raise ValueError(f"{module} does not have a parameter or a buffer named {tensor_name}.") is_buffer = tensor_name in module._buffers old_value = getattr(module, tensor_name) if old_value.device == torch.device("meta") and device not in ["meta", torch.device("meta")] and value is None: raise ValueError(f"{tensor_name} is on the meta device, we need a `value` to put in on {device}.") is_4bit = False is_8bit = False if is_buffer or not is_bitsandbytes_available(): is_8bit = False is_4bit = False else: is_4bit = hasattr(bnb.nn, "Params4bit") and isinstance(module._parameters[tensor_name], bnb.nn.Params4bit) is_8bit = isinstance(module._parameters[tensor_name], bnb.nn.Int8Params) if is_8bit or is_4bit: param = module._parameters[tensor_name] if param.device.type != "cuda": if value is None: new_value = old_value.to(device) elif isinstance(value, torch.Tensor): new_value = value.to("cpu") if value.dtype == torch.int8: is_8bit_serializable = version.parse(importlib.metadata.version("bitsandbytes")) > version.parse( "0.37.2" ) if not is_8bit_serializable: raise ValueError( "Detected int8 weights but the version of bitsandbytes is not compatible with int8 serialization. " "Make sure to download the latest `bitsandbytes` version. `pip install --upgrade bitsandbytes`." ) else: new_value = torch.tensor(value, device="cpu") # Support models using `Conv1D` in place of `nn.Linear` (e.g. gpt2) by transposing the weight matrix prior to quantization. # Since weights are saved in the correct "orientation", we skip transposing when loading. if issubclass(module.source_cls, Conv1D) and fp16_statistics is None: new_value = new_value.T kwargs = old_value.__dict__ if is_8bit: new_value = bnb.nn.Int8Params(new_value, requires_grad=False, **kwargs).to(device) elif is_4bit: new_value = bnb.nn.Params4bit(new_value, requires_grad=False, **kwargs).to(device) module._parameters[tensor_name] = new_value if fp16_statistics is not None: setattr(module.weight, "SCB", fp16_statistics.to(device)) else: if value is None: new_value = old_value.to(device) elif isinstance(value, torch.Tensor): new_value = value.to(device) else: new_value = torch.tensor(value, device=device) if is_buffer: module._buffers[tensor_name] = new_value else: new_value = nn.Parameter(new_value, requires_grad=old_value.requires_grad) module._parameters[tensor_name] = new_value def _replace_with_bnb_linear( model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, has_been_replaced=False ): """ Private method that wraps the recursion for module replacement. Returns the converted model and a boolean that indicates if the conversion has been successfull or not. """ for name, module in model.named_children(): if current_key_name is None: current_key_name = [] current_key_name.append(name) if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert: # Check if the current key is not in the `modules_to_not_convert` if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): with init_empty_weights(): if isinstance(module, Conv1D): in_features, out_features = module.weight.shape else: in_features = module.in_features out_features = module.out_features if quantization_config.quantization_method() == "llm_int8": model._modules[name] = bnb.nn.Linear8bitLt( in_features, out_features, module.bias is not None, has_fp16_weights=quantization_config.llm_int8_has_fp16_weight, threshold=quantization_config.llm_int8_threshold, ) has_been_replaced = True else: if ( quantization_config.llm_int8_skip_modules is not None and name in quantization_config.llm_int8_skip_modules ): pass else: model._modules[name] = bnb.nn.Linear4bit( in_features, out_features, module.bias is not None, quantization_config.bnb_4bit_compute_dtype, compress_statistics=quantization_config.bnb_4bit_use_double_quant, quant_type=quantization_config.bnb_4bit_quant_type, ) has_been_replaced = True # Store the module class in case we need to transpose the weight later model._modules[name].source_cls = type(module) # Force requires grad to False to avoid unexpected errors model._modules[name].requires_grad_(False) if len(list(module.children())) > 0: _, has_been_replaced = _replace_with_bnb_linear( module, modules_to_not_convert, current_key_name, quantization_config, has_been_replaced=has_been_replaced, ) # Remove the last key for recursion current_key_name.pop(-1) return model, has_been_replaced def replace_with_bnb_linear(model, modules_to_not_convert=None, current_key_name=None, quantization_config=None): """ A helper function to replace all `torch.nn.Linear` modules by `bnb.nn.Linear8bit` modules from the `bitsandbytes` library. This will enable running your models using mixed int8 precision as described by the paper `LLM.int8(): 8-bit Matrix Multiplication for Transformers at Scale`. Make sure `bitsandbytes` compiled with the correct CUDA version of your hardware is installed before running this function. `pip install -i https://test.pypi.org/simple/ bitsandbytes` The function will be run recursively and replace all `torch.nn.Linear` modules except for the `lm_head` that should be kept as a `torch.nn.Linear` module. The replacement is done under `init_empty_weights` context manager so no CPU/GPU memory is required to run this function. Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) and systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models (>=176B parameters). Parameters: model (`torch.nn.Module`): Input model or `torch.nn.Module` as the function is run recursively. modules_to_not_convert (`List[`str`]`, *optional*, defaults to `["lm_head"]`): Names of the modules to not convert in `Linear8bitLt`. In practice we keep the `lm_head` in full precision for numerical stability reasons. current_key_name (`List[`str`]`, *optional*): An array to track the current key of the recursion. This is used to check whether the current key (part of it) is not in the list of modules to not convert (for instances modules that are offloaded to `cpu` or `disk`). """ modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert model, has_been_replaced = _replace_with_bnb_linear( model, modules_to_not_convert, current_key_name, quantization_config ) if not has_been_replaced: logger.warning( "You are loading your model in 8bit or 4bit but no linear modules were found in your model." " Please double check your model architecture, or submit an issue on github if you think this is" " a bug." ) return model # For backward compatibility def replace_8bit_linear(*args, **kwargs): warnings.warn( "`replace_8bit_linear` will be deprecated in a future version, please use `replace_with_bnb_linear` instead", FutureWarning, ) return replace_with_bnb_linear(*args, **kwargs) # For backward compatiblity def set_module_8bit_tensor_to_device(*args, **kwargs): warnings.warn( "`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead", FutureWarning, ) return set_module_quantized_tensor_to_device(*args, **kwargs) def get_keys_to_not_convert(model): r""" An utility function to get the key of the module to keep in full precision if any For example for CausalLM modules we may want to keep the lm_head in full precision for numerical stability reasons. For other architectures, we want to keep the tied weights of the model. The function will return a list of the keys of the modules to not convert in int8. Parameters: model (`torch.nn.Module`): Input model """ # Create a copy of the model and tie the weights, then # check if it contains tied weights tied_model = deepcopy(model) # this has 0 cost since it is done inside `init_empty_weights` context manager` tied_model.tie_weights() tied_params = find_tied_parameters(tied_model) # For compatibility with Accelerate < 0.18 if isinstance(tied_params, dict): tied_keys = sum(list(tied_params.values()), []) + list(tied_params.keys()) else: tied_keys = sum(tied_params, []) has_tied_params = len(tied_keys) > 0 # If there is not tied weights, we want to keep the lm_head(output_embedding) in full precision if not has_tied_params: output_emb = model.get_output_embeddings() if output_emb is not None: list_last_module = [name for name, module in model.named_modules() if id(module) == id(output_emb)] return list_last_module # otherwise, no tied weights, no output embedding defined, simply keep the last module in full precision list_modules = list(model.named_parameters()) list_last_module = [list_modules[-1][0]] # add last module together with tied weights intersection = set(list_last_module) - set(tied_keys) list_untouched = list(set(tied_keys)) + list(intersection) # remove ".weight" from the keys names_to_remove = [".weight", ".bias"] filtered_module_names = [] for name in list_untouched: for name_to_remove in names_to_remove: if name_to_remove in name: name = name.replace(name_to_remove, "") filtered_module_names.append(name) return filtered_module_names