| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from collections import OrderedDict |
| from copy import deepcopy |
| from typing import Dict, List, Optional |
| from typing import OrderedDict as OrderedDictType |
| from typing import Union |
|
|
| import torch |
| from compressed_tensors.config import CompressionFormat |
| from compressed_tensors.modeling import ( |
| initialize_hooked_attention, |
| initialize_hooked_kv_cache, |
| ) |
| from compressed_tensors.quantization.lifecycle.initialize import ( |
| initialize_module_for_quantization, |
| is_attention_module, |
| ) |
| from compressed_tensors.quantization.quant_args import QuantizationArgs |
| from compressed_tensors.quantization.quant_config import ( |
| QuantizationConfig, |
| QuantizationStatus, |
| ) |
| from compressed_tensors.quantization.quant_scheme import QuantizationScheme |
| from compressed_tensors.utils.helpers import replace_module |
| from compressed_tensors.utils.match import ( |
| is_narrow_match, |
| match_named_modules, |
| match_targets, |
| ) |
| from compressed_tensors.utils.offload import update_parameter_data |
| from compressed_tensors.utils.safetensors_load import get_safetensors_folder |
| from loguru import logger |
| from safetensors import safe_open |
| from torch.nn import Module |
|
|
|
|
| __all__ = [ |
| "load_pretrained_quantization_parameters", |
| "apply_quantization_config", |
| ] |
|
|
| from compressed_tensors.quantization.utils.helpers import is_module_quantized |
| from compressed_tensors.utils.safetensors_load import ( |
| get_quantization_parameter_to_path_mapping, |
| ) |
|
|
|
|
def load_pretrained_quantization_parameters(
    model: Module,
    model_name_or_path: Optional[str] = None,
    load_weight_qparams: Optional[bool] = False,
):
    """
    Load quantization parameters (scale and zero point) from a pretrained
    checkpoint at model_name_or_path into a model that has already been
    initialized with a quantization config.

    NOTE: input/output activation parameters are always loaded. Weight
    parameters are loaded only when load_weight_qparams is True.

    :param model: model to load pretrained quantization parameters to
    :param model_name_or_path: Hugging Face stub or local folder containing a quantized
        model, which is used to load quantization parameters
    :param load_weight_qparams: whether or not the weight quantization parameters
        should be loaded
    """
    model_path = get_safetensors_folder(model_name_or_path)
    mapping = get_quantization_parameter_to_path_mapping(model_path)

    for name, submodule in model.named_modules():
        if not is_module_quantized(submodule):
            continue

        scheme = submodule.quantization_scheme
        # (base_name, condition) pairs determining which qparams to load
        qparam_targets = (
            ("input", scheme.input_activations is not None),
            ("output", scheme.output_activations is not None),
            ("weight", load_weight_qparams and scheme.weights),
        )
        for base_name, should_load in qparam_targets:
            if should_load:
                _load_quant_args_from_mapping(
                    base_name=base_name,
                    module_name=name,
                    module=submodule,
                    mapping=mapping,
                )
|
|
|
|
def apply_quantization_config(
    model: Module, config: Union[QuantizationConfig, None], run_compressed: bool = False
):
    """
    Initializes the model for quantization in-place based on the given config.
    Optionally converts quantizable modules to compressed_linear modules

    :param model: model to apply quantization config to
    :param config: quantization config; if None, this function is a no-op and
        returns an empty dict
    :param run_compressed: Whether the model will be run in compressed mode or
        decompressed fully on load
    """
    from compressed_tensors.linear.compressed_linear import CompressedLinear

    # guard before any work (the original deep-copied None first)
    if config is None:
        return dict()

    # copy so scheme mutations below never leak back to the caller's config
    config = deepcopy(config)

    # compressed checkpoints already carry calibrated zero points; only force
    # fresh zero-point params when the model is not yet compressed
    force_zero_point = config.quantization_status != QuantizationStatus.COMPRESSED

    if config.kv_cache_scheme is not None:
        _apply_kv_cache_scheme(
            model, config.kv_cache_scheme, config.quantization_status
        )

    # build mapping of targets to schemes; ordered so that earlier config
    # groups take precedence when a module matches multiple targets
    target_to_scheme = OrderedDict()
    for scheme in config.config_groups.values():
        for target in scheme.targets:
            target_to_scheme[target] = scheme

    # mark matched modules for quantization by attaching a quantization_scheme
    for name, submodule in match_named_modules(
        model, target_to_scheme, config.ignore, warn_on_fail=True
    ):
        matched_targets = match_targets(name, submodule, target_to_scheme)
        scheme = _scheme_from_targets(target_to_scheme, matched_targets, name)

        submodule.quantization_scheme = scheme

        if (
            run_compressed
            and isinstance(submodule, torch.nn.Linear)
            and config.format != CompressionFormat.dense.value
        ):
            # swap the linear for one that executes in compressed form
            compressed_linear = CompressedLinear.from_linear(
                submodule,
                quantization_scheme=scheme,
                quantization_format=config.format,
            )
            replace_module(model, name, compressed_linear)

        else:
            # attention modules matched directly (not via a broad wildcard)
            # need hooks installed before qparam initialization
            if is_attention_module(submodule) and is_narrow_match(
                model, scheme.targets, name
            ):
                initialize_hooked_attention(model, submodule)

            initialize_module_for_quantization(
                submodule,
                force_zero_point=force_zero_point,
            )

        submodule.quantization_status = config.quantization_status
|
|
|
|
def _apply_kv_cache_scheme(
    model: torch.nn.Module,
    kv_cache_scheme: QuantizationArgs,
    status: QuantizationStatus,
):
    """
    Attach a kv-cache quantization scheme to every attention module in the model
    and initialize its quantization parameters.

    :param model: model whose attention modules are configured
    :param kv_cache_scheme: quantization args applied to the kv cache
        (as input activations on each attention module)
    :param status: quantization status assigned to each configured module
    """
    # BUG FIX: original code did `raise logger.warning(...)`, which raises the
    # warning call's return value (None) and crashes with
    # "TypeError: exceptions must derive from BaseException".
    # The intent is an advisory about downstream vLLM support, so warn only.
    if not kv_cache_scheme.symmetric:
        logger.warning("vLLM does not support asymmetric kv cache quantization")

    scheme = QuantizationScheme(
        targets=[".*self_attn$"],
        input_activations=kv_cache_scheme,
    )
    for submodule in model.modules():
        if is_attention_module(submodule):
            submodule.quantization_scheme = scheme
            initialize_hooked_kv_cache(model, submodule)
            initialize_module_for_quantization(submodule, force_zero_point=False)
            submodule.quantization_status = status
|
|
|
|
def _load_quant_args_from_mapping(
    base_name: str, module_name: str, module: Module, mapping: Dict
):
    """
    Loads scale and zero point from a state_dict into the specified module

    :param base_name: quantization target, one of: weights, input_activations or
        output_activations
    :param module_name: pytorch module name to look up in state_dict
    :param module: pytorch module associated with module_name
    :param mapping: mapping used to fetch paths on disk for a given parameter
    """
    scale_name = f"{base_name}_scale"
    zp_name = f"{base_name}_zero_point"
    g_idx_name = f"{base_name}_g_idx"

    def _read_tensor(path: str, param_name: str) -> torch.Tensor:
        # each parameter's safetensors shard is opened on cpu and read once
        with safe_open(path, framework="pt", device="cpu") as f:
            return f.get_tensor(f"{module_name}.{param_name}")

    g_idx_path = mapping.get(f"{module_name}.{g_idx_name}", None)
    if g_idx_path is not None:
        update_parameter_data(module, _read_tensor(g_idx_path, g_idx_name), g_idx_name)

    scale_path = mapping.get(f"{module_name}.{scale_name}", None)
    if scale_path is not None:
        scale = _read_tensor(scale_path, scale_name)
        update_parameter_data(module, scale, scale_name)

        # zero point is loaded only alongside a scale; a missing zero point on
        # disk means symmetric quantization, so default to zeros shaped like scale
        zp_path = mapping.get(f"{module_name}.{zp_name}", None)
        if zp_path is None:
            zero_point = torch.zeros_like(scale, device="cpu")
        else:
            zero_point = _read_tensor(zp_path, zp_name)

        update_parameter_data(module, zero_point, zp_name)
|
|
|
|
| def _scheme_from_targets( |
| target_to_scheme: OrderedDictType[str, QuantizationScheme], |
| targets: List[str], |
| name: str, |
| ) -> QuantizationScheme: |
| |
| |
| return target_to_scheme[targets[0]] |
|
|