| |
| |
| |
| |
|
|
| import logging |
| import re |
| from operator import attrgetter, itemgetter |
|
|
| import numpy as np |
| import torch.distributed as dist |
| import torch.nn as nn |
|
|
| from .modules import PQConv2d, PQEmbedding, PQLinear |
| from .pq import PQ |
|
|
|
|
def quantize_model_(
    model,
    size_tracker,
    layers_to_quantize,
    block_sizes_config,
    n_centroids_config,
    step=0,
    n_iter=15,
    eps=1e-6,
    max_tentatives=100,
    verbose=True,
):
    """
    Quantize a model in-place by stages. All the targeted
    layers are replaced by their quantized counterpart,
    and the model is ready for the finetuning of the
    centroids in a standard training loop (no modifications
    required). Note that we do not quantize biases.

    Args:
        - model: a nn.Module
        - size_tracker: useful for tracking quantization statistics;
          its update(weight, block_size, n_centroids) method is called
          once per quantized layer
        - layers_to_quantize: a list containing regexps for
          filtering the layers to quantize at each stage according
          to their name (as in model.named_parameters())
        - block_sizes_config: dict like
          {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
              'Linear': ('in_features', {'*': 8})
          }
          For instance, all conv2d layers with kernel size 3x3 have
          a block size of 9 and all Linear layers are quantized with
          a block size of 8, irrespective of their size.
        - n_centroids_config: dict like
          {
              'Conv2d': ('kernel_size', {'*': 256}),
              'Linear': ('in_features', {'*': 256})
          }
          For instance, all conv2d layers are quantized with 256 centroids
        - step: the layers to quantize inplace corresponding
          to layers_to_quantize[step]
        - n_iter, eps, max_tentatives: forwarded unchanged to the PQ
          quantizer (see PQ for their exact meaning)
        - verbose: if True, log progress; logging is only enabled on
          the master process (rank 0) when torch.distributed is initialized

    Returns:
        - the list of names of the layers that were quantized at this step

    Raises:
        - ValueError: if a selected module is not a nn.Linear,
          nn.Embedding or nn.Conv2d
    """

    # names of the layers selected for this stage
    quantized_layers = get_layers(model, layers_to_quantize[step])

    # book-keeping: the process role does not change while we iterate,
    # so decide once (outside the loop) whether this process logs
    is_distributed = dist.is_initialized()
    is_master_process = (not is_distributed) or dist.get_rank() == 0
    verbose = verbose and is_master_process

    for layer in quantized_layers:

        # get block size and centroids
        module = attrgetter(layer)(model)
        block_size = get_param(module, layer, block_sizes_config)
        n_centroids = get_param(module, layer, n_centroids_config)
        if verbose:
            logging.info(
                f"Quantizing layer {layer} with block size {block_size} and {n_centroids} centroids"
            )

        # quantize a copy of the weights; the bias (if any) is kept as is
        weight = module.weight.data.clone()
        has_bias = any(name == "bias" for name, _ in module.named_parameters())
        bias = module.bias.data.clone() if has_bias else None
        quantizer = PQ(
            weight,
            block_size,
            n_centroids=n_centroids,
            n_iter=n_iter,
            eps=eps,
            max_tentatives=max_tentatives,
            verbose=verbose,
        )

        # quantization performed on all processes
        quantizer.encode()
        centroids = quantizer.centroids.contiguous()
        assignments = quantizer.assignments.contiguous()

        # broadcast the results of rank 0 so that every process ends up
        # with the same centroids and assignments
        if is_distributed:
            dist.broadcast(centroids, 0)
            dist.broadcast(assignments, 0)

        # instantiate the quantized counterpart of the module
        if isinstance(module, nn.Linear):
            quantized_module = PQLinear(
                centroids,
                assignments,
                bias,
                module.in_features,
                module.out_features,
            )
        elif isinstance(module, nn.Embedding):
            quantized_module = PQEmbedding(
                centroids,
                assignments,
                module.num_embeddings,
                module.embedding_dim,
            )
        elif isinstance(module, nn.Conv2d):
            quantized_module = PQConv2d(
                centroids,
                assignments,
                bias,
                module.in_channels,
                module.out_channels,
                module.kernel_size,
                stride=module.stride,
                padding=module.padding,
                dilation=module.dilation,
                groups=module.groups,
                padding_mode=module.padding_mode,
            )
        else:
            raise ValueError(f"Module {module} not yet supported for quantization")

        # replace the layer by its quantized counterpart
        attrsetter(layer)(model, quantized_module)

        # update statistics
        size_tracker.update(weight, block_size, n_centroids)

    # return the names of the quantized layers
    return quantized_layers
|
|
|
|
def get_layers(model, filter_regexp):
    """
    Select the layer names of a model that match a regexp.
    Biases are never returned.

    Args:
        - model: a nn.Module
        - filter_regexp: a regexp selecting the layers to keep
          according to their name in model.named_parameters().
          For instance, the regexp:

              down_layers\\.[123456]\\.(conv[12]|identity\\.conv)

          is keeping blocks down_layers from 1 to 6, and inside
          each block is keeping conv1, conv2 and identity.conv.

    Returns:
        - the list of matching layer names, in the order given by
          model.named_parameters()

    Remarks:
        - We add (module\\.)? at the beginning of the regexp to
          account for the possible use of nn.parallel.DataParallel
        - Any parameter whose name contains "bias" is skipped, and the
          substrings ".weight_orig" and ".weight" are removed from the
          remaining names before matching
        - The regexp is applied with re.match, i.e. it is anchored at
          the start of the name only
    """

    named_params = model.named_parameters()

    # optional DataParallel prefix followed by the user-provided pattern
    pattern = re.compile("(module\\.)?" + "(" + filter_regexp + ")")

    selected = []
    for param_name, _ in named_params:
        # biases are not quantized
        if "bias" in param_name:
            continue

        # turn the parameter name into the name of the owning layer
        layer_name = param_name.replace(".weight_orig", "")
        layer_name = layer_name.replace(".weight", "")

        if pattern.match(layer_name):
            selected.append(layer_name)

    return selected
|
|
|
|
def get_param(module, layer_name, param_config):
    """
    Look up the quantization parameter that applies to a module
    in a quantization configuration.

    Args:
        - module: a nn.Module
        - layer_name: the name of the layer
        - param_config: a dict like
          {
              'Conv2d': ('kernel_size', {'(3, 3)': 9, '(1, 1)': 4}),
              'Linear': ('in_features', {'*': 8})
          }
          For instance, all conv2d layers with kernel size 3x3 have
          a block size of 9 and all Linear layers are quantized with
          a block size of 8, irrespective of their size.

    Returns:
        - the value stored in the configuration for this module

    Raises:
        - KeyError: if the class name of the module is not in param_config,
          or if no entry (nor the '*' fallback) applies to the module

    Remarks:
        - the configuration is indexed by the class name of the module,
          then by the string representation of the chosen attribute
        - if 'fuzzy_name' is passed as a parameter, layers whose layer_name
          include 'fuzzy_name' will be assigned the given parameter
          (the first matching entry of the dict wins).
          In the following example, conv.expand layers will have a block
          size of 9 while conv.reduce will have a block size of 4 and all
          other layers will have a block size of 2.
          {
              'Conv2d': ('fuzzy_name', {'expand': 9, 'reduce': 4, '*': 2}),
              'Linear': ('fuzzy_name', {'classifier': 8, 'projection': 4})
          }

    """

    layer_type = module.__class__.__name__

    if layer_type not in param_config:
        raise KeyError(f"Layer type {layer_type} not in config for layer {module}")

    feature, params = param_config[module.__class__.__name__]

    if feature == "fuzzy_name":
        # keys of the config are matched as substrings of the layer name
        matches = [candidate for candidate in params if candidate in layer_name]
        if matches:
            selected = matches[0]
        elif "*" in params:
            selected = "*"
        else:
            raise KeyError(f"name={layer_name} not in config for {module}")
        return params[selected]

    # regular case: index the config by the stringified attribute value
    selected = str(getattr(module, feature))
    if selected in params:
        return params[selected]
    if "*" not in params:
        raise KeyError(f"{feature}={selected} not in config for layer {module}")
    return params["*"]
|
|
|
|
class SizeTracker(object):
    """
    Keeps track of the size of a network compressed with iPQ.

    Args:
        - model: a nn.Module

    Remarks:
        - The compressed size is the sum of three components
          for each layer in the network:
          (1) Storing the centroids given by iPQ in fp16
          (2) Storing the assignments of the blocks in int8
          (3) Storing all non-compressed elements such as biases
        - This cost is only valid if we use 256 centroids (then
          indexing can indeed be done with int8).
        - All sizes are expressed in MB; non-compressed parameters
          are counted as 4 bytes each.
    """

    def __init__(self, model):
        self.model = model
        # size of the original model, kept as the reference for the ratio
        self.size_non_compressed_model = self.compute_size()
        # part of the model that is still stored uncompressed
        self.size_non_quantized = self.size_non_compressed_model
        # running totals for the quantized layers
        self.size_index = 0
        self.size_centroids = 0
        self.n_quantized_layers = 0

    @staticmethod
    def _bytes_to_mb(n_bytes):
        """
        Converts a number of bytes into MB.
        """
        return n_bytes / 1024 / 1024

    def compute_size(self):
        """
        Computes the size of the model (in MB), counting
        4 bytes per parameter.
        """

        n_params = sum(p.numel() for _, p in self.model.named_parameters())
        return self._bytes_to_mb(n_params * 4)

    def update(self, W, block_size, n_centroids):
        """
        Updates the running statistics when quantizing a new layer.

        Args:
            - W: the weight of the quantized layer (only W.numel() is used)
            - block_size: the block size used to quantize the layer
            - n_centroids: the number of centroids used to quantize the layer
        """

        # bits needed to index one centroid, spread over one block
        bits_per_weight = np.log2(n_centroids) / block_size
        self.n_quantized_layers += 1
        n_weights = W.numel()

        # size of the assignments (bits -> bytes -> MB)
        self.size_index += self._bytes_to_mb(bits_per_weight * n_weights / 8)

        # size of the centroids, 2 bytes per stored value
        self.size_centroids += self._bytes_to_mb(n_centroids * block_size * 2)

        # the layer is no longer stored uncompressed (4 bytes per weight)
        self.size_non_quantized -= self._bytes_to_mb(n_weights * 4)

    def __repr__(self):
        size_compressed = (
            self.size_index + self.size_centroids + self.size_non_quantized
        )
        compression_ratio = self.size_non_compressed_model / size_compressed
        summary = f"Non-compressed model size: {self.size_non_compressed_model:.2f} MB. "
        summary += f"After quantizing {self.n_quantized_layers} layers, size "
        summary += f"(indexing + centroids + other): {self.size_index:.2f} MB + "
        summary += f"{self.size_centroids:.2f} MB + {self.size_non_quantized:.2f} MB = "
        summary += f"{size_compressed:.2f} MB, compression ratio: {compression_ratio:.2f}x"
        return summary
|
|
|
|
def attrsetter(*items):
    """
    Counterpart of operator.attrgetter for assignment.

    Args:
        - items: one or more attribute paths, possibly dotted
          (e.g. 'layer1.0.conv1')

    Returns:
        - a function g(obj, val) that, for each path, walks from obj
          through every component but the last with getattr and then
          sets the last component to val with setattr
    """

    def assign(root, dotted_path, value):
        # everything before the last dot designates the parent object
        *parents, leaf = dotted_path.split(".")
        target = root
        for name in parents:
            target = getattr(target, name)
        setattr(target, leaf, value)

    def g(obj, val):
        for dotted_path in items:
            assign(obj, dotted_path, val)

    return g
|
|