# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# This file defines example configuration arguments for quantizing
# a transformer model with product quantization.
#
# Layout: three top-level sections (n_centroids, block_sizes,
# layers_to_quantize). The first two are keyed by layer type
# (Linear / Embedding); each entry gives a `key` naming the lookup
# criterion and a `value` mapping from that criterion to a number.

# Number of centroids for product quantization, by default 256 (byte-aligned).
# NOTE(review): "*" looks like a catch-all entry applied to every value of
# `key` -- confirm against the code that loads this file.
n_centroids:
    Linear:
        key: in_features
        value: {"*": 256}
    Embedding:
        key: embedding_dim
        value: {"*": 256}

# Block sizes for product quantization.
# We suggest: 8 for FFN, 4 for ATTN, 4 for embedding projections, 8 for embeddings.
# NOTE(review): with `key: fuzzy_name` the map keys (fc, attn, emb) are
# presumably matched as substrings of the layer name -- confirm in the loader.
block_sizes:
    Linear:
        key: fuzzy_name
        value: {fc: 8, attn: 4, emb: 4}
    Embedding:
        key: fuzzy_name
        value: {emb: 8}

# Layers to quantize sequentially, one regular expression per step.
# We suggest: first FFN, then EMB, then ATTN.
#
# The patterns are single-quoted so YAML takes them literally. YAML only
# interprets backslash escapes inside double-quoted scalars, so a literal dot
# must be written `\.` here; writing `\\.` would yield a regex that demands a
# real backslash character followed by any character, which cannot match
# dotted layer names.
layers_to_quantize:
    - 'decoder\.layers\.\d+\.fc[12]'
    - 'decoder\.embed_tokens\.embeddings\.[012]\.[01]'
    - 'decoder\.layers\.\d+\.self_attn\.(k_proj|v_proj|q_proj|out_proj)'