Andrei Panferov committed on
Commit 115e749
1 Parent(s): dfb8eb3

newer inference

Files changed (2)
  1. config.json +77 -19
  2. inference.py +73 -23
config.json CHANGED
@@ -1,34 +1,92 @@
 {
-  "architectures": [
-    "LlamaForCausalLM_AQLM"
-  ],
-  "auto_map": {
-    "AutoConfig": "configuration_llama_aqlm.LlamaConfig",
-    "AutoModelForCausalLM": "modeling_llama_aqlm.LlamaForCausalLM"
-  },
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "hidden_act": "silu",
+  "vocab_size": 32000,
+  "max_position_embeddings": 4096,
   "hidden_size": 4096,
-  "initializer_range": 0.02,
   "intermediate_size": 11008,
-  "max_position_embeddings": 4096,
-  "model_type": "llama_aqlm",
-  "num_attention_heads": 32,
   "num_hidden_layers": 32,
+  "num_attention_heads": 32,
   "num_key_value_heads": 32,
-  "pretraining_tp": 1,
+  "hidden_act": "silu",
+  "initializer_range": 0.02,
   "rms_norm_eps": 1e-05,
+  "pretraining_tp": 1,
+  "use_cache": true,
+  "rope_theta": 10000.0,
   "rope_scaling": null,
-  "tie_word_embeddings": false,
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "return_dict": true,
+  "output_hidden_states": false,
+  "output_attentions": false,
+  "torchscript": false,
   "torch_dtype": "float16",
-  "transformers_version": "4.31.0.dev0",
-  "use_cache": true,
-  "vocab_size": 32000,
+  "use_bfloat16": false,
+  "tf_legacy_loss": false,
+  "pruned_heads": {},
+  "tie_word_embeddings": false,
+  "is_encoder_decoder": false,
+  "is_decoder": false,
+  "cross_attention_hidden_size": null,
+  "add_cross_attention": false,
+  "tie_encoder_decoder": false,
+  "max_length": 20,
+  "min_length": 0,
+  "do_sample": false,
+  "early_stopping": false,
+  "num_beams": 1,
+  "num_beam_groups": 1,
+  "diversity_penalty": 0.0,
+  "temperature": 1.0,
+  "top_k": 50,
+  "top_p": 1.0,
+  "typical_p": 1.0,
+  "repetition_penalty": 1.0,
+  "length_penalty": 1.0,
+  "no_repeat_ngram_size": 0,
+  "encoder_no_repeat_ngram_size": 0,
+  "bad_words_ids": null,
+  "num_return_sequences": 1,
+  "chunk_size_feed_forward": 0,
+  "output_scores": false,
+  "return_dict_in_generate": false,
+  "forced_bos_token_id": null,
+  "forced_eos_token_id": null,
+  "remove_invalid_values": false,
+  "exponential_decay_length_penalty": null,
+  "suppress_tokens": null,
+  "begin_suppress_tokens": null,
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "finetuning_task": null,
+  "id2label": {
+    "0": "LABEL_0",
+    "1": "LABEL_1"
+  },
+  "label2id": {
+    "LABEL_0": 0,
+    "LABEL_1": 1
+  },
+  "tokenizer_class": null,
+  "prefix": null,
+  "bos_token_id": 1,
+  "pad_token_id": null,
+  "eos_token_id": 2,
+  "sep_token_id": null,
+  "decoder_start_token_id": null,
+  "task_specific_params": null,
+  "problem_type": null,
+  "_name_or_path": "",
+  "transformers_version": "4.36.2",
   "aqlm": {
     "nbits_per_codebook": 16,
     "num_codebooks": 1,
     "out_group_size": 1,
     "in_group_size": 8
+  },
+  "model_type": "llama_aqlm",
+  "auto_map": {
+    "AutoConfig": "configuration_llama_aqlm.LlamaConfig",
+    "AutoModelForCausalLM": "modeling_llama_aqlm.LlamaForCausalLM"
   }
 }
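
With "model_type": "llama_aqlm" and auto_map pointing at configuration_llama_aqlm.LlamaConfig and modeling_llama_aqlm.LlamaForCausalLM, the transformers Auto classes can resolve the custom code shipped alongside this config. A minimal loading sketch, using a placeholder repository id and the standard trust_remote_code path (not part of this commit):

# Minimal loading sketch; "your-org/llama-2-7b-aqlm" is a hypothetical placeholder repo id.
# trust_remote_code=True lets transformers import the classes named in auto_map.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your-org/llama-2-7b-aqlm"  # placeholder, replace with the actual repository
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,       # needed for configuration_llama_aqlm / modeling_llama_aqlm
    torch_dtype=torch.float16,    # matches "torch_dtype": "float16" above
)
tokenizer = AutoTokenizer.from_pretrained(repo_id)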
inference.py CHANGED
@@ -135,7 +135,7 @@ def forward_pass_quantized_linear(
     bias: Optional[torch.Tensor],
 ) -> torch.Tensor:
     if input.is_cuda:
-        return aqlm_gemm_stupid(input, codes, codebooks, scales, bias)
+        return triton_matmul(input, codes, codebooks, scales, bias)
     else:
         dequantized_weight = _dequantize_weight(
             unpack_int_data(codes, codebooks.shape[0].bit_length() - 1),
@@ -160,7 +160,6 @@ def forward_pass_quantized_linear(
         "in_group_size",
         "num_input_groups",
         "num_input_groups_next_power_of_2",
-        "has_bias",
         "compute_in_fp32",
     ],
 )
@@ -168,7 +167,7 @@
 def _aqlm_gemv_simple(
     input_vec_ptr,
     output_vec_ptr,
-    codes_i16_ptr,
+    codes_ptr,
     codebooks_ptr,
     scales_ptr,
     bias_ptr,
@@ -181,7 +180,6 @@ def _aqlm_gemv_simple(
     num_input_groups: tl.constexpr,
     num_input_groups_next_power_of_2: tl.constexpr,
     compute_in_fp32: tl.constexpr,
-    has_bias: tl.constexpr,
     UNUSED: tl.constexpr,
 ):
     # variables ending with "_i" mean "for i-th output unit"
@@ -203,7 +201,7 @@
     # Stage 2: load integer codes for the active row
     # [in_features // in_group_size, num_codebooks]
     codes_i_ptrs = (
-        codes_i16_ptr
+        codes_ptr
         + pid * num_input_groups * num_codebooks
         + tl.arange(0, num_input_groups_next_power_of_2)[:, None] * num_codebooks
         + tl.arange(0, num_codebooks)[None, :]
@@ -211,15 +209,12 @@
     codes_i_mask_1d = tl.arange(0, num_input_groups_next_power_of_2) < num_input_groups
 
     codes_i = tl.load(codes_i_ptrs, mask=codes_i_mask_1d[:, None])  # [in_features//in_group_size, num_codebooks]
-    if codes_i.dtype == tl.int16:
-        codes_i = codes_i.to(tl.int32)
-        codes_i = (codes_i) + (codes_i < 0) * codebook_size  # aka 2 ** nbits_per_codebook
-        # ^-- (because codes are int16 tensors that contain uint data)
+    codes_i = codes_i.to(tl.int32)
+    codes_i = (codes_i) + (codes_i < 0) * codebook_size  # aka 2 ** nbits_per_codebook
+    # ^-- (because codes are int16 tensors that contain uint data)
 
-        # The following alternative does not work:
-        # codes_i = codes_i.to(tl.int32) % codebook_size  # aka 2 ** nbits_per_codebook
-    else:
-        codes_i = codes_i.to(tl.int32)
+    # The following alternative does not work:
+    # codes_i = codes_i.to(tl.int32) % codebook_size  # aka 2 ** nbits_per_codebook
 
     # shift codes_i so that codebooks after 0th point to correct indices in codebooks_ptr
     codes_i += tl.arange(0, num_codebooks)[None, :] * codebook_size  # aka 2 ** nbits_per_codebook
@@ -280,7 +275,7 @@
     assert input_vec.ndim == 2 and input_vec.shape[0] == 1, "do reshape; now!"
     assert scales.shape == (out_features // out_group_size, 1, 1, 1)
     assert in_features % in_group_size == 0
-    assert codebooks.shape[1] == 2**16
+    assert codebooks.shape[1] < 2**32
 
     output_vec = torch.empty(1, out_features, device=device, dtype=dtype)
     # 1D launch kernel where each block computes output unit
@@ -301,7 +296,6 @@
         num_input_groups,
         next_power_of_2(num_input_groups),
         compute_in_fp32,
-        bias is not None,
     )
 
     return output_vec
@@ -315,11 +309,67 @@
     bias: Optional[torch.Tensor],
     compute_in_fp32: bool = True,
 ):
-    original_shape = input.shape
-    input = input.reshape(-1, original_shape[-1])
-    return torch.cat(
-        [
-            aqlm_gemv_simple(input_vec.unsqueeze(0), codes_i16, codebooks, scales, bias, compute_in_fp32)
-            for input_vec in input
-        ]
-    ).reshape(original_shape[:-1] + (-1,))
+    device, dtype = codebooks.device, codebooks.dtype
+    num_codebooks, codebook_size, out_group_size, in_group_size = codebooks.shape
+    in_features = input.shape[1]
+    out_features = codes_i16.shape[0] * out_group_size
+    num_input_groups = codes_i16.shape[1]
+    assert input.ndim == 2
+    assert scales.shape == (out_features // out_group_size, 1, 1, 1)
+    assert in_features % in_group_size == 0
+    assert codebooks.shape[1] < 2**32
+
+    output = torch.empty(input.shape[0], out_features, device=device, dtype=dtype)
+    for i in range(input.shape[0]):
+        # 1D launch kernel where each block computes output unit
+        grid = lambda META: (out_features // out_group_size,)
+        _aqlm_gemv_simple[grid](
+            input[i],
+            output[i],
+            codes_i16,
+            codebooks,
+            scales,
+            bias,
+            in_features,
+            out_features,
+            num_codebooks,
+            codebook_size,
+            out_group_size,
+            in_group_size,
+            num_input_groups,
+            next_power_of_2(num_input_groups),
+            compute_in_fp32,
+        )
+
+    return output
+
+
+def triton_matmul(
+    input: torch.Tensor,
+    codes: torch.IntTensor,
+    codebooks: torch.Tensor,
+    scales: torch.Tensor,
+    bias: Optional[torch.Tensor],
+    compute_in_fp32: bool = True,
+) -> torch.Tensor:
+    input_shape = input.shape
+    input = input.reshape(-1, input_shape[-1])
+
+    if input.shape[0] == 1:
+        return aqlm_gemv_simple(
+            input,
+            codes,
+            codebooks,
+            scales,
+            bias,
+            compute_in_fp32,
+        ).reshape(input_shape[:-1] + (-1,))
+    else:
+        return aqlm_gemm_stupid(
+            input,
+            codes,
+            codebooks,
+            scales,
+            bias,
+            compute_in_fp32,
+        ).reshape(input_shape[:-1] + (-1,))
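
The CUDA path now goes through triton_matmul, which flattens the input to 2-D and dispatches to aqlm_gemv_simple for a single row or to the kernel-per-row loop in aqlm_gemm_stupid for batches. Inside the kernel, the int16 branch is gone: the loaded codes are always widened to int32 and codebook_size is added to negative entries, which is how 16-bit unsigned codebook indices stored in an int16 tensor are recovered. A standalone sketch of that sign trick in plain PyTorch (illustrative only, not part of inference.py):

# Illustrative sketch: AQLM stores uint16 codebook indices in an int16 tensor,
# so indices >= 2**15 show up as negative values. Widening to int32 and adding
# codebook_size (2**16) to the negative entries recovers the original index,
# mirroring `codes_i = codes_i.to(tl.int32); codes_i = codes_i + (codes_i < 0) * codebook_size`.
import torch

codebook_size = 2**16
true_codes = torch.tensor([0, 1, 32767, 32768, 65535])    # unsigned indices
stored_codes = true_codes.to(torch.int16)                 # 32768 and 65535 wrap to negatives
recovered = stored_codes.to(torch.int32)
recovered = recovered + (recovered < 0).to(torch.int32) * codebook_size
assert torch.equal(recovered, true_codes.to(torch.int32))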