carsonhxsu committed
Commit 70750a1
1 Parent(s): 03bfeeb

[NewFeature] Support inference of LLaMA (7B/13B) using int8 quantization

Files changed (3)
  1. README.md +8 -5
  2. lyra_llama/lyra_llama.py +21 -19
  3. lyra_llama/model.py +28 -18
README.md CHANGED
@@ -23,21 +23,23 @@ We use the LLaMA.13B model for measurement, but this optimized inference is appl
 
 * Evaluated at tokens/s
 * test on A100 40G
- * fp16 precision
+ * fp16 and int8 precision
 
 ### LLaMA-Ziya-13B
 
 | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
 | --- | --- | --- | --- | --- | --- |
 | Torch LLaMA | 31.74 | 289.2 | 521.37 | 775.69 | OOM |
- | lyraLLaMA | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |
+ | lyraLLaMA fp16 | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |
+ | lyraLLaMA int8 | 104 | 770.5 | 1389.9 | 2390.4 | 3782.1 |
 
 ### LLaMA-Vicuna-13B
 
 | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
 | --- | --- | --- | --- | --- | --- |
 | Torch LLaMA | 24.65 | 167.3 | 322.97 | 407.99 | OOM |
- | lyraLLaMA | 53.67 | 421.38 | 804.31 | 1519.28 | 2679.82 |
+ | lyraLLaMA fp16 | 53.67 | 421.38 | 804.31 | 1519.28 | 2679.82 |
+ | lyraLLaMA int8 | 138.48 | 993.22 | 1741 | 2816.81 | 4146.52 |
 
 ## Docker Environment Recommendation
 
@@ -62,8 +64,9 @@ tokenizer_path = "./models/"
 dtype='fp16'
 prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服 裤子鞋子搭配"
 max_output_length = 512
+ int8_mode = 0 # To use int8 mode, set int8_mode=1
 
- model = lyraLLaMA(model_path, tokenizer_path, dtype)
+ model = lyraLLaMA(model_path, tokenizer_path, dtype, int8_mode)
 
 prompt = '<human>:' + prompt.strip() + '\n<bot>:'
 
@@ -105,7 +108,7 @@ Outputs:
 3. Support Vector Machines (SVMs): SVMs are a type of supervised learning algorithm that can be used for both classification and regression tasks. They work by finding the best hyperplane that separates the data into different classes. SVMs are commonly used in applications such as image classification and natural language processing.
 
 ## TODO
- 1. Support for int8 and int4
+ 1. Support for int4
 2. Inference for longer context situations
 3. Streaming inference mode.
lyra_llama/lyra_llama.py CHANGED
@@ -3,21 +3,23 @@ from __future__ import annotations
 import configparser
 import pathlib
 import typing
+ import os
 
 import torch
 import transformers
 from torch.nn.utils.rnn import pad_sequence
 
 from .config import LYRA_LLAMA_PARAM, LIB_SO_PATH
- from .model import LLaMAModel
+ from .model import LlamaModel
 
- class lyraLLaMA:
+
+ class lyraLlama:
     def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0) -> None:
         self.model_path = model_path
         self.tokenizer_path = tokenizer_path
         self.dtype = dtype
-         if dtype != 'int8':
-             int8_mode = 0
+         # if dtype != 'int8':
+         #     int8_mode = 0
         self.int8_mode = int8_mode
 
         self.model, self.tokenizer = self.load_model_and_tokenizer()
@@ -32,7 +34,7 @@ class lyraLLaMA:
         print(f'Loading tokenizer from {tokenizer_path}')
         tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
 
-         checkpoint_path = pathlib.Path(self.tokenizer_path)
+         checkpoint_path = pathlib.Path(self.model_path)
         config_path = checkpoint_path / 'config.ini'
 
         if config_path.exists():
@@ -46,15 +48,15 @@ class lyraLLaMA:
             model_args = dict(
                 head_num=cfg.getint(model_name, 'head_num'),
                 size_per_head=cfg.getint(model_name, "size_per_head"),
+                 inter_size=cfg.getint(model_name, 'inter_size'),
                 layer_num=cfg.getint(model_name, "num_layer"),
-                 tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
+                 rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
+                 layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                 vocab_size=cfg.getint(model_name, "vocab_size"),
                 start_id=cfg.getint(model_name, "start_id"),
                 end_id=cfg.getint(model_name, "end_id"),
                 weights_data_type=cfg.get(model_name, "weight_data_type"),
-                 layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
-                 rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
-                 inter_size=cfg.getint(model_name, 'inter_size'),
+                 tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                 inference_data_type=inference_data_type)
         else:
             inference_data_type = self.dtype
@@ -62,28 +64,29 @@ class lyraLLaMA:
             inference_data_type = LYRA_LLAMA_PARAM.weights_data_type
             model_args = dict(head_num=LYRA_LLAMA_PARAM.num_heads,
                               size_per_head=LYRA_LLAMA_PARAM.size_per_head,
+                               inter_size=LYRA_LLAMA_PARAM.inter_size,
+                               layer_num=LYRA_LLAMA_PARAM.num_layers,
+                               rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
+                               layernorm_eps=LYRA_LLAMA_PARAM.layernorm_eps,
                               vocab_size=LYRA_LLAMA_PARAM.vocab_size,
                               start_id=LYRA_LLAMA_PARAM.start_id or tokenizer.bos_token_id,
                               end_id=LYRA_LLAMA_PARAM.end_id or tokenizer.eos_token_id,
-                               layer_num=LYRA_LLAMA_PARAM.num_layers,
-                               tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                               weights_data_type=LYRA_LLAMA_PARAM.weights_data_type,
-                               layernorm_eps=LYRA_LLAMA_PARAM.layernorm_eps,
-                               rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
-                               inter_size=LYRA_LLAMA_PARAM.inter_size,
+                               tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                               inference_data_type=inference_data_type)
 
         # update common parameters
         model_args.update(dict(
             lib_path=LIB_SO_PATH,
-             model_path=self.model_path,
+             model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
             max_seq_len=0, # for position seq embedding
             pipeline_para_size=LYRA_LLAMA_PARAM.pipeline_para_size,
             use_gptj_residual=LYRA_LLAMA_PARAM.use_gptj_residual,
+             int8_mode=self.int8_mode
             # shared_contexts_ratio=LYRA_LLAMA_PARAM.shared_contexts_ratio,
         ))
 
-         print('[FT][INFO] Load Our FT Highly Optimized ChatGLM6B model')
+         print('[FT][INFO] Load Our FT Highly Optimized LLaMA model')
         for k, v in model_args.items():
             print(f' - {k.ljust(25, ".")}: {v}')
 
@@ -101,9 +104,8 @@ class lyraLLaMA:
             print('[FT][WARNING] Given end_id is not matched with neither pad '
                   'token id nor eos token id of the pretrained tokenizer.')
 
-         print(f'Loading tokenizer from {self.model_path}')
-         model = LLaMAModel(**model_args)
-
+         print(f'Loading model from {self.model_path}')
+         model = LlamaModel(**model_args)
         return model, tokenizer
 
     def generate(self, prompts: typing.List[str] | str,
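As a reading aid for the config-driven branch above, the sketch below reconstructs a config.ini shaped like the one `load_model_and_tokenizer()` now reads from `model_path`. Only the option names mirror the `cfg.get*` calls in this diff; the `[llama]` section name and every value are illustrative, roughly 13B-sized assumptions rather than values taken from the repository.

```python
# Hedged sketch: a config.ini shaped like the one the loader above reads.
# The [llama] section name and all values are assumptions (roughly 13B-sized);
# only the option names come from the cfg.get* calls in this diff.
import configparser

EXAMPLE_INI = """
[llama]
head_num = 40
size_per_head = 128
inter_size = 13824
num_layer = 40
rotary_embedding = 128
layernorm_eps = 1e-6
vocab_size = 32000
start_id = 1
end_id = 2
weight_data_type = fp16
tensor_para_size = 1
"""

cfg = configparser.ConfigParser()
cfg.read_string(EXAMPLE_INI)

# Mirror a few of the loader's accesses to confirm the names and types line up.
print(cfg.getint("llama", "head_num"),
      cfg.getfloat("llama", "layernorm_eps"),
      cfg.get("llama", "weight_data_type"))
```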
lyra_llama/model.py CHANGED
@@ -16,24 +16,34 @@ from __future__ import print_function
 
 import copy
 import os
+ import pathlib
+ import typing
 
 import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 
- class LLaMAModel(nn.Module):
+ str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+
+ class LlamaModel(nn.Module):
     def __init__(self,
-                  head_num, size_per_head, inter_size,
-                  vocab_size, rotary_embedding_dim,
+                  head_num,
+                  size_per_head,
+                  inter_size,
+                  vocab_size,
+                  rotary_embedding_dim,
                  start_id, end_id, layer_num,
-                  max_seq_len, layernorm_eps,
-                  tensor_para_size, pipeline_para_size,
+                  max_seq_len: int,
+                  layernorm_eps,
+                  tensor_para_size: int,
+                  pipeline_para_size: int,
                  use_gptj_residual,
-                  lib_path,
+                  lib_path: typing.Union[str, pathlib.Path],
                  model_path,
+                  int8_mode: int = 0,
                  inference_data_type: str = "fp16",
-                  weights_data_type: np.dtype = np.float16):
+                  weights_data_type: typing.Union[str, np.dtype] = np.float32):
         super().__init__()
         self.head_num = head_num
         self.size_per_head = size_per_head
@@ -46,7 +56,9 @@ class LLaMAModel(nn.Module):
         self.layer_num = layer_num
         self.use_gptj_residual = use_gptj_residual
         self.layernorm_eps = layernorm_eps
+         self.int8_mode = int8_mode
 
+         # multi-gpu params
         self.tensor_para_size = tensor_para_size
         self.pipeline_para_size = pipeline_para_size
         self.build_model = False
@@ -79,23 +91,22 @@ class LLaMAModel(nn.Module):
         self.pipeline_para_rank = self.rank // self.tensor_para_size
 
         self.model = torch.classes.FasterTransformer.LlamaOp(
-             self.head_num,
-             self.size_per_head,
-             self.inter_size,
+             self.head_num, self.size_per_head, self.inter_size,
             self.layer_num,
             self.vocab_size,
             self.rotary_embedding_dim,
             self.layernorm_eps,
-             self.start_id,
-             self.end_id,
-             self.tensor_para_size,
-             self.pipeline_para_size,
+             self.start_id, self.end_id,
+             self.tensor_para_size, self.pipeline_para_size,
             self.max_seq_len,
             self.use_gptj_residual,
+             self.int8_mode,
             model_path,
-             inference_data_type)
-
+             self.weights_data_type,
+             self.inference_data_type)
+
         self.build_model = True
+         torch.cuda.empty_cache()
 
     def forward(self,
                 start_ids: torch.Tensor,
@@ -111,8 +122,7 @@ class LLaMAModel(nn.Module):
                 random_seed: torch.Tensor = None,
                 return_output_length=False,
                 return_cum_log_probs=0):
-         if not self.build_model:
-             self.cuda()
+
         input_len = start_ids.size(1)
         assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
 
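To make the widened constructor signature concrete, here is a hedged instantiation sketch. The parameter names and the `int8_mode` flag come from this diff; every numeric value is an illustrative LLaMA-13B-style setting, the `lib_path`/`model_path` strings are placeholders, and actually running it requires the compiled FasterTransformer extension that `lib_path` points to.

```python
# Hedged sketch: constructing LlamaModel directly with the new int8_mode flag.
# All sizes are illustrative (LLaMA-13B-like); paths are placeholders.
import numpy as np
from lyra_llama.model import LlamaModel

model = LlamaModel(
    head_num=40,
    size_per_head=128,
    inter_size=13824,
    vocab_size=32000,
    rotary_embedding_dim=128,
    start_id=1,                     # bos token id
    end_id=2,                       # eos token id
    layer_num=40,
    max_seq_len=0,                  # the loader passes 0 "for position seq embedding"
    layernorm_eps=1e-6,
    tensor_para_size=1,
    pipeline_para_size=1,
    use_gptj_residual=False,
    lib_path="/path/to/libth_transformer.so",   # placeholder path to the FT extension
    model_path="./models/1-gpu-fp16.bin",       # converted weight file, as in lyra_llama.py
    int8_mode=1,                                # 1 enables int8 weight quantization
    inference_data_type="fp16",
    weights_data_type=np.float16,
)
```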