carsonhxsu committed
Commit 70750a1
1 Parent(s): 03bfeeb

[NewFeature] Support inference of LLaMA (7B/13B) using int8 quantization

Files changed (3)
  1. README.md +8 -5
  2. lyra_llama/lyra_llama.py +21 -19
  3. lyra_llama/model.py +28 -18
README.md CHANGED
@@ -23,21 +23,23 @@ We use the LLaMA.13B model for measurement, but this optimized inference is appl
 
 * Evaluated at tokens/s
 * test on A100 40G
- * fp16 precision
+ * fp16 and int8 precision
 
 ### LLaMA-Ziya-13B
 
 | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
 | --- | --- | --- | --- | --- | --- |
 | Torch LLaMA | 31.74 | 289.2 | 521.37 | 775.69 | OOM |
- | lyraLLaMA | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |
+ | lyraLLaMA fp16 | 73.2 | 565.6 | 1179.59 | 1795.63 | 3061.27 |
+ | lyraLLaMA int8 | 104 | 770.5 | 1389.9 | 2390.4 | 3782.1 |
 
 ### LLaMA-Vicuna-13B
 
 | Version | Batch Size 1 | Batch Size 8 | Batch Size 16 | Batch Size 32 | Batch Size 64 |
 | --- | --- | --- | --- | --- | --- |
 | Torch LLaMA | 24.65 | 167.3 | 322.97 | 407.99 | OOM |
- | lyraLLaMA | 53.67 | 421.38 | 804.31 | 1519.28 | 2679.82 |
+ | lyraLLaMA fp16 | 53.67 | 421.38 | 804.31 | 1519.28 | 2679.82 |
+ | lyraLLaMA int8 | 138.48 | 993.22 | 1741 | 2816.81 | 4146.52 |
 
 ## Docker Environment Recommendation
 
@@ -62,8 +64,9 @@ tokenizer_path = "./models/"
 dtype='fp16'
 prompt = "今天天气大概 25度,有点小雨,吹着风,我想去户外散步,应该穿什么样的衣服 裤子鞋子搭配"
 max_output_length = 512
+ int8_mode = 0 # To use int8 mode, set int8_mode=1
 
- model = lyraLLaMA(model_path, tokenizer_path, dtype)
+ model = lyraLLaMA(model_path, tokenizer_path, dtype, int8_mode)
 
 prompt = '<human>:' + prompt.strip() + '\n<bot>:'
 
@@ -105,7 +108,7 @@ Outputs:
 3. Support Vector Machines (SVMs): SVMs are a type of supervised learning algorithm that can be used for both classification and regression tasks. They work by finding the best hyperplane that separates the data into different classes. SVMs are commonly used in applications such as image classification and natural language processing.
 
 ## TODO
- 1. Support for int8 and int4
+ 1. Support for int4
 2. Inference for longer context situations
 3. Streaming inference mode.
lyra_llama/lyra_llama.py CHANGED
@@ -3,21 +3,23 @@ from __future__ import annotations
 import configparser
 import pathlib
 import typing
+ import os
 
 import torch
 import transformers
 from torch.nn.utils.rnn import pad_sequence
 
 from .config import LYRA_LLAMA_PARAM, LIB_SO_PATH
- from .model import LLaMAModel
+ from .model import LlamaModel
 
- class lyraLLaMA:
+
+ class lyraLlama:
     def __init__(self, model_path, tokenizer_path=None, dtype='fp16', int8_mode=0) -> None:
         self.model_path = model_path
         self.tokenizer_path = tokenizer_path
         self.dtype = dtype
-         if dtype != 'int8':
-             int8_mode = 0
+         # if dtype != 'int8':
+         #     int8_mode = 0
         self.int8_mode = int8_mode
 
         self.model, self.tokenizer = self.load_model_and_tokenizer()
@@ -32,7 +34,7 @@ class lyraLLaMA:
         print(f'Loading tokenizer from {tokenizer_path}')
         tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_path)
 
-         checkpoint_path = pathlib.Path(self.tokenizer_path)
+         checkpoint_path = pathlib.Path(self.model_path)
         config_path = checkpoint_path / 'config.ini'
 
         if config_path.exists():
@@ -46,15 +48,15 @@ class lyraLLaMA:
             model_args = dict(
                 head_num=cfg.getint(model_name, 'head_num'),
                 size_per_head=cfg.getint(model_name, "size_per_head"),
+                 inter_size=cfg.getint(model_name, 'inter_size'),
                 layer_num=cfg.getint(model_name, "num_layer"),
-                 tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
+                 rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
+                 layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
                 vocab_size=cfg.getint(model_name, "vocab_size"),
                 start_id=cfg.getint(model_name, "start_id"),
                 end_id=cfg.getint(model_name, "end_id"),
                 weights_data_type=cfg.get(model_name, "weight_data_type"),
-                 layernorm_eps=cfg.getfloat(model_name, 'layernorm_eps'),
-                 rotary_embedding_dim=cfg.getint(model_name, 'rotary_embedding'),
-                 inter_size=cfg.getint(model_name, 'inter_size'),
+                 tensor_para_size=cfg.getint(model_name, "tensor_para_size"),
                 inference_data_type=inference_data_type)
         else:
             inference_data_type = self.dtype
@@ -62,28 +64,29 @@ class lyraLLaMA:
             inference_data_type = LYRA_LLAMA_PARAM.weights_data_type
             model_args = dict(head_num=LYRA_LLAMA_PARAM.num_heads,
                               size_per_head=LYRA_LLAMA_PARAM.size_per_head,
+                               inter_size=LYRA_LLAMA_PARAM.inter_size,
+                               layer_num=LYRA_LLAMA_PARAM.num_layers,
+                               rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
+                               layernorm_eps=LYRA_LLAMA_PARAM.layernorm_eps,
                               vocab_size=LYRA_LLAMA_PARAM.vocab_size,
                               start_id=LYRA_LLAMA_PARAM.start_id or tokenizer.bos_token_id,
                               end_id=LYRA_LLAMA_PARAM.end_id or tokenizer.eos_token_id,
-                               layer_num=LYRA_LLAMA_PARAM.num_layers,
-                               tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                               weights_data_type=LYRA_LLAMA_PARAM.weights_data_type,
-                               layernorm_eps=LYRA_LLAMA_PARAM.layernorm_eps,
-                               rotary_embedding_dim=LYRA_LLAMA_PARAM.rotary_embedding,
-                               inter_size=LYRA_LLAMA_PARAM.inter_size,
+                               tensor_para_size=LYRA_LLAMA_PARAM.tensor_para_size,
                               inference_data_type=inference_data_type)
 
         # update common parameters
         model_args.update(dict(
             lib_path=LIB_SO_PATH,
-             model_path=self.model_path,
+             model_path=os.path.join(self.model_path, "1-gpu-fp16.bin"),
             max_seq_len=0, # for position seq embedding
             pipeline_para_size=LYRA_LLAMA_PARAM.pipeline_para_size,
             use_gptj_residual=LYRA_LLAMA_PARAM.use_gptj_residual,
+             int8_mode=self.int8_mode
             # shared_contexts_ratio=LYRA_LLAMA_PARAM.shared_contexts_ratio,
         ))
 
-         print('[FT][INFO] Load Our FT Highly Optimized ChatGLM6B model')
+         print('[FT][INFO] Load Our FT Highly Optimized LLaMA model')
         for k, v in model_args.items():
             print(f' - {k.ljust(25, ".")}: {v}')
 
@@ -101,9 +104,8 @@ class lyraLLaMA:
             print('[FT][WARNING] Given end_id is not matched with neither pad '
                   'token id nor eos token id of the pretrained tokenizer.')
 
-         print(f'Loading tokenizer from {self.model_path}')
-         model = LLaMAModel(**model_args)
-
+         print(f'Loading model from {self.model_path}')
+         model = LlamaModel(**model_args)
         return model, tokenizer
 
     def generate(self, prompts: typing.List[str] | str,
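As a reading aid for the config-driven branch above, the sketch below reconstructs a config.ini shaped like the one `load_model_and_tokenizer()` now reads from `model_path`. Only the option names mirror the `cfg.get*` calls in this diff; the `[llama]` section name and every value are illustrative, roughly 13B-sized assumptions rather than values taken from the repository.

```python
# Hedged sketch: a config.ini shaped like the one the loader above reads.
# The [llama] section name and all values are assumptions (roughly 13B-sized);
# only the option names come from the cfg.get* calls in this diff.
import configparser

EXAMPLE_INI = """
[llama]
head_num = 40
size_per_head = 128
inter_size = 13824
num_layer = 40
rotary_embedding = 128
layernorm_eps = 1e-6
vocab_size = 32000
start_id = 1
end_id = 2
weight_data_type = fp16
tensor_para_size = 1
"""

cfg = configparser.ConfigParser()
cfg.read_string(EXAMPLE_INI)

# Mirror a few of the loader's accesses to confirm the names and types line up.
print(cfg.getint("llama", "head_num"),
      cfg.getfloat("llama", "layernorm_eps"),
      cfg.get("llama", "weight_data_type"))
```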
lyra_llama/model.py CHANGED
@@ -16,24 +16,34 @@ from __future__ import print_function
 
 import copy
 import os
+ import pathlib
+ import typing
 
 import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
 
- class LLaMAModel(nn.Module):
+ str_type_map = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+
+ class LlamaModel(nn.Module):
     def __init__(self,
-                  head_num, size_per_head, inter_size,
-                  vocab_size, rotary_embedding_dim,
+                  head_num,
+                  size_per_head,
+                  inter_size,
+                  vocab_size,
+                  rotary_embedding_dim,
                  start_id, end_id, layer_num,
-                  max_seq_len, layernorm_eps,
-                  tensor_para_size, pipeline_para_size,
+                  max_seq_len: int,
+                  layernorm_eps,
+                  tensor_para_size: int,
+                  pipeline_para_size: int,
                  use_gptj_residual,
-                  lib_path,
+                  lib_path: typing.Union[str, pathlib.Path],
                  model_path,
+                  int8_mode: int = 0,
                  inference_data_type: str = "fp16",
-                  weights_data_type: np.dtype = np.float16):
+                  weights_data_type: typing.Union[str, np.dtype] = np.float32):
         super().__init__()
         self.head_num = head_num
         self.size_per_head = size_per_head
@@ -46,7 +56,9 @@ class LLaMAModel(nn.Module):
         self.layer_num = layer_num
         self.use_gptj_residual = use_gptj_residual
         self.layernorm_eps = layernorm_eps
+         self.int8_mode = int8_mode
 
+         # multi-gpu params
         self.tensor_para_size = tensor_para_size
         self.pipeline_para_size = pipeline_para_size
         self.build_model = False
@@ -79,23 +91,22 @@ class LLaMAModel(nn.Module):
         self.pipeline_para_rank = self.rank // self.tensor_para_size
 
         self.model = torch.classes.FasterTransformer.LlamaOp(
-             self.head_num,
-             self.size_per_head,
-             self.inter_size,
+             self.head_num, self.size_per_head, self.inter_size,
             self.layer_num,
             self.vocab_size,
             self.rotary_embedding_dim,
             self.layernorm_eps,
-             self.start_id,
-             self.end_id,
-             self.tensor_para_size,
-             self.pipeline_para_size,
+             self.start_id, self.end_id,
+             self.tensor_para_size, self.pipeline_para_size,
             self.max_seq_len,
             self.use_gptj_residual,
+             self.int8_mode,
             model_path,
-             inference_data_type)
-
+             self.weights_data_type,
+             self.inference_data_type)
+
         self.build_model = True
+         torch.cuda.empty_cache()
 
     def forward(self,
                 start_ids: torch.Tensor,
@@ -111,8 +122,7 @@ class LLaMAModel(nn.Module):
                 random_seed: torch.Tensor = None,
                 return_output_length=False,
                 return_cum_log_probs=0):
-         if not self.build_model:
-             self.cuda()
+
         input_len = start_ids.size(1)
         assert input_len > 0, "input len must be larger than zero. For an unconditional case, use start_id as the first token."
 
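To make the widened constructor signature concrete, here is a hedged instantiation sketch. The parameter names and the `int8_mode` flag come from this diff; every numeric value is an illustrative LLaMA-13B-style setting, the `lib_path`/`model_path` strings are placeholders, and actually running it requires the compiled FasterTransformer extension that `lib_path` points to.

```python
# Hedged sketch: constructing LlamaModel directly with the new int8_mode flag.
# All sizes are illustrative (LLaMA-13B-like); paths are placeholders.
import numpy as np
from lyra_llama.model import LlamaModel

model = LlamaModel(
    head_num=40,
    size_per_head=128,
    inter_size=13824,
    vocab_size=32000,
    rotary_embedding_dim=128,
    start_id=1,                     # bos token id
    end_id=2,                       # eos token id
    layer_num=40,
    max_seq_len=0,                  # the loader passes 0 "for position seq embedding"
    layernorm_eps=1e-6,
    tensor_para_size=1,
    pipeline_para_size=1,
    use_gptj_residual=False,
    lib_path="/path/to/libth_transformer.so",   # placeholder path to the FT extension
    model_path="./models/1-gpu-fp16.bin",       # converted weight file, as in lyra_llama.py
    int8_mode=1,                                # 1 enables int8 weight quantization
    inference_data_type="fp16",
    weights_data_type=np.float16,
)
```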