irexyc committed on
Commit 536b93f
1 Parent(s): 4c03188

lmdeploy managed

Files changed (3)
  1. config.json +37 -34
  2. configuration_lmdeploy.py +35 -0
  3. modeling_lmdeploy.py +224 -0
config.json CHANGED
@@ -1,36 +1,39 @@
  {
-   "architectures": [
-     "InternLMForCausalLM"
-   ],
-   "auto_map": {
-     "AutoConfig": "configuration_internlm.InternLMConfig",
-     "AutoModel": "modeling_internlm.InternLMForCausalLM",
-     "AutoModelForCausalLM": "modeling_internlm.InternLMForCausalLM"
-   },
-   "bias": false,
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 5120,
-   "initializer_range": 0.02,
-   "intermediate_size": 13824,
-   "max_position_embeddings": 2048,
-   "model_type": "internlm",
-   "num_attention_heads": 40,
-   "num_hidden_layers": 60,
-   "num_key_value_heads": 40,
-   "pad_token_id": 2,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-06,
-   "rope_scaling": null,
-   "rope_theta": 10000.0,
-   "tie_word_embeddings": false,
-   "torch_dtype": "float16",
-   "transformers_version": "4.33.1",
-   "use_cache": false,
-   "vocab_size": 103168,
-   "rotary": {
-     "base": 10000,
-     "type": "dynamic"
-   }
+   "architectures": [
+     "LmdeployForCausalLM"
+   ],
+   "auto_map": {
+     "AutoConfig": "configuration_lmdeploy.LmdeployConfig",
+     "AutoModel": "modeling_lmdeploy.LmdeployForCausalLM",
+     "AutoModelForCausalLM": "modeling_lmdeploy.LmdeployForCausalLM"
+   },
+   "turbomind": {
+     "model_name": "internlm-chat-20b",
+     "tensor_para_size": 1,
+     "head_num": 40,
+     "kv_head_num": 40,
+     "vocab_size": 103168,
+     "num_layer": 60,
+     "inter_size": 13824,
+     "norm_eps": 1e-06,
+     "attn_bias": 0,
+     "start_id": 1,
+     "end_id": 2,
+     "session_len": 8200,
+     "weight_type": "int4",
+     "rotary_embedding": 128,
+     "rope_theta": 10000.0,
+     "size_per_head": 128,
+     "group_size": 128,
+     "max_batch_size": 32,
+     "max_context_token_num": 4,
+     "step_length": 1,
+     "cache_max_entry_count": 48,
+     "cache_chunk_size": 1,
+     "use_context_fmha": 1,
+     "quant_policy": 0,
+     "max_position_embeddings": 2048,
+     "use_dynamic_ntk": 0,
+     "use_logn_attn": 0
+   }
  }
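
With auto_map now pointing at the lmdeploy classes, the repo is intended to be loaded through transformers' trust_remote_code mechanism rather than as a plain InternLM checkpoint. A minimal sketch, assuming lmdeploy is installed and using a hypothetical local path to this repository:

from transformers import AutoModelForCausalLM

# trust_remote_code makes transformers resolve the auto_map entries above,
# i.e. configuration_lmdeploy.LmdeployConfig and
# modeling_lmdeploy.LmdeployForCausalLM from this repo.
model = AutoModelForCausalLM.from_pretrained(
    './internlm-chat-20b-4bit',  # hypothetical local path to this repo
    trust_remote_code=True,
)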
configuration_lmdeploy.py ADDED
@@ -0,0 +1,35 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import copy
+
+ from transformers import PretrainedConfig
+
+ from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig
+ from lmdeploy.version import __version__ as lm_version
+
+
+ class LmdeployConfig(PretrainedConfig):
+
+     def __init__(self, turbomind: dict = None, **kwargs):
+         default_tm_cfg = copy.deepcopy(
+             TurbomindModelConfig.from_dict({}, allow_none=True).__dict__)
+         if turbomind is not None:
+             default_tm_cfg.update(turbomind)
+         self.turbomind = default_tm_cfg
+         self.lmdeploy_version = lm_version
+         super().__init__(**kwargs)
+
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+         config, kwargs = super().from_pretrained(pretrained_model_name_or_path,
+                                                  return_unused_kwargs=True,
+                                                  **kwargs)
+         for k, v in kwargs.items():
+             if k in config.turbomind.keys():
+                 config.turbomind[k] = v
+         if 'tp' in kwargs:
+             config.turbomind['tensor_para_size'] = kwargs['tp']
+         if return_unused_kwargs:
+             return config, kwargs
+         else:
+             return config
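
The from_pretrained override above forwards unused keyword arguments into the nested turbomind dict and maps a tp argument onto tensor_para_size. A small sketch of that behaviour, assuming the script runs from this repository with lmdeploy installed (the path is hypothetical):

from configuration_lmdeploy import LmdeployConfig

cfg = LmdeployConfig.from_pretrained(
    './internlm-chat-20b-4bit',  # hypothetical local path
    max_batch_size=64,  # matches a key in cfg.turbomind, so it is copied there
    tp=2,               # mapped onto cfg.turbomind['tensor_para_size']
)
print(cfg.turbomind['max_batch_size'], cfg.turbomind['tensor_para_size'])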
modeling_lmdeploy.py ADDED
@@ -0,0 +1,224 @@
+ # Copyright (c) OpenMMLab. All rights reserved.
+ import dataclasses
+ import os
+ from contextlib import contextmanager
+ from dataclasses import dataclass, field
+ from itertools import count
+ from queue import Queue
+ from typing import List, Optional, Tuple, Union
+
+ from transformers import PretrainedConfig
+ from transformers.modeling_utils import PreTrainedModel
+ from transformers.utils import logging
+
+ from lmdeploy.turbomind import TurboMind
+ from lmdeploy.turbomind.utils import download_hf_repo, get_gen_param
+
+ from .configuration_lmdeploy import LmdeployConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ @dataclass
+ class Session:
+     _count = count()
+     _session_id: int = None
+     _message: List[Tuple[str, str]] = field(default_factory=list)
+     _step: int = 0
+     _nth_round: int = 0
+     _error: int = 0
+
+     def __init__(self):
+         self._session_id = next(Session._count)
+         self._message = []
+         self._step = 0
+         self._nth_round = 0
+
+     @property
+     def session_id(self):
+         return self._session_id
+
+     @property
+     def message(self):
+         return self._message
+
+     @property
+     def step(self):
+         return self._step
+
+     @property
+     def nth_round(self):
+         return self._nth_round
+
+     @property
+     def error(self):
+         return self._error
+
+
+ class LmdeployForCausalLM(PreTrainedModel):
+     config_class = LmdeployConfig
+
+     def __init__(self,
+                  config: LmdeployConfig,
+                  *inputs,
+                  model_path: str = None,
+                  **kwargs):
+         super().__init__(config)
+         self.tm_model = TurboMind.from_pretrained(model_path, **kwargs)
+         que = Queue()
+         for _ in range(config.turbomind['max_batch_size']):
+             que.put(self.tm_model.create_instance())
+         self.que = que
+
+     @classmethod
+     def from_pretrained(cls,
+                         pretrained_model_name_or_path,
+                         *model_args,
+                         config: Optional[Union[PretrainedConfig, str,
+                                                os.PathLike]] = None,
+                         cache_dir: Optional[Union[str, os.PathLike]] = None,
+                         force_download: bool = False,
+                         local_files_only: bool = False,
+                         token: Optional[Union[str, bool]] = None,
+                         revision: str = 'main',
+                         **kwargs):
+         """Instantiate an LM model with the turbomind backend."""
+
+         resume_download = kwargs.pop('resume_download', True)
+         proxies = kwargs.pop('proxies', None)
+
+         if os.path.isdir(pretrained_model_name_or_path):
+             local_folder = pretrained_model_name_or_path
+         else:
+             local_folder = download_hf_repo(
+                 pretrained_model_name_or_path,
+                 revision=revision,
+                 cache_dir=cache_dir,
+                 proxies=proxies,
+                 resume_download=resume_download,
+                 force_download=force_download,
+                 token=token,
+                 local_files_only=local_files_only,
+             )
+
+         if not isinstance(config, PretrainedConfig):
+             config_path = config if config is not None else local_folder
+             kwargs.pop('return_unused_kwargs', None)
+             config, model_kwargs = cls.config_class.from_pretrained(
+                 config_path, return_unused_kwargs=True, **kwargs)
+         else:
+             model_kwargs = kwargs
+
+         model = cls(config,
+                     *model_args,
+                     model_path=local_folder,
+                     **model_kwargs)
+
+         generation_config = model.tm_model.model.sampling_param
+         for k, v in dataclasses.asdict(generation_config).items():
+             if hasattr(model.generation_config, k):
+                 base_value = getattr(model.generation_config, k)
+                 setattr(generation_config, k, base_value)
+             if k in kwargs:
+                 setattr(generation_config, k, v)
+         model.generation_config = generation_config
+
+         return model
+
+     @contextmanager
+     def managed_generator(self, session: Session):
+         generator = self.que.get()
+         try:
+             yield generator
+         except:  # noqa E722
+             for _ in generator.stream_infer(session.session_id, [0],
+                                             request_output_len=0,
+                                             sequence_start=False,
+                                             sequence_end=False,
+                                             stop=True):
+                 pass
+         finally:
+             self.que.put(generator)
+
+     def generate(
+         self,
+         input_ids: List[int],
+         session: Session,
+         **kwargs,
+     ):
+         """Generates sequences of token ids for models with a language modeling
+         head.
+
+         Args:
+             input_ids (List[int]): list of input token ids
+             session (Session): session information
+             kwargs (dict): ad hoc parametrization of generation
+         """
+         with self.managed_generator(session) as generator:
+             for outputs in generator.stream_infer(
+                     session_id=session.session_id,
+                     input_ids=[input_ids],
+                     **kwargs,
+             ):
+                 res, tokens = outputs[0]
+                 yield res, tokens
+
+     def chat(
+         self,
+         query: str,
+         session: Optional[Session] = None,
+         cap: str = 'chat',
+         request_output_len: int = 512,
+         stream_output: bool = False,
+         ignore_eos=False,
+         random_seed: Optional[int] = None,
+         **kwargs,
+     ) -> Tuple[str, Session]:
+         """chat."""
+
+         if session is None:
+             session = Session()
+         assert session._error == 0, 'An error occurred before, ' \
+             'please start a new session.'
+
+         session._message.append([query, ''])
+
+         prompt = self.tm_model.model.get_prompt(query, session.nth_round == 0)
+         input_ids = self.tm_model.tokenizer.encode(prompt)
+
+         if len(
+                 input_ids
+         ) + session.step + request_output_len >= self.tm_model.session_len:
+             logger.error(
+                 f'session_length exceeded {self.tm_model.session_len}')
+             session._error = 1
+             yield '', session
+         else:
+             gen_param = get_gen_param(cap, self.generation_config,
+                                       session.nth_round + 1, session.step,
+                                       request_output_len, **kwargs)
+             gen_kwargs = dataclasses.asdict(gen_param)
+             gen_kwargs.update(
+                 random_seed=random_seed if session.nth_round == 0 else None,
+                 stream_output=stream_output,
+                 ignore_eos=ignore_eos,
+                 **kwargs)
+
+             _step = session._step
+             _nth_round = session._nth_round
+             response_size = 0
+
+             for res, tokens in self.generate(input_ids,
+                                              session=session,
+                                              **gen_kwargs):
+                 response = self.tm_model.tokenizer.decode(res.tolist(),
+                                                           offset=response_size)
+                 if response.endswith('�'):
+                     continue
+                 response_size = tokens
+
+                 session._message[-1][-1] += response
+                 session._nth_round = _nth_round + 1
+                 session._step = _step + response_size
+
+                 yield response, session
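
Taken together, LmdeployForCausalLM exposes a generator-style chat API in which a Session object carries the decoding offset and round counter across turns, and each call borrows a turbomind instance from the internal queue. A minimal usage sketch (the path and prompts are hypothetical; lmdeploy and the converted turbomind weights in this repo are required):

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    './internlm-chat-20b-4bit',  # hypothetical local path
    trust_remote_code=True)

session = None
for query in ['Hello!', 'Summarize our conversation so far.']:
    # chat() is a generator: with stream_output=True it yields incremental
    # chunks, with the default stream_output=False it typically yields the
    # complete reply once. The returned session is reused for the next round.
    for response, session in model.chat(query, session=session):
        print(response, end='', flush=True)
    print()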