lmdeploy managed

- config.json +37 -34
- configuration_lmdeploy.py +35 -0
- modeling_lmdeploy.py +224 -0

config.json CHANGED
@@ -1,36 +1,39 @@
{
  "architectures": [
    "LmdeployForCausalLM"
  ],
  "auto_map": {
    "AutoConfig": "configuration_lmdeploy.LmdeployConfig",
    "AutoModel": "modeling_lmdeploy.LmdeployForCausalLM",
    "AutoModelForCausalLM": "modeling_lmdeploy.LmdeployForCausalLM"
  },
  "turbomind": {
    "model_name": "internlm-chat-20b",
    "tensor_para_size": 1,
    "head_num": 40,
    "kv_head_num": 40,
    "vocab_size": 103168,
    "num_layer": 60,
    "inter_size": 13824,
    "norm_eps": 1e-06,
    "attn_bias": 0,
    "start_id": 1,
    "end_id": 2,
    "session_len": 8200,
    "weight_type": "int4",
    "rotary_embedding": 128,
    "rope_theta": 10000.0,
    "size_per_head": 128,
    "group_size": 128,
    "max_batch_size": 32,
    "max_context_token_num": 4,
    "step_length": 1,
    "cache_max_entry_count": 48,
    "cache_chunk_size": 1,
    "use_context_fmha": 1,
    "quant_policy": 0,
    "max_position_embeddings": 2048,
    "use_dynamic_ntk": 0,
    "use_logn_attn": 0
  }
}
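The "auto_map" block is what lets the stock transformers Auto classes dispatch to the custom classes added in this commit, and the "turbomind" block carries the engine configuration (the values above describe an int4-quantized internlm-chat-20b). A minimal loading sketch, assuming lmdeploy is installed and using a placeholder path for this repo; trust_remote_code=True is required so that transformers imports the bundled modules:

# Placeholder path: point this at a local clone or the hub id of this repo.
from transformers import AutoConfig, AutoModelForCausalLM

repo = 'path/to/this/repo'
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
print(config.turbomind['model_name'])  # -> internlm-chat-20b

model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)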
configuration_lmdeploy.py ADDED
@@ -0,0 +1,35 @@
# Copyright (c) OpenMMLab. All rights reserved.
import copy

from transformers import PretrainedConfig

from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig
from lmdeploy.version import __version__ as lm_version


class LmdeployConfig(PretrainedConfig):

    def __init__(self, turbomind: dict = None, **kwargs):
        # Start from the default TurbomindModelConfig fields and overlay the
        # values stored in config.json under the 'turbomind' key.
        default_tm_cfg = copy.deepcopy(
            TurbomindModelConfig.from_dict({}, allow_none=True).__dict__)
        if turbomind is not None:
            default_tm_cfg.update(turbomind)
        self.turbomind = default_tm_cfg
        self.lmdeploy_version = lm_version
        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
        config, kwargs = super().from_pretrained(pretrained_model_name_or_path,
                                                 return_unused_kwargs=True,
                                                 **kwargs)
        # Keyword arguments that name a turbomind field override the stored
        # value; 'tp' is accepted as a shorthand for tensor_para_size.
        for k, v in kwargs.items():
            if k in config.turbomind.keys():
                config.turbomind[k] = v
        if 'tp' in kwargs:
            config.turbomind['tensor_para_size'] = kwargs['tp']
        if return_unused_kwargs:
            return config, kwargs
        else:
            return config
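A short sketch of how the override path behaves, assuming the file is importable from a local clone of this repo and using a placeholder path; keyword arguments that match turbomind fields end up in config.turbomind, and tp maps to tensor_para_size:

from configuration_lmdeploy import LmdeployConfig

config = LmdeployConfig.from_pretrained('path/to/this/repo',
                                        max_batch_size=64,  # a turbomind field
                                        tp=2)  # shorthand for tensor parallelism
assert config.turbomind['max_batch_size'] == 64
assert config.turbomind['tensor_para_size'] == 2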
modeling_lmdeploy.py ADDED
@@ -0,0 +1,224 @@
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import count
from queue import Queue
from typing import List, Optional, Tuple, Union

from transformers import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging

from lmdeploy.turbomind import TurboMind
from lmdeploy.turbomind.utils import download_hf_repo, get_gen_param

from .configuration_lmdeploy import LmdeployConfig

logger = logging.get_logger(__name__)


@dataclass
class Session:
    """Tracks the state of a single conversation."""
    _count = count()
    _session_id: int = None
    _message: List[Tuple[str, str]] = field(default_factory=list)
    _step: int = 0
    _nth_round: int = 0
    _error: int = 0

    def __init__(self):
        self._session_id = next(Session._count)
        self._message = []
        self._step = 0
        self._nth_round = 0

    @property
    def session_id(self):
        return self._session_id

    @property
    def message(self):
        return self._message

    @property
    def step(self):
        return self._step

    @property
    def nth_round(self):
        return self._nth_round

    @property
    def error(self):
        return self._error


class LmdeployForCausalLM(PreTrainedModel):
    config_class = LmdeployConfig

    def __init__(self,
                 config: LmdeployConfig,
                 *inputs,
                 model_path: str = None,
                 **kwargs):
        super().__init__(config)
        self.tm_model = TurboMind.from_pretrained(model_path, **kwargs)
        # Pool of turbomind generator instances, one per concurrent request.
        que = Queue()
        for _ in range(config.turbomind['max_batch_size']):
            que.put(self.tm_model.create_instance())
        self.que = que

    @classmethod
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        *model_args,
                        config: Optional[Union[PretrainedConfig, str,
                                               os.PathLike]] = None,
                        cache_dir: Optional[Union[str, os.PathLike]] = None,
                        force_download: bool = False,
                        local_files_only: bool = False,
                        token: Optional[Union[str, bool]] = None,
                        revision: str = 'main',
                        **kwargs):
        """Instantiate a LM model with the turbomind backend."""

        resume_download = kwargs.pop('resume_download', True)
        proxies = kwargs.pop('proxies', None)

        if os.path.isdir(pretrained_model_name_or_path):
            local_folder = pretrained_model_name_or_path
        else:
            local_folder = download_hf_repo(
                pretrained_model_name_or_path,
                revision=revision,
                cache_dir=cache_dir,
                proxies=proxies,
                resume_download=resume_download,
                force_download=force_download,
                token=token,
                local_files_only=local_files_only,
            )

        if not isinstance(config, PretrainedConfig):
            config_path = config if config is not None else local_folder
            kwargs.pop('return_unused_kwargs', None)
            config, model_kwargs = cls.config_class.from_pretrained(
                config_path, return_unused_kwargs=True, **kwargs)
        else:
            model_kwargs = kwargs

        model = cls(config,
                    *model_args,
                    model_path=local_folder,
                    **model_kwargs)

        # Merge turbomind's sampling defaults with the model's generation config.
        generation_config = model.tm_model.model.sampling_param
        for k, v in dataclasses.asdict(generation_config).items():
            if hasattr(model.generation_config, k):
                base_value = getattr(model.generation_config, k)
                setattr(generation_config, k, base_value)
            if k in kwargs:
                setattr(generation_config, k, v)
        model.generation_config = generation_config

        return model

    @contextmanager
    def managed_generator(self, session: Session):
        generator = self.que.get()
        try:
            yield generator
        except:  # noqa E722
            # Cancel the interrupted request before handing the generator back.
            for _ in generator.stream_infer(session.session_id, [0],
                                            request_output_len=0,
                                            sequence_start=False,
                                            sequence_end=False,
                                            stop=True):
                pass
        finally:
            self.que.put(generator)

    def generate(
        self,
        input_ids: List[int],
        session: Session,
        **kwargs,
    ):
        """Generates sequences of token ids for models with a language
        modeling head.

        Args:
            input_ids (List[int]): list of input token ids
            session (Session): session information
            kwargs (dict): ad hoc parametrization of generation
        """
        with self.managed_generator(session) as generator:
            for outputs in generator.stream_infer(
                    session_id=session.session_id,
                    input_ids=[input_ids],
                    **kwargs,
            ):
                res, tokens = outputs[0]
                yield res, tokens

    def chat(
        self,
        query: str,
        session: Optional[Session] = None,
        cap: str = 'chat',
        request_output_len: int = 512,
        stream_output: bool = False,
        ignore_eos=False,
        random_seed: Optional[int] = None,
        **kwargs,
    ) -> Tuple[str, Session]:
        """Chat with the model for one round within the given session."""

        if session is None:
            session = Session()
        assert session._error == 0, 'An error occurred before, ' \
            'please start a new session.'

        session._message.append([query, ''])

        prompt = self.tm_model.model.get_prompt(query, session.nth_round == 0)
        input_ids = self.tm_model.tokenizer.encode(prompt)

        if len(
                input_ids
        ) + session.step + request_output_len >= self.tm_model.session_len:
            logger.error(
                f'session_length exceeded {self.tm_model.session_len}')
            session._error = 1
            yield '', session
        else:
            gen_param = get_gen_param(cap, self.generation_config,
                                      session.nth_round + 1, session.step,
                                      request_output_len, **kwargs)
            gen_kwargs = dataclasses.asdict(gen_param)
            gen_kwargs.update(
                random_seed=random_seed if session.nth_round == 0 else None,
                stream_output=stream_output,
                ignore_eos=ignore_eos,
                **kwargs)

            _step = session._step
            _nth_round = session._nth_round
            response_size = 0

            for res, tokens in self.generate(input_ids,
                                             session=session,
                                             **gen_kwargs):
                response = self.tm_model.tokenizer.decode(res.tolist(),
                                                          offset=response_size)
                # Skip chunks that end in an incomplete multi-byte character.
                if response.endswith('�'):
                    continue
                response_size = tokens

                session._message[-1][-1] += response
                session._nth_round = _nth_round + 1
                session._step = _step + response_size

                yield response, session
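A minimal interactive sketch of the API above, again with a placeholder repo path and assuming a working lmdeploy/turbomind installation; chat() is a generator that streams partial responses along with the updated Session, which must be passed back in for multi-turn context:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('path/to/this/repo',
                                             trust_remote_code=True)

session = None
for query in ('Hello!', 'Summarize what turbomind does.'):
    # Each yielded chunk is newly decoded text; session carries step/round state.
    for response, session in model.chat(query, session, stream_output=True):
        print(response, end='', flush=True)
    print()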