import json
from pathlib import Path
from typing import Optional

from transformers import AutoTokenizer, T5Tokenizer

import tensorrt_llm

DEFAULT_HF_MODEL_DIRS = {
    'baichuan': 'baichuan-inc/Baichuan-13B-Chat',
    'bloom': 'bigscience/bloom-560m',
    'chatglm_6b': 'THUDM/chatglm-6b',
    'chatglm2_6b': 'THUDM/chatglm2-6b',
    'chatglm2_6b_32k': 'THUDM/chatglm2-6b-32k',
    'chatglm3_6b': 'THUDM/chatglm3-6b',
    'chatglm3_6b_base': 'THUDM/chatglm3-6b-base',
    'chatglm3_6b_32k': 'THUDM/chatglm3-6b-32k',
    'falcon': 'tiiuae/falcon-rw-1b',
    'glm_10b': 'THUDM/glm-10b',
    'gpt': 'gpt2-medium',
    'gptj': 'EleutherAI/gpt-j-6b',
    'gptneox': 'EleutherAI/gpt-neox-20b',
    'internlm': 'internlm/internlm-chat-7b',
    'llama': 'meta-llama/Llama-2-7b-hf',
    'mpt': 'mosaicml/mpt-7b',
    'opt': 'facebook/opt-350m',
    'qwen': 'Qwen/Qwen-7B',
}

DEFAULT_PROMPT_TEMPLATES = {
    'internlm':
    "<|User|>:{input_text}<eoh>\n<|Bot|>:",
    'qwen':
    "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
}
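
# Each template is completed with str.format; `input_text` is the only
# placeholder. For example (illustrative input, not part of this module):
#   prompt = DEFAULT_PROMPT_TEMPLATES['qwen'].format(input_text='Hello!')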


def read_model_name(engine_dir: str):
    """Read the model name recorded in an engine directory's config.json."""
    engine_version = tensorrt_llm.builder.get_engine_version(engine_dir)

    with open(Path(engine_dir) / "config.json", 'r') as f:
        config = json.load(f)

    # Engines without a version field use the legacy config layout.
    if engine_version is None:
        return config['builder_config']['name']

    return config['pretrained_config']['architecture']


def throttle_generator(generator, stream_interval):
    # Yield every `stream_interval`-th output while iterating, then yield the
    # final output as well if the loop skipped it.
    for i, out in enumerate(generator):
        if not i % stream_interval:
            yield out

    if i % stream_interval:
        yield out
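
# Example: throttle_generator(range(10), 4) yields 0, 4, 8 while iterating,
# then yields 9 after the loop, since the final output would otherwise be
# dropped.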


def load_tokenizer(tokenizer_dir: Optional[str] = None,
                   vocab_file: Optional[str] = None,
                   model_name: str = 'gpt',
                   tokenizer_type: Optional[str] = None):
    if vocab_file is None:
        # Use the slow (SentencePiece) tokenizer for LLaMA.
        use_fast = tokenizer_type != "llama"
        # Pad and truncate on the left so generation continues directly
        # from the end of the prompt.
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
                                                  legacy=False,
                                                  padding_side='left',
                                                  truncation_side='left',
                                                  trust_remote_code=True,
                                                  tokenizer_type=tokenizer_type,
                                                  use_fast=use_fast)
    else:
        # Loading directly from a SentencePiece vocab file is only supported
        # for the GPT path.
        assert model_name == 'gpt'
        tokenizer = T5Tokenizer(vocab_file=vocab_file,
                                padding_side='left',
                                truncation_side='left')

    if model_name == 'qwen':
        # Qwen records its pad/end token ids in generation_config.json,
        # keyed by chat format.
        with open(Path(tokenizer_dir) / "generation_config.json") as f:
            gen_config = json.load(f)
        chat_format = gen_config['chat_format']
        if chat_format == 'raw':
            pad_id = gen_config['pad_token_id']
            end_id = gen_config['eos_token_id']
        elif chat_format == 'chatml':
            pad_id = tokenizer.im_end_id
            end_id = tokenizer.im_end_id
        else:
            raise Exception(f"unknown chat format: {chat_format}")
    elif model_name == 'glm_10b':
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eop_token_id
    else:
        if tokenizer.pad_token_id is None:
            tokenizer.pad_token_id = tokenizer.eos_token_id
        pad_id = tokenizer.pad_token_id
        end_id = tokenizer.eos_token_id

    return tokenizer, pad_id, end_id
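
# Typical call sequence (variable values are illustrative, not defined here):
#   model_name = read_model_name(engine_dir)
#   tokenizer, pad_id, end_id = load_tokenizer(tokenizer_dir=tokenizer_dir,
#                                              model_name=model_name)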