demo_test / fastchat /model /model_adapter.py
yuantao-infini-ai's picture
Upload 136 files
7472549 verified
"""Model adapter registration."""
import math
import os
import re
import sys
from typing import Dict, List, Optional
import warnings
if sys.version_info >= (3, 9):
from functools import cache
else:
from functools import lru_cache as cache
import accelerate
import psutil
import torch
from transformers import (
AutoConfig,
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
AutoTokenizer,
LlamaTokenizer,
LlamaForCausalLM,
T5Tokenizer,
)
from fastchat.constants import CPU_ISA
from fastchat.conversation import Conversation, get_conv_template
from fastchat.model.compression import load_compress_model
from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense
from fastchat.model.model_chatglm import generate_stream_chatglm
from fastchat.model.model_codet5p import generate_stream_codet5p
from fastchat.model.model_falcon import generate_stream_falcon
from fastchat.model.model_exllama import generate_stream_exllama
from fastchat.model.model_xfastertransformer import generate_stream_xft
from fastchat.model.monkey_patch_non_inplace import (
replace_llama_attn_with_non_inplace_operations,
)
from fastchat.modules.awq import AWQConfig, load_awq_quantized
from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
from fastchat.modules.xfastertransformer import load_xft_model, XftConfig
from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
from fastchat.utils import get_gpu_memory
# PEFT_SHARE_BASE_WEIGHTS toggles sharing of base-model weights between Peft
# adapters. Only the (case-insensitive) value "true" enables it; otherwise
# every Peft model is treated as a separate model.
peft_share_base_weights = os.getenv("PEFT_SHARE_BASE_WEIGHTS", "false").lower() == "true"

# Model names handled by ClaudeAdapter.
ANTHROPIC_MODEL_LIST = ("claude-1", "claude-2", "claude-instant-1")
class BaseModelAdapter:
    """The base and the default model adapter.

    Subclasses override ``match`` to claim specific model paths and may
    override the loading / conversation-template hooks.
    """

    # Passed as ``use_fast`` when creating the tokenizer.
    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Return True: the base adapter accepts every model path."""
        return True

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a causal LM and its tokenizer from ``model_path``.

        Args:
            model_path: Local folder or Hugging Face repo ID.
            from_pretrained_kwargs: Extra keyword arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained``. Its optional
                ``"revision"`` entry (default ``"main"``) is also used for
                the tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        revision = from_pretrained_kwargs.get("revision", "main")
        try:
            tokenizer = AutoTokenizer.from_pretrained(
                model_path,
                use_fast=self.use_fast_tokenizer,
                revision=revision,
                trust_remote_code=True,
            )
        except TypeError:
            # Retry with the slow tokenizer when the first attempt raises
            # TypeError.
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, use_fast=False, revision=revision, trust_remote_code=True
            )
        try:
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                use_flash_attention_2=True,
                **from_pretrained_kwargs,
            )
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit still propagate, and surface the
            # reason for the fallback instead of hiding it.
            warnings.warn(
                f"Loading {model_path} with flash attention 2 failed ({exc!r}); "
                "retrying without flash attention."
            )
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                low_cpu_mem_usage=True,
                trust_remote_code=True,
                use_flash_attention_2=False,
                **from_pretrained_kwargs,
            )
        return model, tokenizer

    def load_compress_model(self, model_path, device, torch_dtype, revision="main"):
        """Delegate to the module-level ``load_compress_model`` helper.

        The adapter's ``use_fast_tokenizer`` setting is forwarded as
        ``use_fast``.
        """
        return load_compress_model(
            model_path,
            device,
            torch_dtype,
            use_fast=self.use_fast_tokenizer,
            revision=revision,
        )

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the conversation template named after ``model_path``.

        Paths containing "megrez" or "minicpm" (case-insensitive) map to the
        templates of those names; any other path is lower-cased and used
        directly as the template name.
        """
        lowered = model_path.lower()
        if "megrez" in lowered:
            template_name = "megrez"
        elif "minicpm" in lowered:
            template_name = "minicpm"
        else:
            template_name = lowered
        return get_conv_template(template_name)
# Global registry holding one instance of every registered model adapter.
# TODO (lmzheng): make it a priority queue.
model_adapters: List[BaseModelAdapter] = []


def register_model_adapter(cls):
    """Instantiate ``cls`` without arguments and append it to the registry."""
    adapter_instance = cls()
    model_adapters.append(adapter_instance)
@cache
def get_model_adapter(model_path: str, model_name: str = None) -> BaseModelAdapter:
    """Look up the registered adapter responsible for a model.

    Two passes are made over the registry, in registration order:

    1. Match against ``model_name`` if given, otherwise the basename of
       ``model_path``; plain ``BaseModelAdapter`` instances are skipped here.
    2. Match against ``model_name`` if given, otherwise the full
       ``model_path``; any adapter may match.

    Raises:
        ValueError: if no registered adapter matches.
    """
    if model_name:
        short_name = model_name
    else:
        short_name = os.path.basename(os.path.normpath(model_path))

    # First pass: specialised adapters only.
    for candidate in model_adapters:
        if candidate.match(short_name) and type(candidate) != BaseModelAdapter:
            print(f"Matching model adapter: {candidate}")
            return candidate

    if model_name:
        model_path = model_name

    # Second pass: anything that matches.
    for candidate in model_adapters:
        if not candidate.match(model_path):
            continue
        print(f"Using model adapter: {candidate}")
        return candidate

    raise ValueError(f"No valid model adapter for {model_path}")
def raise_warning_for_incompatible_cpu_offloading_configuration(
    device: str, load_8bit: bool, cpu_offloading: bool
):
    """Validate a CPU-offloading request and return the effective setting.

    When ``cpu_offloading`` is falsy it is returned unchanged. Otherwise the
    request is honoured only if 8-bit loading is enabled, ``sys.platform``
    contains "linux" and the device is "cuda"; the first unmet requirement
    triggers a warning and ``False`` is returned.
    """
    if not cpu_offloading:
        return cpu_offloading

    problem = None
    if not load_8bit:
        problem = (
            "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n"
            "Use '--load-8bit' to enable 8-bit-quantization\n"
            "Continuing without cpu-offloading enabled\n"
        )
    elif "linux" not in sys.platform:
        problem = (
            "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n"
            "Continuing without cpu-offloading enabled\n"
        )
    elif device != "cuda":
        problem = (
            "CPU-offloading is only enabled when using CUDA-devices\n"
            "Continuing without cpu-offloading enabled\n"
        )

    if problem is None:
        return cpu_offloading

    warnings.warn(problem)
    return False
def load_model(
    model_path: str,
    device: str = "cuda",
    num_gpus: int = 1,
    max_gpu_memory: Optional[str] = None,
    dtype: Optional[torch.dtype] = None,
    load_8bit: bool = False,
    cpu_offloading: bool = False,
    gptq_config: Optional[GptqConfig] = None,
    awq_config: Optional[AWQConfig] = None,
    exllama_config: Optional[ExllamaConfig] = None,
    xft_config: Optional[XftConfig] = None,
    revision: str = "main",
    debug: bool = False,
    model_name: Optional[str] = None,
):
    """Load a model from Hugging Face.

    Args:
        model_path: Local folder or Hugging Face repo ID.
        device: One of "cpu", "cuda", "mps", "xpu", "npu".
        num_gpus: Number of GPUs; values other than 1 enable a device map
            and per-GPU ``max_memory`` on CUDA.
        max_gpu_memory: Per-GPU memory limit string; when None, 85% of each
            GPU's available memory (as reported by ``get_gpu_memory``) is
            used.
        dtype: Overrides the device-dependent default torch dtype.
        load_8bit: Load through the adapter's compressed-model path.
        cpu_offloading: Request 8-bit CPU offloading (validated first, see
            ``raise_warning_for_incompatible_cpu_offloading_configuration``).
        gptq_config / awq_config / exllama_config / xft_config: Optional
            alternative loader configurations; the first applicable one
            (in the order 8-bit, AWQ, GPTQ, exllama, xft) short-circuits the
            regular loading path.
        revision: Model revision forwarded to the adapter.
        debug: Print the loaded model.
        model_name: Optional name used instead of ``model_path`` for adapter
            lookup.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: if ``device`` is not one of the supported values.
    """
    # get model adapter
    adapter = get_model_adapter(model_path, model_name)

    # Handle device mapping
    cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration(
        device, load_8bit, cpu_offloading
    )

    # Bound up-front so the post-load CPU optimisation below can tell whether
    # Intel Extension for PyTorch was actually imported. Previously the name
    # could be unbound (e.g. dtype=torch.bfloat16 passed explicitly on a CPU
    # whose ISA is not avx512_bf16/amx), raising UnboundLocalError.
    ipex = None

    if device == "cpu":
        kwargs = {"torch_dtype": torch.float16}
        if CPU_ISA in ["avx512_bf16", "amx"]:
            try:
                import intel_extension_for_pytorch as ipex

                kwargs = {"torch_dtype": torch.bfloat16}
            except ImportError:
                warnings.warn(
                    "Intel Extension for PyTorch is not installed, it can be installed to accelerate cpu inference"
                )
    elif device == "cuda":
        kwargs = {"torch_dtype": torch.bfloat16}
        if num_gpus != 1:
            kwargs["device_map"] = "auto"
            if max_gpu_memory is None:
                kwargs[
                    "device_map"
                ] = "sequential"  # This is important for not the same VRAM sizes
                available_gpu_memory = get_gpu_memory(num_gpus)
                kwargs["max_memory"] = {
                    i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
                    for i in range(num_gpus)
                }
            else:
                kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
    elif device == "mps":
        kwargs = {"torch_dtype": torch.float16}
        # Avoid bugs in mps backend by not using in-place operations.
        replace_llama_attn_with_non_inplace_operations()
    elif device == "xpu":
        kwargs = {"torch_dtype": torch.bfloat16}
        # Try to load ipex, while it looks unused, it links into torch for xpu support
        try:
            import intel_extension_for_pytorch as ipex
        except ImportError:
            warnings.warn(
                "Intel Extension for PyTorch is not installed, but is required for xpu inference."
            )
    elif device == "npu":
        kwargs = {"torch_dtype": torch.float16}
        # Try to load torch_npu; the import is done only for its side effects.
        try:
            import torch_npu
        except ImportError:
            warnings.warn("Ascend Extension for PyTorch is not installed.")
    else:
        raise ValueError(f"Invalid device: {device}")

    if cpu_offloading:
        # raises an error on incompatible platforms
        from transformers import BitsAndBytesConfig

        if "max_memory" in kwargs:
            kwargs["max_memory"]["cpu"] = (
                str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib"
            )
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_8bit_fp32_cpu_offload=cpu_offloading
        )
        kwargs["load_in_8bit"] = load_8bit
    elif load_8bit:
        if num_gpus != 1:
            # Falls through to the regular (uncompressed) loading path.
            warnings.warn(
                "8-bit quantization is not supported for multi-gpu inference."
            )
        else:
            model, tokenizer = adapter.load_compress_model(
                model_path=model_path,
                device=device,
                torch_dtype=kwargs["torch_dtype"],
                revision=revision,
            )
            if debug:
                print(model)
            return model, tokenizer
    elif awq_config and awq_config.wbits < 16:
        assert (
            awq_config.wbits == 4
        ), "Currently we only support 4-bit inference for AWQ."
        model, tokenizer = load_awq_quantized(model_path, awq_config, device)
        if num_gpus != 1:
            device_map = accelerate.infer_auto_device_map(
                model,
                max_memory=kwargs["max_memory"],
                no_split_module_classes=[
                    "OPTDecoderLayer",
                    "LlamaDecoderLayer",
                    "BloomBlock",
                    "MPTBlock",
                    "DecoderLayer",
                ],
            )
            model = accelerate.dispatch_model(
                model, device_map=device_map, offload_buffers=True
            )
        else:
            model.to(device)
        return model, tokenizer
    elif gptq_config and gptq_config.wbits < 16:
        model, tokenizer = load_gptq_quantized(model_path, gptq_config)
        if num_gpus != 1:
            device_map = accelerate.infer_auto_device_map(
                model,
                max_memory=kwargs["max_memory"],
                no_split_module_classes=["LlamaDecoderLayer"],
            )
            model = accelerate.dispatch_model(
                model, device_map=device_map, offload_buffers=True
            )
        else:
            model.to(device)
        return model, tokenizer
    elif exllama_config:
        model, tokenizer = load_exllama_model(model_path, exllama_config)
        return model, tokenizer
    elif xft_config:
        model, tokenizer = load_xft_model(model_path, xft_config)
        return model, tokenizer

    kwargs["revision"] = revision

    if dtype is not None:  # Overwrite dtype if it is provided in the arguments.
        kwargs["torch_dtype"] = dtype

    # Load model
    model, tokenizer = adapter.load_model(model_path, kwargs)

    if (
        device == "cpu"
        and kwargs["torch_dtype"] is torch.bfloat16
        and CPU_ISA is not None
        and ipex is not None  # skip when ipex was never imported
    ):
        model = ipex.optimize(model, dtype=kwargs["torch_dtype"])

    if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device in (
        "mps",
        "xpu",
        "npu",
    ):
        model.to(device)

    if device == "xpu":
        model = torch.xpu.optimize(model, dtype=kwargs["torch_dtype"], inplace=True)

    if debug:
        print(model)

    return model, tokenizer
def get_conversation_template(model_path: str) -> Conversation:
    """Return the default conversation template of the adapter matching ``model_path``."""
    return get_model_adapter(model_path).get_default_conv_template(model_path)
def get_generate_stream_function(model: torch.nn.Module, model_path: str):
    """Pick the streaming-generation function suited to ``model``.

    The choice is made by looking for marker substrings in the lower-cased
    string form of the model's type. Peft models get a wrapper that selects
    the adapter named ``model_path`` before generating, but only when base
    weight sharing is enabled; everything else uses the generic
    ``generate_stream``.
    """
    from fastchat.serve.inference import generate_stream

    type_name = str(type(model)).lower()

    # (marker substring, stream function), checked in priority order.
    specialised_streams = (
        ("chatglm", generate_stream_chatglm),
        ("rwforcausallm", generate_stream_falcon),
        ("codet5p", generate_stream_codet5p),
        ("exllama", generate_stream_exllama),
        ("xft", generate_stream_xft),
    )
    for marker, stream_fn in specialised_streams:
        if marker in type_name:
            return stream_fn

    if not (peft_share_base_weights and "peft" in type_name):
        return generate_stream

    # Curried stream function: it activates the adapter registered under
    # ``model_path`` so the right weights are used for this request.
    @torch.inference_mode()
    def generate_stream_peft(
        model,
        tokenizer,
        params: Dict,
        device: str,
        context_len: int,
        stream_interval: int = 2,
        judge_sent_end: bool = False,
    ):
        model.set_adapter(model_path)
        for chunk in generate_stream(
            model,
            tokenizer,
            params,
            device,
            context_len,
            stream_interval,
            judge_sent_end,
        ):
            yield chunk

    return generate_stream_peft
def add_model_args(parser):
    """Register the model-loading command-line options on ``parser``.

    ``parser`` only needs an ``add_argument`` method compatible with
    ``argparse.ArgumentParser.add_argument``.
    """
    add = parser.add_argument

    # General model / device selection.
    add(
        "--model-path",
        default="lmsys/vicuna-7b-v1.5",
        type=str,
        help="The path to the weights. This can be a local folder or a Hugging Face repo ID.",
    )
    add(
        "--revision",
        default="main",
        type=str,
        help="Hugging Face Hub model revision identifier",
    )
    add(
        "--device",
        default="cuda",
        type=str,
        choices=["cpu", "cuda", "mps", "xpu", "npu"],
        help="The device type",
    )
    add(
        "--gpus",
        default=None,
        type=str,
        help="A single GPU like 1 or multiple GPUs like 0,2",
    )
    add("--num-gpus", default=1, type=int)
    add(
        "--max-gpu-memory",
        type=str,
        help="The maximum memory per GPU for storing model weights. Use a string like '13Gib'",
    )
    add(
        "--dtype",
        default=None,
        type=str,
        choices=["float32", "float16", "bfloat16"],
        help="Override the default dtype. If not set, it will use float16 on GPU and float32 on CPU.",
    )

    # 8-bit loading.
    add("--load-8bit", action="store_true", help="Use 8-bit quantization")
    add(
        "--cpu-offloading",
        action="store_true",
        help="Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU",
    )

    # GPTQ.
    add(
        "--gptq-ckpt",
        default=None,
        type=str,
        help="Used for GPTQ. The path to the local GPTQ checkpoint.",
    )
    add(
        "--gptq-wbits",
        default=16,
        type=int,
        choices=[2, 3, 4, 8, 16],
        help="Used for GPTQ. #bits to use for quantization",
    )
    add(
        "--gptq-groupsize",
        default=-1,
        type=int,
        help="Used for GPTQ. Groupsize to use for quantization; default uses full row.",
    )
    add(
        "--gptq-act-order",
        action="store_true",
        help="Used for GPTQ. Whether to apply the activation order GPTQ heuristic",
    )

    # AWQ.
    add(
        "--awq-ckpt",
        default=None,
        type=str,
        help="Used for AWQ. Load quantized model. The path to the local AWQ checkpoint.",
    )
    add(
        "--awq-wbits",
        default=16,
        type=int,
        choices=[4, 16],
        help="Used for AWQ. #bits to use for AWQ quantization",
    )
    add(
        "--awq-groupsize",
        default=-1,
        type=int,
        help="Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
    )

    # ExLlama.
    add(
        "--enable-exllama",
        action="store_true",
        help="Used for exllamabv2. Enable exllamaV2 inference framework.",
    )
    add(
        "--exllama-max-seq-len",
        default=4096,
        type=int,
        help="Used for exllamabv2. Max sequence length to use for exllamav2 framework; default 4096 sequence length.",
    )
    add(
        "--exllama-gpu-split",
        default=None,
        type=str,
        help="Used for exllamabv2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
    )

    # xFasterTransformer.
    add(
        "--enable-xft",
        action="store_true",
        help="Used for xFasterTransformer Enable xFasterTransformer inference framework.",
    )
    add(
        "--xft-max-seq-len",
        default=4096,
        type=int,
        help="Used for xFasterTransformer. Max sequence length to use for xFasterTransformer framework; default 4096 sequence length.",
    )
    add(
        "--xft-dtype",
        default=None,
        type=str,
        choices=["fp16", "bf16", "int8", "bf16_fp16", "bf16_int8"],
        help="Override the default dtype. If not set, it will use bfloat16 for first token and float16 next tokens on CPU.",
    )
def remove_parent_directory_name(model_path):
    """Return the last "/"-separated component of ``model_path``.

    A single trailing "/" is dropped first, so "a/b/" yields "b". An empty
    string is returned unchanged instead of raising ``IndexError``.
    """
    # ``endswith`` is safe on the empty string, unlike indexing with [-1].
    if model_path.endswith("/"):
        model_path = model_path[:-1]
    return model_path.split("/")[-1]
# Maps a base model path to the (model, tokenizer) pair of the first Peft
# model loaded on top of it; only used when base weights are shared.
peft_model_cache = {}


class PeftModelAdapter:
    """Adapter that loads a "peft" model together with its base model."""

    def match(self, model_path: str):
        """Accept paths holding an ``adapter_config.json`` or containing "peft"."""
        adapter_config = os.path.join(model_path, "adapter_config.json")
        if os.path.exists(adapter_config):
            return True
        lowered = model_path.lower()
        return "peft" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the base model, then apply the Peft adapter weights.

        Raises:
            ValueError: if the base model path itself contains "peft".
        """
        from peft import PeftConfig, PeftModel

        peft_config = PeftConfig.from_pretrained(model_path)
        base_model_path = peft_config.base_model_name_or_path
        if "peft" in base_model_path:
            raise ValueError(
                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {peft_config.base_model_name_or_path}"
            )

        if not peft_share_base_weights:
            # Normal case: every Peft model gets its own copy of the base
            # weights.
            base_adapter = get_model_adapter(base_model_path)
            base_model, tokenizer = base_adapter.load_model(
                base_model_path, from_pretrained_kwargs
            )
            model = PeftModel.from_pretrained(base_model, model_path)
            return model, tokenizer

        # Shared-base proof of concept. Peft rewrites the underlying base
        # model and keeps an internal map of adapter layers, so:
        #   1. the first Peft model loaded for a base model is cached;
        #   2. later Peft models on that base are added to it as adapters;
        #   3. adapters are always named by ``model_path``, because that is
        #      what is available at inference time;
        #   4. get_generate_stream_function activates the right adapter
        #      before inference.
        if base_model_path in peft_model_cache:
            model, tokenizer = peft_model_cache[base_model_path]
            # The adapter name must be model_path.
            model.load_adapter(model_path, adapter_name=model_path)
            return model, tokenizer

        base_adapter = get_model_adapter(base_model_path)
        base_model, tokenizer = base_adapter.load_model(
            base_model_path, from_pretrained_kwargs
        )
        # The adapter name must be model_path.
        model = PeftModel.from_pretrained(
            base_model, model_path, adapter_name=model_path
        )
        peft_model_cache[base_model_path] = (model, tokenizer)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the conversation template of the base model.

        Raises:
            ValueError: if the base model path itself contains "peft".
        """
        from peft import PeftConfig, PeftModel

        peft_config = PeftConfig.from_pretrained(model_path)
        base_model_path = peft_config.base_model_name_or_path
        if "peft" in base_model_path:
            raise ValueError(
                f"PeftModelAdapter cannot load a base model with 'peft' in the name: {peft_config.base_model_name_or_path}"
            )
        base_adapter = get_model_adapter(base_model_path)
        return base_adapter.get_default_conv_template(base_model_path)
class DeepseekChatAdapter(BaseModelAdapter):
    """The model adapter for deepseek-ai's chat models"""

    # Note: that this model will require tokenizer version >= 0.13.3 because the tokenizer class is LlamaTokenizerFast

    def match(self, model_path: str):
        """Match paths containing both "deepseek" and "chat" (case-insensitive)."""
        lowered = model_path.lower()
        return "deepseek" in lowered and "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "deepseek" conversation template."""
        return get_conv_template("deepseek")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model in bfloat16 with flash attention 2, plus its tokenizer.

        Only the optional ``"max_memory"`` entry of
        ``from_pretrained_kwargs`` is used; all other entries are ignored.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            device_map="sequential",
            torch_dtype=torch.bfloat16,
            # ``max_memory`` is only populated for multi-GPU CUDA loads, so
            # use .get() instead of indexing to avoid a KeyError otherwise.
            max_memory=from_pretrained_kwargs.get("max_memory"),
            attn_implementation="flash_attention_2",
        )
        # Fixed keyword typo (was ``utrust_remote_code``), which meant the
        # tokenizer was never loaded with trust_remote_code enabled.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True
        )
        return model, tokenizer
class VicunaAdapter(BaseModelAdapter):
    """Model adapter for Vicuna models (e.g., lmsys/vicuna-7b-v1.5)"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "vicuna" (case-insensitive)."""
        lowered = model_path.lower()
        return "vicuna" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model, then warn if the weights look like Vicuna-v0."""
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=revision,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            use_flash_attention_2=True,
            **from_pretrained_kwargs,
        )
        self.raise_warning_for_old_weights(model)
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Use "one_shot" when the last path component contains "v0", else "vicuna_v1.1"."""
        leaf_name = remove_parent_directory_name(model_path)
        template_name = "one_shot" if "v0" in leaf_name else "vicuna_v1.1"
        return get_conv_template(template_name)

    def raise_warning_for_old_weights(self, model):
        """Warn when a Llama model's vocabulary size exceeds 32000."""
        if not isinstance(model, LlamaForCausalLM):
            return
        if model.model.vocab_size <= 32000:
            return
        warnings.warn(
            "\nYou are probably using the old Vicuna-v0 model, "
            "which will generate unexpected results with the "
            "current fastchat.\nYou can try one of the following methods:\n"
            "1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n"
            "2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template one_shot`\n"
            "3. Downgrade fschat to fschat==0.1.10 (Not recommended).\n"
        )
class AiroborosAdapter(BaseModelAdapter):
    """The model adapter for jondurbin/airoboros-*"""

    def match(self, model_path: str):
        """Match paths containing "airoboros" or "spicyboros" (case-insensitive)."""
        found = re.search(r"airoboros|spicyboros", model_path, re.I)
        return found is not None

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Choose the airoboros template version from markers in the path."""
        if "-3." in model_path or "-3p" in model_path:
            template_name = "airoboros_v3"
        elif "spicyboros" in model_path or re.search(r"-(2\.[2-9]+)", model_path):
            template_name = "airoboros_v2"
        else:
            template_name = "airoboros_v1"
        return get_conv_template(template_name)

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Use the default loader unless the path contains "mpt".

        Paths containing "mpt" are loaded with ``max_seq_len=8192`` and a
        fast tokenizer.
        """
        is_mpt_based = "mpt" in model_path.lower()
        if not is_mpt_based:
            return super().load_model(model_path, from_pretrained_kwargs)

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            max_seq_len=8192,
            **from_pretrained_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            use_fast=True,
        )
        return model, tokenizer
class Zhinao360Adapter(BaseModelAdapter):
    """The model adapter for 360zhinao models."""

    def match(self, model_path: str):
        """Match any path containing "360zhinao" (case-insensitive)."""
        return "360zhinao" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model, its tokenizer and its generation config.

        ``from_pretrained_kwargs`` is accepted for interface compatibility
        but not used. The generation config is attached to the model as
        ``model.generation_config`` rather than returned as a third tuple
        element: the module-level ``load_model`` unpacks exactly two values
        from every adapter, so a 3-tuple raised ``ValueError`` there.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        from transformers import GenerationConfig

        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
        )
        model.generation_config = GenerationConfig.from_pretrained(
            model_path,
            trust_remote_code=True,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "360zhinao" conversation template."""
        return get_conv_template("360zhinao")
class LongChatAdapter(BaseModelAdapter):
    """Model adapter for LongChat models (e.g., lmsys/longchat-7b-16k)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "longchat" (case-insensitive)."""
        lowered = model_path.lower()
        return "longchat" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Apply the condense monkey patch, then load tokenizer and model."""
        revision = from_pretrained_kwargs.get("revision", "main")

        # Apply monkey patch, TODO(Dacheng): Add flash attention support
        model_config = AutoConfig.from_pretrained(model_path, revision=revision)
        scaling_factor = model_config.rope_scaling["factor"]
        replace_llama_with_condense(scaling_factor)

        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=revision,
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template."""
        return get_conv_template("vicuna_v1.1")
class GoogleT5Adapter(BaseModelAdapter):
    """The model adapter for google/Flan based models, such as Salesforce/codet5p-6b, lmsys/fastchat-t5-3b-v1.0, flan-t5-*, flan-ul2"""

    def match(self, model_path: str):
        """Match paths containing "flan-", "fastchat-t5" or "codet5p" (case-insensitive)."""
        lowered = model_path.lower()
        for marker in ("flan-", "fastchat-t5", "codet5p"):
            if marker in lowered:
                return True
        return False

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a T5 tokenizer and a sequence-to-sequence model."""
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision)
        seq2seq_model = AutoModelForSeq2SeqLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return seq2seq_model, tokenizer
class KoalaAdapter(BaseModelAdapter):
    """The model adapter for Koala"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "koala" (case-insensitive)."""
        lowered = model_path.lower()
        return "koala" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "koala_v1" conversation template."""
        template = get_conv_template("koala_v1")
        return template
class AlpacaAdapter(BaseModelAdapter):
    """The model adapter for Alpaca"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "alpaca" (case-insensitive)."""
        lowered = model_path.lower()
        return "alpaca" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        template = get_conv_template("alpaca")
        return template
class ChatGLMAdapter(BaseModelAdapter):
    """The model adapter for THUDM/chatglm-6b, THUDM/chatglm2-6b"""

    def match(self, model_path: str):
        """Match any path containing "chatglm" (case-insensitive)."""
        lowered = model_path.lower()
        return "chatglm" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model with remote code enabled.

        For paths containing "chatglm3" the tokenizer is additionally
        created with ``encode_special_tokens=True``.
        """
        revision = from_pretrained_kwargs.get("revision", "main")

        tokenizer_options = {"trust_remote_code": True, "revision": revision}
        if "chatglm3" in model_path.lower():
            tokenizer_options["encode_special_tokens"] = True
        tokenizer = AutoTokenizer.from_pretrained(model_path, **tokenizer_options)

        model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chatglm2" / "chatglm3" template if named in the path, else "chatglm"."""
        lowered = model_path.lower()
        for variant in ("chatglm2", "chatglm3"):
            if variant in lowered:
                return get_conv_template(variant)
        return get_conv_template("chatglm")
class CodeGeexAdapter(BaseModelAdapter):
    """The model adapter for THUDM/codegeex-6b, THUDM/codegeex2-6b"""

    def match(self, model_path: str):
        """Match any path containing "codegeex" (case-insensitive)."""
        lowered = model_path.lower()
        return "codegeex" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model with remote code enabled."""
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=revision,
        )
        model = AutoModel.from_pretrained(
            model_path,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "codegeex" conversation template."""
        template = get_conv_template("codegeex")
        return template
class DollyV2Adapter(BaseModelAdapter):
    """The model adapter for databricks/dolly-v2-12b"""

    def match(self, model_path: str):
        """Match any path containing "dolly-v2" (case-insensitive)."""
        lowered = model_path.lower()
        return "dolly-v2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and model, then set the EOS / pad token ids."""
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )

        # 50277 means "### End"
        tokenizer.eos_token_id = 50277

        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "dolly_v2" conversation template."""
        template = get_conv_template("dolly_v2")
        return template
class OasstPythiaAdapter(BaseModelAdapter):
    """The model adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"""

    def match(self, model_path: str):
        """Match paths containing both "oasst" and "pythia" (case-insensitive)."""
        lowered = model_path.lower()
        return "oasst" in lowered and "pythia" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "oasst_pythia" conversation template."""
        template = get_conv_template("oasst_pythia")
        return template

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter and copy EOS / pad token ids onto the model config."""
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer
class OasstLLaMAAdapter(BaseModelAdapter):
    """The model adapter for OpenAssistant/oasst-sft-7-llama-30b"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match the openassistant llama-30b path, or "oasst" paths without "pythia"."""
        lowered = model_path.lower()
        is_named_checkpoint = "openassistant-sft-7-llama-30b-hf" in lowered
        return is_named_checkpoint or ("oasst" in lowered and "pythia" not in lowered)

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "oasst_llama" conversation template."""
        template = get_conv_template("oasst_llama")
        return template
class OpenChat35Adapter(BaseModelAdapter):
    """The model adapter for OpenChat 3.5 (e.g. openchat/openchat_3.5)"""

    def match(self, model_path: str):
        """Match paths containing both "openchat" and "3.5" (case-insensitive)."""
        lowered = model_path.lower()
        return "openchat" in lowered and "3.5" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "openchat_3.5" conversation template."""
        template = get_conv_template("openchat_3.5")
        return template
class PythiaAdapter(BaseModelAdapter):
    """The model adapter for any EleutherAI/pythia model"""

    def match(self, model_path: str):
        """Match any path containing "pythia" (case-insensitive)."""
        lowered = model_path.lower()
        return "pythia" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter and copy EOS / pad token ids onto the model config."""
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer
class StableLMAdapter(BaseModelAdapter):
    """The model adapter for StabilityAI/stablelm-tuned-alpha-7b"""

    def match(self, model_path: str):
        """Match any path containing "stablelm" (case-insensitive)."""
        lowered = model_path.lower()
        return "stablelm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "stablelm" conversation template."""
        template = get_conv_template("stablelm")
        return template
class MPTAdapter(BaseModelAdapter):
    """The model adapter for MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)"""

    def match(self, model_path: str):
        """Match paths containing "mpt" but not "airoboros" (case-insensitive)."""
        lowered = model_path.lower()
        return "mpt" in lowered and "airoboros" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load model (``max_seq_len=8192``) and tokenizer, then sync EOS / pad token ids."""
        revision = from_pretrained_kwargs.get("revision", "main")
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            max_seq_len=8192,
            **from_pretrained_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=revision,
        )
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the template named after the MPT variant in the path.

        Falls back to "zero_shot" (with a printed warning) when none of the
        known variant names appears in the path.
        """
        lowered = model_path.lower()
        for variant in ("mpt-7b-chat", "mpt-30b-chat", "mpt-30b-instruct"):
            if variant in lowered:
                return get_conv_template(variant)

        print(
            "Warning: Loading base MPT model with `zero_shot` conversation configuration.  "
            "If this is not desired, inspect model configurations and names."
        )
        return get_conv_template("zero_shot")
class BaizeAdapter(BaseModelAdapter):
    """The model adapter for project-baize/baize-v2-7b"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "baize" (case-insensitive)."""
        lowered = model_path.lower()
        return "baize" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "baize" conversation template."""
        template = get_conv_template("baize")
        return template
class RwkvAdapter(BaseModelAdapter):
    """The model adapter for BlinkDL/RWKV-4-Raven"""

    def match(self, model_path: str):
        """Match any path containing "rwkv-4" (case-insensitive)."""
        lowered = model_path.lower()
        return "rwkv-4" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Build an ``RwkvModel`` and pair it with the EleutherAI/pythia-160m tokenizer."""
        from fastchat.model.rwkv_model import RwkvModel

        rwkv_model = RwkvModel(model_path)
        tokenizer = AutoTokenizer.from_pretrained(
            "EleutherAI/pythia-160m",
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        return rwkv_model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "rwkv" conversation template."""
        template = get_conv_template("rwkv")
        return template
class OpenBuddyAdapter(BaseModelAdapter):
    """The model adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc"""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match any path containing "openbuddy" (case-insensitive)."""
        lowered = model_path.lower()
        return "openbuddy" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "openbuddy" conversation template."""
        template = get_conv_template("openbuddy")
        return template
class PhoenixAdapter(BaseModelAdapter):
    """The model adapter for FreedomIntelligence/phoenix-inst-chat-7b"""

    def match(self, model_path: str):
        """Match any path containing "phoenix" (case-insensitive)."""
        lowered = model_path.lower()
        return "phoenix" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "phoenix" conversation template."""
        template = get_conv_template("phoenix")
        return template
class ReaLMAdapter(BaseModelAdapter):
    """The model adapter for FreedomIntelligence/ReaLM-7b"""

    def match(self, model_path: str):
        """Match any path containing "ReaLM" (case-sensitive)."""
        is_realm = "ReaLM" in model_path
        return is_realm

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load a fast tokenizer and the causal LM."""
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "ReaLM-7b-v1" conversation template."""
        template = get_conv_template("ReaLM-7b-v1")
        return template
class ChatGPTAdapter(BaseModelAdapter):
    """The model adapter for ChatGPT"""

    def match(self, model_path: str):
        """Match only the exact OpenAI model names listed below."""
        supported_names = (
            "gpt-3.5-turbo",
            "gpt-3.5-turbo-1106",
            "gpt-4",
            "gpt-4-turbo",
        )
        return model_path in supported_names

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported by this adapter; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chatgpt" conversation template."""
        template = get_conv_template("chatgpt")
        return template
class AzureOpenAIAdapter(BaseModelAdapter):
    """The model adapter for Azure OpenAI"""

    def match(self, model_path: str):
        """Match only the exact Azure model names listed below."""
        supported_names = ("azure-gpt-35-turbo", "azure-gpt-4")
        return model_path in supported_names

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported by this adapter; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "chatgpt" conversation template."""
        template = get_conv_template("chatgpt")
        return template
class ClaudeAdapter(BaseModelAdapter):
    """Adapter for the Claude model names in ANTHROPIC_MODEL_LIST."""

    def match(self, model_path: str):
        """Accept only names that appear in ANTHROPIC_MODEL_LIST."""
        is_listed = model_path in ANTHROPIC_MODEL_LIST
        return is_listed

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported by this adapter: always raises NotImplementedError."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "claude" conversation template."""
        template_name = "claude"
        return get_conv_template(template_name)
class BardAdapter(BaseModelAdapter):
    """Adapter selected by the exact model name "bard"."""

    def match(self, model_path: str):
        """Accept only the exact name "bard"."""
        is_bard = model_path == "bard"
        return is_bard

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported by this adapter: always raises NotImplementedError."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "bard" conversation template."""
        template_name = "bard"
        return get_conv_template(template_name)
class PaLM2Adapter(BaseModelAdapter):
    """Adapter selected by the exact model name "palm-2"."""

    def match(self, model_path: str):
        """Accept only the exact name "palm-2"."""
        is_palm2 = model_path == "palm-2"
        return is_palm2

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Not supported by this adapter: always raises NotImplementedError."""
        raise NotImplementedError()

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "bard" conversation template (the same one BardAdapter uses)."""
        template_name = "bard"
        return get_conv_template(template_name)
class BiLLaAdapter(BaseModelAdapter):
    """Adapter for BiLLa models (e.g. Neutralzz/BiLLa-7B-SFT)."""

    def match(self, model_path: str):
        """Report whether "billa" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "billa" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "billa" conversation template."""
        template_name = "billa"
        return get_conv_template(template_name)
class RedPajamaINCITEAdapter(BaseModelAdapter):
    """Adapter for RedPajama-INCITE (e.g. togethercomputer/RedPajama-INCITE-7B-Chat)."""

    def match(self, model_path: str):
        """Report whether "redpajama-incite" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "redpajama-incite" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM, and return (model, tokenizer).

        The tokenizer uses the "revision" entry of ``from_pretrained_kwargs``
        ("main" when absent); the model receives all of the kwargs.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, revision=from_pretrained_kwargs.get("revision", "main")
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "redpajama-incite" conversation template."""
        template_name = "redpajama-incite"
        return get_conv_template(template_name)
class H2OGPTAdapter(BaseModelAdapter):
    """Adapter for h2oGPT (e.g. h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "h2ogpt" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "h2ogpt" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "h2ogpt" conversation template."""
        template_name = "h2ogpt"
        return get_conv_template(template_name)
class RobinAdapter(BaseModelAdapter):
    """Adapter for Robin models (e.g. LMFlow/Full-Robin-7b-v2)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "robin" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "robin" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "Robin" conversation template (capitalised name)."""
        template_name = "Robin"
        return get_conv_template(template_name)
class SnoozyAdapter(BaseModelAdapter):
    """Adapter for GPT4All Snoozy (e.g. nomic-ai/gpt4all-13b-snoozy)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Require both "gpt4all" and "snoozy" in the lower-cased path."""
        lowered = model_path.lower()
        if "gpt4all" not in lowered:
            return False
        return "snoozy" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "snoozy" conversation template."""
        template_name = "snoozy"
        return get_conv_template(template_name)
class WizardLMAdapter(BaseModelAdapter):
    """Adapter for WizardLM models (e.g. WizardLM/WizardLM-13B-V1.0)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "wizardlm" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "wizardlm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Choose the template from the size tag found in the path.

        Paths mentioning 13b, 30b or 70b get "vicuna_v1.1"; all others get
        "one_shot".
        """
        lowered = model_path.lower()
        if any(size_tag in lowered for size_tag in ("13b", "30b", "70b")):
            return get_conv_template("vicuna_v1.1")
        # TODO: use the recommended template for 7B
        # (https://huggingface.co/WizardLM/WizardLM-13B-V1.0)
        return get_conv_template("one_shot")
class ManticoreAdapter(BaseModelAdapter):
    """Adapter for Manticore (e.g. openaccess-ai-collective/manticore-13b-chat-pyg)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "manticore" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "manticore" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "manticore" conversation template."""
        template_name = "manticore"
        return get_conv_template(template_name)
class GuanacoAdapter(BaseModelAdapter):
    """Adapter for Guanaco models (e.g. timdettmers/guanaco-33b-merged)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "guanaco" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "guanaco" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM, then align the tokenizer's EOS id.

        Returns a ``(model, tokenizer)`` tuple. The tokenizer honours
        ``use_fast_tokenizer`` and the "revision" entry of the kwargs
        ("main" when absent).
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        # The tokenizer takes its EOS id from the model config; the original
        # note describes this as working around a bug in the tokenizer config.
        eos_id = model.config.eos_token_id
        tokenizer.eos_token_id = eos_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zero_shot" conversation template."""
        template_name = "zero_shot"
        return get_conv_template(template_name)
class ChangGPTAdapter(BaseModelAdapter):
    """Adapter for ChangGPT (e.g. lcw99/polyglot-ko-12.8b-chang-instruct-chat)."""

    def match(self, model_path: str):
        """Require both "polyglot" and "chang" in the lower-cased path."""
        lowered = model_path.lower()
        if "polyglot" not in lowered:
            return False
        return "chang" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "polyglot_changgpt" conversation template."""
        template_name = "polyglot_changgpt"
        return get_conv_template(template_name)
class CamelAdapter(BaseModelAdapter):
    """Adapter for CAMEL models (e.g. camel-ai/CAMEL-13B-Combined-Data)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "camel" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "camel" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template."""
        template_name = "vicuna_v1.1"
        return get_conv_template(template_name)
class TuluAdapter(BaseModelAdapter):
    """Adapter for Tulu models (e.g. allenai/tulu-30b)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "tulu" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "tulu" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tulu" conversation template."""
        template_name = "tulu"
        return get_conv_template(template_name)
class FalconAdapter(BaseModelAdapter):
    """Adapter for non-chat Falcon models (e.g. tiiuae/falcon-40b)."""

    def match(self, model_path: str):
        """Match paths that contain "falcon" but not "chat" (case-insensitive)."""
        lowered = model_path.lower()
        if "falcon" not in lowered:
            return False
        return "chat" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load tokenizer and causal LM and set the tokenizer's pad token id.

        Returns a ``(model, tokenizer)`` tuple. The model is loaded with
        ``trust_remote_code=True``.
        """
        # Original note: bf16 is strongly suggested, as recommended by the
        # author of Falcon.
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, revision=from_pretrained_kwargs.get("revision", "main")
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        # Original note: the Falcon tokenizer config and special-token config
        # define no pad token, so id 9 (the '>>SUFFIX<<' special token) is used.
        pad_id = 9
        tokenizer.pad_token_id = pad_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "falcon" conversation template."""
        template_name = "falcon"
        return get_conv_template(template_name)
class FalconChatAdapter(BaseModelAdapter):
    """Adapter for Falcon paths that also mention "chat"."""

    def match(self, model_path: str):
        """Require both "falcon" and "chat" in the lower-cased path."""
        lowered = model_path.lower()
        if "falcon" not in lowered:
            return False
        return "chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "falcon-chat" conversation template."""
        template_name = "falcon-chat"
        return get_conv_template(template_name)
class TigerBotAdapter(BaseModelAdapter):
    """Adapter for TigerBot models (e.g. TigerResearch/tigerbot-7b-sft)."""

    def match(self, model_path: str):
        """Report whether "tigerbot" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "tigerbot" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM, both with remote code trusted.

        Returns a ``(model, tokenizer)`` tuple. The tokenizer uses the
        "revision" entry of the kwargs ("main" when absent).
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "tigerbot" conversation template."""
        template_name = "tigerbot"
        return get_conv_template(template_name)
class BaichuanAdapter(BaseModelAdapter):
    """Adapter for Baichuan models (e.g. baichuan-inc/Baichuan-7B)."""

    def match(self, model_path: str):
        """Report whether "baichuan" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "baichuan" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM, both with remote code trusted.

        Returns a ``(model, tokenizer)`` tuple. The tokenizer uses the
        "revision" entry of the kwargs ("main" when absent).
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick a template from the path.

        Without "chat" in the path the result is "zero_shot". Chat paths get
        "baichuan2-chat" when they mention "baichuan2", otherwise
        "baichuan-chat" (e.g. Baichuan-13B-Chat).
        """
        lowered = model_path.lower()
        if "chat" not in lowered:
            return get_conv_template("zero_shot")
        if "baichuan2" in lowered:
            return get_conv_template("baichuan2-chat")
        return get_conv_template("baichuan-chat")
class XGenAdapter(BaseModelAdapter):
    """Adapter for XGen models (e.g. Salesforce/xgen-7b)."""

    def match(self, model_path: str):
        """Report whether "xgen" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "xgen" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM, then the tokenizer, and fix the model's EOS id.

        Both loads trust remote code. Returns a ``(model, tokenizer)`` tuple
        whose model config has ``eos_token_id`` set to 50256.
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        eos_id = 50256
        model.config.eos_token_id = eos_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "xgen" conversation template."""
        template_name = "xgen"
        return get_conv_template(template_name)
class NousHermesAdapter(BaseModelAdapter):
    """Adapter for Nous-Hermes models (e.g. NousResearch/Nous-Hermes-13b)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "nous-hermes" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "nous-hermes" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        template_name = "alpaca"
        return get_conv_template(template_name)
class InternLMChatAdapter(BaseModelAdapter):
    """Adapter for InternLM chat models (e.g. internlm/internlm-chat-7b)."""

    def match(self, model_path: str):
        """Report whether "internlm-chat" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "internlm-chat" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM in eval mode, then the tokenizer.

        Both loads trust remote code. When the path mentions "8k", the model
        config's ``max_sequence_length`` is set to 8192. Returns a
        ``(model, tokenizer)`` tuple.
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        ).eval()
        is_8k_variant = "8k" in model_path.lower()
        if is_8k_variant:
            model.config.max_sequence_length = 8192
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "internlm-chat" conversation template."""
        template_name = "internlm-chat"
        return get_conv_template(template_name)
class StarChatAdapter(BaseModelAdapter):
    """Adapter for StarChat models (e.g. HuggingFaceH4/starchat-beta)."""

    def match(self, model_path: str):
        """Report whether "starchat" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "starchat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "starchat" conversation template."""
        template_name = "starchat"
        return get_conv_template(template_name)
class MistralAdapter(BaseModelAdapter):
    """Adapter for Mistral AI models."""

    def match(self, model_path: str):
        """Report whether "mistral" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "mistral" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy token ids onto the model config.

        The model config's EOS and pad ids are overwritten with the
        tokenizer's values. Returns a ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "mistral" conversation template."""
        template_name = "mistral"
        return get_conv_template(template_name)
class Llama2Adapter(BaseModelAdapter):
    """Adapter for Llama-2 models (e.g. meta-llama/Llama-2-7b-hf)."""

    def match(self, model_path: str):
        """Report whether "llama-2" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "llama-2" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the base adapter, then copy token ids onto the model config.

        The model config's EOS and pad ids are overwritten with the
        tokenizer's values. Returns a ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-2" conversation template."""
        template_name = "llama-2"
        return get_conv_template(template_name)
class CuteGPTAdapter(BaseModelAdapter):
    """The model adapter for CuteGPT"""

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "cutegpt"."""
        return "cutegpt" in model_path.lower()

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the Llama tokenizer and causal LM and set up the end token.

        Args:
            model_path: Path or identifier passed to both ``from_pretrained``
                calls.
            from_pretrained_kwargs: Keyword arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained``. Its optional
                ``"revision"`` entry (default ``"main"``) is also applied
                to the tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple. The tokenizer's EOS id is the id
            of the "<end>" token, and the model config uses that id for both
            EOS and padding.
        """
        # Load the tokenizer from the same revision as the weights. It used
        # to ignore the requested revision, so the two could diverge.
        revision = from_pretrained_kwargs.get("revision", "main")
        tokenizer = LlamaTokenizer.from_pretrained(model_path, revision=revision)
        model = AutoModelForCausalLM.from_pretrained(
            model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs
        )
        # "<end>" is used as the end-of-sequence marker for this model.
        tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<end>")
        model.config.eos_token_id = tokenizer.eos_token_id
        # Padding deliberately reuses the EOS id, as in the original code.
        model.config.pad_token_id = tokenizer.eos_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "cutegpt" conversation template."""
        return get_conv_template("cutegpt")
class OpenOrcaAdapter(BaseModelAdapter):
    """Adapter for Open-Orca models, which do not all share one prompt template.

    Examples: Open-Orca/OpenOrcaxOpenChat-Preview2-13B and
    Open-Orca/Mistral-7B-OpenOrca.

    - `OpenOrcaxOpenChat-Preview2-13B` uses their "OpenChat Llama2 V1" prompt
      template, see
      https://huggingface.co/Open-Orca/OpenOrcaxOpenChat-Preview2-13B#prompt-template
    - `Mistral-7B-OpenOrca` uses OpenAI's Chat Markup Language (ChatML,
      https://github.com/openai/openai-python/blob/main/chatml.md), with
      <|im_start|> and <|im_end|> tokens added to support it, see
      https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca#prompt-template
    """

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "openorca" occurs in the lower-cased path."""
        # Any path containing "mistral-7b-openorca" also contains "openorca",
        # so this single substring test accepts exactly the same paths.
        lowered = model_path.lower()
        return "openorca" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM in eval mode.

        Returns a ``(model, tokenizer)`` tuple. The tokenizer honours
        ``use_fast_tokenizer`` and the "revision" entry of the kwargs
        ("main" when absent).
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        model = model.eval()
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return "mistral-7b-openorca" for that model, otherwise "open-orca"."""
        is_mistral_variant = "mistral-7b-openorca" in model_path.lower()
        template_name = "mistral-7b-openorca" if is_mistral_variant else "open-orca"
        return get_conv_template(template_name)
class WizardCoderAdapter(BaseModelAdapter):
    """Adapter for WizardCoder (e.g. WizardLM/WizardCoder-Python-34B-V1.0)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "wizardcoder" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "wizardcoder" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        # The original note says WizardCoder uses the same prompt as Alpaca:
        # https://github.com/nlpxucan/WizardLM/blob/main/WizardCoder/src/inference_wizardcoder.py#L60
        template_name = "alpaca"
        return get_conv_template(template_name)
class QwenChatAdapter(BaseModelAdapter):
    """The model adapter for Qwen/Qwen-7B-Chat

    To run this model, you need to ensure additional flash attention installation:
    ``` bash
    git clone https://github.com/Dao-AILab/flash-attention
    cd flash-attention && pip install .
    pip install csrc/layer_norm
    pip install csrc/rotary
    ```

    Since 2.0, the following renames happened:
    - `flash_attn_unpadded_func` -> `flash_attn_varlen_func`
    - `flash_attn_unpadded_qkvpacked_func` -> `flash_attn_varlen_qkvpacked_func`
    - `flash_attn_unpadded_kvpacked_func` -> `flash_attn_varlen_kvpacked_func`
    You may need to revise the code in: https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/modeling_qwen.py#L69
    to from flash_attn.flash_attn_interface import flash_attn_varlen_func as flash_attn_unpadded_func
    """

    def match(self, model_path: str):
        """Return True when the lower-cased path contains "qwen"."""
        return "qwen" in model_path.lower()

    def float_set(self, config, option):
        """Set exactly one of ``config.bf16``/``fp16``/``fp32`` to True.

        Args:
            config: Object whose ``bf16``, ``fp16`` and ``fp32`` attributes
                are overwritten.
            option: One of "bf16", "fp16" or "fp32". Any other value leaves
                all three flags False and prints a message.
        """
        config.bf16 = False
        config.fp16 = False
        config.fp32 = False
        if option in ("bf16", "fp16", "fp32"):
            setattr(config, option, True)
        else:
            print("Invalid option. Please choose one from 'bf16', 'fp16' and 'fp32'.")

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the Qwen model and tokenizer and align their special-token ids.

        Args:
            model_path: Path or identifier passed to every ``from_pretrained``
                call.
            from_pretrained_kwargs: Keyword arguments forwarded to
                ``AutoModelForCausalLM.from_pretrained``. Its optional
                ``"revision"`` entry (default ``"main"``) is also applied
                to the config, generation config and tokenizer.

        Returns:
            A ``(model, tokenizer)`` tuple with the model in eval mode.
        """
        from transformers.generation import GenerationConfig

        # Every artifact is loaded from the same revision. The config and
        # generation config used to ignore it, so their token ids could come
        # from a different revision than the weights and tokenizer.
        revision = from_pretrained_kwargs.get("revision", "main")
        config = AutoConfig.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=revision,
        )
        # NOTE: if you use the old version of model file, please remove the comments below
        # config.use_flash_attn = False
        self.float_set(config, "fp16")
        generation_config = GenerationConfig.from_pretrained(
            model_path, trust_remote_code=True, revision=revision
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            config=config,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        ).eval()
        # With dynamic NTK enabled the advertised context length is raised.
        if hasattr(model.config, "use_dynamic_ntk") and model.config.use_dynamic_ntk:
            model.config.max_sequence_length = 16384
        tokenizer = AutoTokenizer.from_pretrained(
            model_path, trust_remote_code=True, revision=revision
        )
        # The tokenizer takes EOS/BOS from the model config and the pad id
        # from the generation config; the same ids are then written back to
        # the loaded model's config so both sides agree.
        tokenizer.eos_token_id = config.eos_token_id
        tokenizer.bos_token_id = config.bos_token_id
        tokenizer.pad_token_id = generation_config.pad_token_id
        model.config.eos_token_id = tokenizer.eos_token_id
        model.config.bos_token_id = tokenizer.bos_token_id
        model.config.pad_token_id = tokenizer.pad_token_id

        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "qwen-7b-chat" conversation template."""
        return get_conv_template("qwen-7b-chat")
class BGEAdapter(BaseModelAdapter):
    """Adapter for BGE models (e.g. BAAI/bge-large-en-v1.5)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "bge" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "bge" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model with ``AutoModel`` and then its tokenizer.

        When the model config has ``max_position_embeddings`` and the
        tokenizer has ``model_max_length``, the config's
        ``max_sequence_length`` is set to the smaller of the two. Returns a
        ``(model, tokenizer)`` tuple.
        """
        model = AutoModel.from_pretrained(model_path, **from_pretrained_kwargs)
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model_config = model.config
        has_both_limits = hasattr(
            model_config, "max_position_embeddings"
        ) and hasattr(tokenizer, "model_max_length")
        if has_both_limits:
            model_config.max_sequence_length = min(
                model_config.max_position_embeddings, tokenizer.model_max_length
            )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "one_shot" conversation template."""
        template_name = "one_shot"
        return get_conv_template(template_name)
class E5Adapter(BaseModelAdapter):
    """Adapter for E5 models (e.g. intfloat/e5-large-v2)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match paths containing "e5-" but not "megrez" (case-insensitive)."""
        lowered = model_path.lower()
        if "e5-" not in lowered:
            return False
        return "megrez" not in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model with ``AutoModel`` and then its tokenizer.

        When the model config has ``max_position_embeddings`` and the
        tokenizer has ``model_max_length``, the config's
        ``max_sequence_length`` is set to the smaller of the two. Returns a
        ``(model, tokenizer)`` tuple.
        """
        model = AutoModel.from_pretrained(model_path, **from_pretrained_kwargs)
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model_config = model.config
        has_both_limits = hasattr(
            model_config, "max_position_embeddings"
        ) and hasattr(tokenizer, "model_max_length")
        if has_both_limits:
            model_config.max_sequence_length = min(
                model_config.max_position_embeddings, tokenizer.model_max_length
            )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "one_shot" conversation template."""
        template_name = "one_shot"
        return get_conv_template(template_name)
class AquilaChatAdapter(BaseModelAdapter):
    """Adapter for BAAI/Aquila chat models.

    Listed as supported:
    - BAAI/AquilaChat-7B
    - BAAI/AquilaChat2-7B
    - BAAI/AquilaChat2-34B
    """

    def match(self, model_path: str):
        """Report whether "aquila" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "aquila" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the causal LM in eval mode, then the tokenizer.

        Both loads trust remote code. Returns a ``(model, tokenizer)`` tuple.
        """
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            **from_pretrained_kwargs,
        ).eval()
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick the template from the model generation and variant in the path.

        Paths without "aquilachat2" get "aquila-chat". AquilaChat2 paths get
        "aquila" for 16k variants, "aquila-legacy" for 34b, and "aquila-v1"
        otherwise; the 16k check takes precedence over the 34b check.
        """
        lowered = model_path.lower()
        # See: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L347
        if "aquilachat2" not in lowered:
            return get_conv_template("aquila-chat")
        if "16k" in lowered:
            return get_conv_template("aquila")
        if "34b" in lowered:
            return get_conv_template("aquila-legacy")
        return get_conv_template("aquila-v1")
class Lamma2ChineseAdapter(BaseModelAdapter):
    """Adapter for the FlagAlpha/LLama2-Chinese sft models."""

    def match(self, model_path: str):
        """Report whether "llama2-chinese" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "llama2-chinese" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM, both with remote code trusted.

        Returns a ``(model, tokenizer)`` tuple. The tokenizer uses the
        "revision" entry of the kwargs ("main" when absent).
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama2-chinese" conversation template."""
        template_name = "llama2-chinese"
        return get_conv_template(template_name)
class VigogneAdapter(BaseModelAdapter):
    """Adapter for Vigogne models (e.g. bofenghuang/vigogne-2-7b-chat)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match paths containing "vigogne" or "vigostral", ignoring case."""
        found = re.search(r"vigogne|vigostral", model_path, re.I)
        return found is not None

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM in eval mode.

        Both loads trust remote code; the tokenizer honours
        ``use_fast_tokenizer``. Returns a ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        model = model.eval()
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Pick the instruct or chat template from the path.

        Non-chat paths get "vigogne_instruct". Chat paths get
        "vigogne_chat_v3" when they mention "vigostral", otherwise
        "vigogne_chat_v2".
        """
        lowered = model_path.lower()
        if "chat" not in lowered:
            return get_conv_template("vigogne_instruct")
        if "vigostral" in lowered:
            return get_conv_template("vigogne_chat_v3")
        return get_conv_template("vigogne_chat_v2")
class OpenLLaMaOpenInstructAdapter(BaseModelAdapter):
    """Adapter for OpenLLaMa-Open-Instruct (e.g. VMware/open-llama-7b-open-instruct)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Require both "open-llama" and "open-instruct" in the lower-cased path."""
        lowered = model_path.lower()
        if "open-llama" not in lowered:
            return False
        return "open-instruct" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the tokenizer, then the causal LM in eval mode.

        Both loads trust remote code; the tokenizer honours
        ``use_fast_tokenizer``. Returns a ``(model, tokenizer)`` tuple.
        """
        tokenizer = AutoTokenizer.from_pretrained(
            model_path,
            use_fast=self.use_fast_tokenizer,
            trust_remote_code=True,
            revision=from_pretrained_kwargs.get("revision", "main"),
        )
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            **from_pretrained_kwargs,
        )
        model = model.eval()
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "alpaca" conversation template."""
        template_name = "alpaca"
        return get_conv_template(template_name)
class CodeLlamaAdapter(BaseModelAdapter):
    """Adapter for CodeLlama models (e.g. codellama/CodeLlama-34b-hf)."""

    def match(self, model_path: str):
        """Report whether "codellama" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "codellama" in lowered

    def load_model(self, model_path: str, from_pretrained_kwargs: dict):
        """Load via the parent adapter, then copy token ids onto the model config.

        The model config's EOS and pad ids are overwritten with the
        tokenizer's values. Returns a ``(model, tokenizer)`` tuple.
        """
        loaded = super().load_model(model_path, from_pretrained_kwargs)
        model, tokenizer = loaded
        model_config = model.config
        model_config.eos_token_id = tokenizer.eos_token_id
        model_config.pad_token_id = tokenizer.pad_token_id
        return model, tokenizer

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "llama-2" conversation template."""
        template_name = "llama-2"
        return get_conv_template(template_name)
class PhindCodeLlamaAdapter(CodeLlamaAdapter):
    """Adapter for Phind-CodeLlama (e.g. Phind/Phind-CodeLlama-34B-v2).

    Only matching and the default template are overridden here; everything
    else comes from CodeLlamaAdapter.
    """

    def match(self, model_path: str):
        """Report whether "phind-codellama-" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "phind-codellama-" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "phind" conversation template."""
        template_name = "phind"
        return get_conv_template(template_name)
class Llama2ChangAdapter(Llama2Adapter):
    """Adapter for Llama2-ko-chang (e.g. lcw99/llama2-ko-chang-instruct-chat).

    Only matching and the default template are overridden here; everything
    else comes from Llama2Adapter.
    """

    def match(self, model_path: str):
        """Report whether "llama2-ko-chang" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "llama2-ko-chang" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "polyglot_changgpt" conversation template."""
        template_name = "polyglot_changgpt"
        return get_conv_template(template_name)
class ZephyrAdapter(BaseModelAdapter):
    """Adapter for Zephyr models (e.g. HuggingFaceH4/zephyr-7b-alpha)."""

    def match(self, model_path: str):
        """Report whether "zephyr" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "zephyr" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "zephyr" conversation template."""
        template_name = "zephyr"
        return get_conv_template(template_name)
class XwinLMAdapter(BaseModelAdapter):
    """Adapter for the Xwin-LM V0.1 and V0.2 series (e.g. Xwin-LM/Xwin-LM-70B-V0.1)."""

    # use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "xwin-lm" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "xwin-lm" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "vicuna_v1.1" conversation template."""
        template_name = "vicuna_v1.1"
        return get_conv_template(template_name)
class LemurAdapter(BaseModelAdapter):
    """Adapter for Lemur chat models (e.g. OpenLemur/lemur-70b-chat-v1)."""

    use_fast_tokenizer = False

    def match(self, model_path: str):
        """Report whether "lemur-70b-chat" occurs in the lower-cased path."""
        lowered = model_path.lower()
        return "lemur-70b-chat" in lowered

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "lemur-70b-chat" conversation template."""
        template_name = "lemur-70b-chat"
        return get_conv_template(template_name)
class PygmalionAdapter(BaseModelAdapter):
    """Adapter for the Pygmalion/Metharme series (e.g. PygmalionAI/mythalion-13b)."""

    # use_fast_tokenizer = False

    def match(self, model_path: str):
        """Match paths mentioning "pygmalion", "mythalion" or "metharme"."""
        lowered = model_path.lower()
        found = re.search(r"pygmalion|mythalion|metharme", lowered, re.I)
        return found is not None

    def get_default_conv_template(self, model_path: str) -> Conversation:
        """Return the "metharme" conversation template."""
        template_name = "metharme"
        return get_conv_template(template_name)
# Note: the registration order matters.
# Adapters are registered top to bottom, and one registered earlier has a
# higher matching priority. BaseModelAdapter comes last so it is tried only
# after all the others.
for _adapter_cls in (
    PeftModelAdapter,
    DeepseekChatAdapter,
    VicunaAdapter,
    AiroborosAdapter,
    LongChatAdapter,
    GoogleT5Adapter,
    KoalaAdapter,
    AlpacaAdapter,
    ChatGLMAdapter,
    CodeGeexAdapter,
    DollyV2Adapter,
    OasstPythiaAdapter,
    OasstLLaMAAdapter,
    OpenChat35Adapter,
    StableLMAdapter,
    BaizeAdapter,
    RwkvAdapter,
    OpenBuddyAdapter,
    PhoenixAdapter,
    BardAdapter,
    PaLM2Adapter,
    ChatGPTAdapter,
    AzureOpenAIAdapter,
    ClaudeAdapter,
    MPTAdapter,
    BiLLaAdapter,
    RedPajamaINCITEAdapter,
    H2OGPTAdapter,
    RobinAdapter,
    SnoozyAdapter,
    WizardLMAdapter,
    ManticoreAdapter,
    GuanacoAdapter,
    CamelAdapter,
    ChangGPTAdapter,
    TuluAdapter,
    FalconChatAdapter,
    FalconAdapter,
    TigerBotAdapter,
    BaichuanAdapter,
    XGenAdapter,
    NousHermesAdapter,
    PythiaAdapter,
    InternLMChatAdapter,
    StarChatAdapter,
    Llama2Adapter,
    CuteGPTAdapter,
    OpenOrcaAdapter,
    MistralAdapter,
    WizardCoderAdapter,
    QwenChatAdapter,
    AquilaChatAdapter,
    BGEAdapter,
    E5Adapter,
    Lamma2ChineseAdapter,
    VigogneAdapter,
    OpenLLaMaOpenInstructAdapter,
    ReaLMAdapter,
    PhindCodeLlamaAdapter,
    CodeLlamaAdapter,
    Llama2ChangAdapter,
    ZephyrAdapter,
    XwinLMAdapter,
    LemurAdapter,
    PygmalionAdapter,
    Zhinao360Adapter,
    # After all adapters, try the default base adapter.
    BaseModelAdapter,
):
    register_model_adapter(_adapter_cls)
# Drop the loop variable so no extra module-level name is left behind.
del _adapter_cls