import json
from copy import deepcopy
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Literal, Optional, Type, Union
import torch
from typing_extensions import Self
import tsai_gpt.model
from tsai_gpt.utils import find_multiple
@dataclass
class Config:
name: str = ""
hf_config: dict = field(default_factory=dict)
block_size: int = 4096
vocab_size: int = 50254
padding_multiple: int = 512
padded_vocab_size: Optional[int] = None
n_layer: int = 16
n_head: int = 32
n_embd: int = 4096
rotary_percentage: float = 0.25
parallel_residual: bool = True
bias: bool = True
lm_head_bias: bool = False
# to use multi-head attention (MHA), set this to `n_head` (default)
# to use multi-query attention (MQA), set this to 1
# to use grouped-query attention (GQA), set this to a value in between
# Example with `n_head=4`
    # ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    # │ v ││ v ││ v ││ v │     │ v │    │ v │             │ v │
    # └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    #   │    │    │    │         │        │                 │
    # ┌───┐┌───┐┌───┐┌───┐     ┌───┐    ┌───┐             ┌───┐
    # │ k ││ k ││ k ││ k │     │ k │    │ k │             │ k │
    # └───┘└───┘└───┘└───┘     └───┘    └───┘             └───┘
    #   │    │    │    │      ┌──┴──┐  ┌──┴──┐      ┌────┬──┴─┬────┐
    # ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐  ┌───┐┌───┐┌───┐┌───┐
    # │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │  │ q ││ q ││ q ││ q │
    # └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘  └───┘└───┘└───┘└───┘
    # ◀──────────────────▶  ◀──────────────────▶  ◀──────────────────▶
    #         MHA                    GQA                   MQA
    #   n_query_groups=4       n_query_groups=2      n_query_groups=1
#
# credit https://arxiv.org/pdf/2305.13245.pdf
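    # e.g. `falcon-40b` below uses GQA (`n_head=128`, `n_query_groups=8`),
    # `falcon-7b` uses MQA (`n_query_groups=1`), and the Llama 2 configs other
    # than 70B leave this at the default (MHA)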
n_query_groups: Optional[int] = None
shared_attention_norm: bool = False
_norm_class: Literal["LayerNorm", "RMSNorm"] = "LayerNorm"
norm_eps: float = 1e-5
_mlp_class: Literal["GptNeoxMLP", "LLaMAMLP"] = "GptNeoxMLP"
gelu_approximate: str = "none"
intermediate_size: Optional[int] = None
rope_condense_ratio: int = 1
rope_base: int = 10000
def __post_init__(self):
if not self.name:
self.name = self.hf_config.get("name", self.name)
assert self.n_embd % self.n_head == 0
self.head_size = self.n_embd // self.n_head
        # pad the vocab size up to the nearest multiple of `padding_multiple` so it is efficient on hardware
if self.padded_vocab_size is None:
self.padded_vocab_size = find_multiple(self.vocab_size, self.padding_multiple)
else:
# vocab size shouldn't be larger than padded vocab size
self.vocab_size = min(self.vocab_size, self.padded_vocab_size)
# compute the number of query groups
if self.n_query_groups is not None:
assert self.n_head % self.n_query_groups == 0
else:
self.n_query_groups = self.n_head
# compute the intermediate size for MLP if not set
if self.intermediate_size is None:
if self._mlp_class == "LLaMAMLP":
raise ValueError("The config needs to set the `intermediate_size`")
self.intermediate_size = 4 * self.n_embd
self.rope_n_elem = int(self.rotary_percentage * self.head_size)
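        # e.g. with the class defaults: head_size = 4096 // 32 = 128 and
        # rope_n_elem = int(0.25 * 128) = 32 rotary dimensions per head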
@classmethod
def from_name(cls, name: str, **kwargs: Any) -> Self:
        if name not in name_to_config:
            # fall back to searching through every `config["hf_config"]["name"]`
            try:
                conf_dict = next(config for config in configs if name == config["hf_config"]["name"])
            except StopIteration:
                raise ValueError(f"{name!r} is not a supported config name") from None
        else:
            conf_dict = name_to_config[name]
conf_dict = conf_dict.copy()
if "condense_ratio" in kwargs: # legacy name
kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
conf_dict.update(kwargs)
return cls(**conf_dict)
@classmethod
def from_json(cls, path: Union[str, Path], **kwargs: Any) -> Self:
with open(path, encoding="utf-8") as fp:
json_kwargs = json.load(fp)
if "condense_ratio" in json_kwargs: # legacy name
json_kwargs["rope_condense_ratio"] = json_kwargs.pop("condense_ratio")
if "condense_ratio" in kwargs: # legacy name
kwargs["rope_condense_ratio"] = kwargs.pop("condense_ratio")
if "org" in json_kwargs: # legacy name
json_kwargs["hf_config"] = {"name": json_kwargs["name"], "org": json_kwargs.pop("org")}
if "org" in kwargs: # legacy name
kwargs["hf_config"] = {"name": kwargs.get("name", json_kwargs["name"]), "org": kwargs.pop("org")}
json_kwargs.update(kwargs)
return cls(**json_kwargs)
@property
def mlp_class(self) -> Type:
        # `_mlp_class` is stored as a string rather than the class itself so the config stays JSON-serializable
return getattr(tsai_gpt.model, self._mlp_class)
@property
def norm_class(self) -> Type:
        # `_norm_class` is stored as a string rather than the class itself so the config stays JSON-serializable
if self._norm_class == "RMSNorm":
from tsai_gpt.rmsnorm import RMSNorm
return RMSNorm
return getattr(torch.nn, self._norm_class)
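

# A minimal usage sketch (assuming the "pythia-70m" entry registered below):
#   config = Config.from_name("pythia-70m")
#   config.head_size          # 512 // 8 = 64
#   config.padded_vocab_size  # find_multiple(50254, 128) = 50304
#   config.norm_class         # torch.nn.LayerNorm, resolved from `_norm_class`
# `Config.from_json` behaves the same given a saved config file (e.g. a
# hypothetical "out/lit_config.json") and remaps legacy keys such as
# `condense_ratio` and `org` on the way in.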
########################
# Stability AI StableLM
########################
configs = [
# https://huggingface.co/stabilityai/stablelm-base-alpha-3b/blob/main/config.json
dict(name="stablelm-base-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-base-alpha-3b")),
# https://huggingface.co/stabilityai/stablelm-base-alpha-7b/blob/main/config.json
dict(
name="stablelm-base-alpha-7b",
hf_config=dict(org="stabilityai", name="stablelm-base-alpha-7b"),
n_head=48,
n_embd=6144,
padding_multiple=256,
),
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-3b/blob/main/config.json
dict(name="stablelm-tuned-alpha-3b", hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-3b"), n_head=32),
# https://huggingface.co/stabilityai/stablelm-tuned-alpha-7b/blob/main/config.json
dict(
name="stablelm-tuned-alpha-7b",
hf_config=dict(org="stabilityai", name="stablelm-tuned-alpha-7b"),
n_head=48,
n_embd=6144,
padding_multiple=256,
),
]
####################
# EleutherAI Pythia
####################
pythia = [
# https://huggingface.co/EleutherAI/pythia-70m/blob/main/config.json
dict(
name="pythia-70m",
hf_config=dict(org="EleutherAI", name="pythia-70m"),
block_size=2048,
n_layer=6,
n_embd=512,
n_head=8,
padding_multiple=128,
),
# https://huggingface.co/EleutherAI/pythia-160m/blob/main/config.json
dict(
name="pythia-160m",
hf_config=dict(org="EleutherAI", name="pythia-160m"),
block_size=2048,
n_layer=12,
n_embd=768,
n_head=12,
padding_multiple=128,
),
# https://huggingface.co/EleutherAI/pythia-410m/blob/main/config.json
dict(
name="pythia-410m",
hf_config=dict(org="EleutherAI", name="pythia-410m"),
block_size=2048,
n_layer=24,
n_embd=1024,
n_head=16,
padding_multiple=128,
),
# https://huggingface.co/EleutherAI/pythia-1b/blob/main/config.json
dict(
name="pythia-1b",
hf_config=dict(org="EleutherAI", name="pythia-1b"),
block_size=2048,
n_embd=2048,
n_head=8,
padding_multiple=128,
),
# https://huggingface.co/EleutherAI/pythia-1.4b/blob/main/config.json
dict(
name="pythia-1.4b",
hf_config=dict(org="EleutherAI", name="pythia-1.4b"),
block_size=2048,
n_layer=24,
n_embd=2048,
n_head=16,
padding_multiple=128,
),
# https://huggingface.co/EleutherAI/pythia-2.8b/blob/main/config.json
dict(
name="pythia-2.8b",
hf_config=dict(org="EleutherAI", name="pythia-2.8b"),
block_size=2048,
n_layer=32,
n_embd=2560,
padding_multiple=128,
),
# https://huggingface.co/EleutherAI/pythia-6.9b/blob/main/config.json
dict(
name="pythia-6.9b",
hf_config=dict(org="EleutherAI", name="pythia-6.9b"),
block_size=2048,
n_layer=32,
padding_multiple=256,
),
# https://huggingface.co/EleutherAI/pythia-12b/blob/main/config.json
dict(
name="pythia-12b",
hf_config=dict(org="EleutherAI", name="pythia-12b"),
block_size=2048,
n_layer=36,
n_embd=5120,
n_head=40,
),
]
configs.extend(pythia)
for c in pythia:
    copy = deepcopy(c)  # deep copy so mutating the nested `hf_config` dict does not alter the base entry
copy["name"] = f"{c['name']}-deduped"
copy["hf_config"]["name"] = f"{c['hf_config']['name']}-deduped"
configs.append(copy)
####################################
# togethercomputer RedPajama INCITE
####################################
redpajama_incite = [
# https://huggingface.co/togethercomputer/RedPajama-INCITE-Base-3B-v1/blob/main/config.json
dict(
name="RedPajama-INCITE-{}-3B-v1",
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-3B-v1"),
block_size=2048,
n_layer=32,
n_embd=2560,
padding_multiple=256,
rotary_percentage=1.0,
parallel_residual=False,
),
# https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Base/blob/main/config.json
dict(
name="RedPajama-INCITE-7B-{}",
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-7B-{}"),
block_size=2048,
n_layer=32,
padding_multiple=256,
rotary_percentage=1.0,
parallel_residual=False,
),
    # this repo redirects to the checkpoint above; kept for users who already downloaded the old weights
dict(
name="RedPajama-INCITE-{}-7B-v0.1",
hf_config=dict(org="togethercomputer", name="RedPajama-INCITE-{}-7B-v0.1"),
block_size=2048,
n_layer=32,
padding_multiple=256,
rotary_percentage=1.0,
parallel_residual=False,
),
]
for c in redpajama_incite:
for kind in ("Base", "Chat", "Instruct"):
        copy = deepcopy(c)  # deep copy so each kind gets its own `hf_config` dict
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)
#################
# TII UAE Falcon
#################
falcon = [
# https://huggingface.co/tiiuae/falcon-7b/blob/main/config.json
dict(
name="falcon-7b{}",
hf_config=dict(org="tiiuae", name="falcon-7b{}"),
block_size=2048,
vocab_size=65024,
padded_vocab_size=65024,
n_layer=32,
n_head=71,
n_embd=4544,
rotary_percentage=1.0,
n_query_groups=1,
bias=False,
        # not present in the HF config file, but used by the original model implementation (only for this checkpoint)
shared_attention_norm=True,
),
# https://huggingface.co/tiiuae/falcon-40b/blob/main/config.json
dict(
name="falcon-40b{}",
hf_config=dict(org="tiiuae", name="falcon-40b{}"),
block_size=2048,
vocab_size=65024,
padded_vocab_size=65024,
n_layer=60,
n_head=128,
n_embd=8192,
rotary_percentage=1.0,
n_query_groups=8,
bias=False,
),
]
for c in falcon:
for kind in ("", "-instruct"):
        copy = deepcopy(c)  # deep copy so each kind gets its own `hf_config` dict
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)
# https://huggingface.co/tiiuae/falcon-180b/blob/main/config.json
falcon180b = dict(
name="falcon-180B{}",
hf_config=dict(org="tiiuae", name="falcon-180B{}"),
block_size=2048,
vocab_size=65024,
padded_vocab_size=65024,
n_layer=80,
n_head=232,
n_embd=14848,
rotary_percentage=1.0,
n_query_groups=8,
bias=False,
)
for kind in ("", "-chat"):
    copy = deepcopy(falcon180b)  # deep copy so each kind gets its own `hf_config` dict
copy["name"] = falcon180b["name"].format(kind)
copy["hf_config"]["name"] = falcon180b["hf_config"]["name"].format(kind)
configs.append(copy)
#############################
# OpenLM Research Open LLaMA
#############################
open_LLaMA = [
# https://huggingface.co/openlm-research/open_llama_3b/blob/main/config.json
dict(
name="open_llama_3b",
hf_config=dict(org="openlm-research", name="open_llama_3b"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=26,
n_embd=3200,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=8640,
),
# https://huggingface.co/openlm-research/open_llama_7b/blob/main/config.json
dict(
name="open_llama_7b",
hf_config=dict(org="openlm-research", name="open_llama_7b"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
),
# https://huggingface.co/openlm-research/open_llama_13b/blob/main/config.json
dict(
name="open_llama_13b",
hf_config=dict(org="openlm-research", name="open_llama_13b"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
]
configs.extend(open_LLaMA)
###############
# LMSYS Vicuna
###############
vicuna = [
# https://huggingface.co/lmsys/vicuna-7b-v1.3/blob/main/config.json
dict(
name="vicuna-7b-v1.3",
hf_config=dict(org="lmsys", name="vicuna-7b-v1.3"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
),
# https://huggingface.co/lmsys/vicuna-13b-v1.3/blob/main/config.json
dict(
name="vicuna-13b-v1.3",
hf_config=dict(org="lmsys", name="vicuna-13b-v1.3"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/lmsys/vicuna-33b-v1.3/blob/main/config.json
dict(
name="vicuna-33b-v1.3",
hf_config=dict(org="lmsys", name="vicuna-33b-v1.3"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=60,
n_head=52,
n_embd=6656,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=17920,
),
# https://huggingface.co/lmsys/vicuna-7b-v1.5/blob/main/config.json
dict(
name="vicuna-7b-v1.5",
hf_config=dict(org="lmsys", name="vicuna-7b-v1.5"),
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=11008,
),
# https://huggingface.co/lmsys/vicuna-7b-v1.5-16k/blob/main/config.json
dict(
name="vicuna-7b-v1.5-16k",
hf_config=dict(org="lmsys", name="vicuna-7b-v1.5-16k"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=11008,
rope_condense_ratio=4,
),
# https://huggingface.co/lmsys/vicuna-13b-v1.5/blob/main/config.json
dict(
name="vicuna-13b-v1.5",
hf_config=dict(org="lmsys", name="vicuna-13b-v1.5"),
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/lmsys/vicuna-13b-v1.5-16k/blob/main/config.json
dict(
name="vicuna-13b-v1.5-16k",
hf_config=dict(org="lmsys", name="vicuna-13b-v1.5-16k"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=13824,
rope_condense_ratio=4,
),
]
configs.extend(vicuna)
#################
# LMSYS LongChat
#################
long_chat = [
# https://huggingface.co/lmsys/longchat-7b-16k/blob/main/config.json
dict(
name="longchat-7b-16k",
hf_config=dict(org="lmsys", name="longchat-7b-16k"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
rope_condense_ratio=8,
),
# https://huggingface.co/lmsys/longchat-13b-16k/blob/main/config.json
dict(
name="longchat-13b-16k",
hf_config=dict(org="lmsys", name="longchat-13b-16k"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
rope_condense_ratio=8,
),
]
configs.extend(long_chat)
######################
# NousResearch Hermes
######################
nous_research = [
# https://huggingface.co/NousResearch/Nous-Hermes-llama-2-7b/blob/main/config.json
dict(
name="Nous-Hermes-llama-2-7b",
hf_config=dict(org="NousResearch", name="Nous-Hermes-llama-2-7b"),
padded_vocab_size=32000,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
),
# https://huggingface.co/NousResearch/Nous-Hermes-13B/blob/main/config.json
dict(
name="Nous-Hermes-13b",
hf_config=dict(org="NousResearch", name="Nous-Hermes-13b"),
block_size=2048,
vocab_size=32000,
padded_vocab_size=32001,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-6,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/NousResearch/Nous-Hermes-Llama2-13b
dict(
name="Nous-Hermes-Llama2-13b",
hf_config=dict(org="NousResearch", name="Nous-Hermes-Llama2-13b"),
vocab_size=32000,
padded_vocab_size=32032,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
]
configs.extend(nous_research)
###############
# Meta LLaMA 2
###############
llama_2 = [
# https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/main/config.json
dict(
name="Llama-2-7b{}-hf",
hf_config=dict(org="meta-llama", name="Llama-2-7b{}-hf"),
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=11008,
),
# https://huggingface.co/meta-llama/Llama-2-13b-hf/blob/main/config.json
dict(
name="Llama-2-13b{}-hf",
hf_config=dict(org="meta-llama", name="Llama-2-13b{}-hf"),
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/meta-llama/Llama-2-70b-hf/blob/main/config.json
dict(
name="Llama-2-70b{}-hf",
hf_config=dict(org="meta-llama", name="Llama-2-70b{}-hf"),
vocab_size=32000,
padding_multiple=64,
n_layer=80,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=28672,
),
]
for c in llama_2:
for kind in ("", "-chat"):
        copy = deepcopy(c)  # deep copy so each kind gets its own `hf_config` dict
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)
##########################
# Stability AI FreeWilly2
##########################
freewilly_2 = [
# https://huggingface.co/stabilityai/FreeWilly2/blob/main/config.json
dict(
name="FreeWilly2",
hf_config=dict(org="stabilityai", name="FreeWilly2"),
vocab_size=32000,
padding_multiple=64,
n_layer=80,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=28672,
)
]
configs.extend(freewilly_2)
##################
# Meta Code Llama
##################
code_llama = [
# https://huggingface.co/codellama/CodeLlama-7b-hf/blob/main/config.json
dict(
name="CodeLlama-7b-hf",
hf_config=dict(org="codellama", name="CodeLlama-7b-hf"),
block_size=16384,
vocab_size=32016,
padding_multiple=16,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-13b-hf/blob/main/config.json
dict(
name="CodeLlama-13b-hf",
hf_config=dict(org="codellama", name="CodeLlama-13b-hf"),
block_size=16384,
vocab_size=32016,
padding_multiple=16,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-34b-hf/blob/main/config.json
dict(
name="CodeLlama-34b-hf",
hf_config=dict(org="codellama", name="CodeLlama-34b-hf"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=48,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=22016,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-7b-Python-hf/blob/main/config.json
dict(
name="CodeLlama-7b-Python-hf",
hf_config=dict(org="codellama", name="CodeLlama-7b-Python-hf"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-13b-Python-hf/blob/main/config.json
dict(
name="CodeLlama-13b-Python-hf",
hf_config=dict(org="codellama", name="CodeLlama-13b-Python-hf"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-34b-Python-hf/blob/main/config.json
dict(
name="CodeLlama-34b-Python-hf",
hf_config=dict(org="codellama", name="CodeLlama-34b-Python-hf"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=48,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=22016,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/tree/main/config.json
dict(
name="CodeLlama-7b-Instruct-hf",
hf_config=dict(org="codellama", name="CodeLlama-7b-Instruct-hf"),
block_size=16384,
vocab_size=32016,
padding_multiple=16,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf/blob/main/config.json
dict(
name="CodeLlama-13b-Instruct-hf",
hf_config=dict(org="codellama", name="CodeLlama-13b-Instruct-hf"),
block_size=2048,
vocab_size=32016,
padding_multiple=16,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
rope_base=1000000,
),
# https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf/blob/main/config.json
dict(
name="CodeLlama-34b-Instruct-hf",
hf_config=dict(org="codellama", name="CodeLlama-34b-Instruct-hf"),
block_size=16384,
vocab_size=32000,
padding_multiple=64,
n_layer=48,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=22016,
rope_base=1000000,
),
]
configs.extend(code_llama)
########################
# garage-bAInd Platypus
########################
platypus = [
# https://huggingface.co/garage-bAInd/Platypus-30B/blob/main/config.json
dict(
name="Platypus-30B",
hf_config=dict(org="garage-bAInd", name="Platypus-30B"),
block_size=2048,
padded_vocab_size=32000,
n_layer=60,
n_head=52,
n_embd=6656,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-06,
_mlp_class="LLaMAMLP",
intermediate_size=17920,
),
# https://huggingface.co/garage-bAInd/Platypus2-7B/blob/main/config.json
dict(
name="Platypus2-7B",
hf_config=dict(org="garage-bAInd", name="Platypus2-7B"),
padded_vocab_size=32000,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=11008,
),
# https://huggingface.co/garage-bAInd/Platypus2-13B/blob/main/config.json
dict(
name="Platypus2-13B",
hf_config=dict(org="garage-bAInd", name="Platypus2-13B"),
padded_vocab_size=32000,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/garage-bAInd/Platypus2-70B/blob/main/config.json
dict(
name="Platypus2-70B",
hf_config=dict(org="garage-bAInd", name="Platypus2-70B"),
padded_vocab_size=32000,
n_layer=80,
n_head=64,
n_embd=8192,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=28672,
),
# https://huggingface.co/garage-bAInd/Camel-Platypus2-13B/blob/main/config.json
dict(
name="Camel-Platypus2-13B",
hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-13B"),
padded_vocab_size=32000,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/garage-bAInd/Camel-Platypus2-70B/blob/main/config.json
dict(
name="Camel-Platypus2-70B",
hf_config=dict(org="garage-bAInd", name="Camel-Platypus2-70B"),
padded_vocab_size=32000,
n_layer=80,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=28672,
),
# https://huggingface.co/garage-bAInd/Stable-Platypus2-13B/blob/main/config.json
dict(
name="Stable-Platypus2-13B",
hf_config=dict(org="garage-bAInd", name="Stable-Platypus2-13B"),
padded_vocab_size=32000,
n_layer=40,
n_head=40,
n_embd=5120,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=13824,
),
# https://huggingface.co/garage-bAInd/Platypus2-70B-instruct/blob/main/config.json
dict(
name="Platypus2-70B-instruct",
hf_config=dict(org="garage-bAInd", name="Platypus2-70B-instruct"),
padded_vocab_size=32000,
n_layer=80,
n_head=64,
n_embd=8192,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=28672,
),
]
configs.extend(platypus)
##########################
# Stability AI StableCode
##########################
stablecode = [
# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b/blob/main/config.json
dict(
name="stablecode-completion-alpha-3b",
hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b"),
block_size=16384,
vocab_size=49152,
n_layer=32,
n_embd=2560,
),
# https://huggingface.co/stabilityai/stablecode-completion-alpha-3b-4k/blob/main/config.json
dict(
name="stablecode-completion-alpha-3b-4k",
hf_config=dict(org="stabilityai", name="stablecode-completion-alpha-3b-4k"),
vocab_size=49152,
n_layer=32,
n_embd=2560,
),
# https://huggingface.co/stabilityai/stablecode-instruct-alpha-3b/blob/main/config.json
dict(
name="stablecode-instruct-alpha-3b",
hf_config=dict(org="stabilityai", name="stablecode-instruct-alpha-3b"),
vocab_size=49152,
n_layer=32,
n_embd=2560,
),
]
configs.extend(stablecode)
##################################
# togethercomputer LLaMA-2-7B-32K
##################################
together_llama2_32k = [
# https://huggingface.co/togethercomputer/LLaMA-2-7B-32K/blob/main/config.json
dict(
name="LLaMA-2-7B-32K",
hf_config=dict(org="togethercomputer", name="LLaMA-2-7B-32K"),
vocab_size=32000,
padding_multiple=64,
n_layer=32,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
_mlp_class="LLaMAMLP",
intermediate_size=11008,
rope_condense_ratio=8,
)
]
configs.extend(together_llama2_32k)
################
# Microsoft Phi
################
phi = [
# https://huggingface.co/microsoft/phi-1_5/blob/main/config.json
dict(
name="phi-1_5",
hf_config=dict(org="microsoft", name="phi-1_5"),
vocab_size=50257,
padded_vocab_size=51200,
block_size=2048,
n_embd=2048,
n_layer=24,
rotary_percentage=0.5, # 32 / (n_embd / n_head) = 32 / 64
shared_attention_norm=True,
lm_head_bias=True,
gelu_approximate="tanh",
)
]
configs.extend(phi)
#############
# Mistral AI
#############
mistral = [
# https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
dict(
name="Mistral-7B-{}v0.1",
hf_config=dict(org="mistralai", name="Mistral-7B-{}v0.1"),
padded_vocab_size=32000,
block_size=4096, # should be 32768 but sliding window attention is not implemented
n_layer=32,
n_query_groups=8,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm",
norm_eps=1e-05,
_mlp_class="LLaMAMLP",
intermediate_size=14336,
)
]
for c in mistral:
for kind in ("", "Instruct-"):
        copy = deepcopy(c)  # deep copy so each kind gets its own `hf_config` dict
copy["name"] = c["name"].format(kind)
copy["hf_config"]["name"] = c["hf_config"]["name"].format(kind)
configs.append(copy)
############
# TinyLlama
############
tiny_llama = [
dict(
name="tiny-llama-1.1b",
hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
block_size=2048,
vocab_size=32000,
padding_multiple=64,
n_layer=22,
n_head=32,
n_embd=2048,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=5632,
n_query_groups=4,
),
dict(
name="tiny-llama-new",
hf_config=dict(org="PY007", name="TinyLlama-1.1B-intermediate-step-480k-1T"),
block_size=768,
vocab_size=32000,
padding_multiple=64,
n_layer=18,
n_head=32,
n_embd=1024,
rotary_percentage=1.0,
parallel_residual=False,
bias=False,
_norm_class="RMSNorm", # original TinyLlama uses FusedRMSNorm
norm_eps=1e-5,
_mlp_class="LLaMAMLP",
intermediate_size=5632,
n_query_groups=4,
),
]
configs.extend(tiny_llama)
name_to_config = {config["name"]: config for config in configs}
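

if __name__ == "__main__":
    # Smoke test (a minimal sketch): resolve one registered config by name and
    # print the fields derived in `__post_init__`.
    demo = Config.from_name("pythia-70m")
    print(demo.name, demo.head_size, demo.padded_vocab_size, demo.rope_n_elem)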