OneLLM / model /LLM /onellm.py
csuhan's picture
Upload folder using huggingface_hub
1a84a43
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the GNU General Public License version 3.
from typing import Optional, Tuple
from dataclasses import dataclass
import math
import functools
import copy
import torch
from torch import nn
import torch.nn.functional as F
import fairscale.nn.model_parallel.initialize as fs_init
from fairscale.nn.model_parallel.layers import (
ParallelEmbedding,
RowParallelLinear,
ColumnParallelLinear,
)
from ..components import RMSNorm
from flash_attn import flash_attn_func
import open_clip
default_linear_init = nn.init.xavier_uniform_
@dataclass
class ModelArgs:
dim: int = 512
n_layers: int = 8
n_heads: int = 8
vocab_size: int = -1 # defined later by tokenizer
multiple_of: int = 256 # make SwiGLU hidden layer size multiple of large power of 2
norm_eps: float = 1e-5
max_batch_size: int = 32
max_seq_len: int = 2048
def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)
[: (dim // 2)].float() / dim))
t = torch.arange(end, device=freqs.device) # type: ignore
freqs = torch.outer(t, freqs).float() # type: ignore
freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64
return freqs_cis
def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
ndim = x.ndim
assert 0 <= 1 < ndim
assert freqs_cis.shape == (x.shape[1], x.shape[-1])
shape = [d if i == 1 or i == ndim -
1 else 1 for i, d in enumerate(x.shape)]
return freqs_cis.view(*shape)
def apply_rotary_emb(
xq: torch.Tensor,
xk: torch.Tensor,
freqs_cis: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
freqs_cis = reshape_for_broadcast(freqs_cis, xq_)
xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
return xq_out.type_as(xq), xk_out.type_as(xk)
class Attention(nn.Module):
def __init__(self, args: ModelArgs):
super().__init__()
self.n_local_heads = args.n_heads // fs_init.get_model_parallel_world_size()
self.head_dim = args.dim // args.n_heads
self.wq = ColumnParallelLinear(
args.dim,
args.n_heads * self.head_dim,
bias=False,
gather_output=False,
init_method=default_linear_init,
)
self.wk = ColumnParallelLinear(
args.dim,
args.n_heads * self.head_dim,
bias=False,
gather_output=False,
init_method=default_linear_init,
)
self.wv = ColumnParallelLinear(
args.dim,
args.n_heads * self.head_dim,
bias=False,
gather_output=False,
init_method=default_linear_init,
)
self.wo = RowParallelLinear(
args.n_heads * self.head_dim,
args.dim,
bias=False,
input_is_parallel=True,
init_method=default_linear_init,
)
self.flash = True
self.k_cache, self.v_cache = None, None
def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], prompt=None):
bsz, seqlen, _ = x.shape
xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
xq = xq.view(bsz, seqlen, self.n_local_heads, self.head_dim)
xk = xk.view(bsz, seqlen, self.n_local_heads, self.head_dim)
xv = xv.view(bsz, seqlen, self.n_local_heads, self.head_dim)
if freqs_cis is not None:
xq, xk = apply_rotary_emb(xq, xk, freqs_cis=freqs_cis)
if self.k_cache is None or self.v_cache is None:
keys, values = xk, xv
else:
self.k_cache = self.k_cache.to(xk)
self.v_cache = self.v_cache.to(xv)
self.k_cache[:bsz, start_pos: start_pos + seqlen, :, :] = xk
self.v_cache[:bsz, start_pos: start_pos + seqlen, :, :] = xv
keys = self.k_cache[:bsz, :start_pos + seqlen]
values = self.v_cache[:bsz, :start_pos + seqlen]
output = flash_attn_func(
xq, keys, values, dropout_p=0.0, causal=mask is not None)
output = output.contiguous().view(bsz, seqlen, -1)
return self.wo(output)
def allocate_kv_cache(self, max_batch_size: int, max_seq_len: int) -> None:
kv_cache_shape = (max_batch_size, max_seq_len,
self.n_local_heads, self.head_dim)
if self.k_cache is None or self.k_cache.size() != kv_cache_shape:
self.k_cache = torch.empty(kv_cache_shape)
if self.v_cache is None or self.v_cache.size() != kv_cache_shape:
self.v_cache = torch.empty(kv_cache_shape)
def destroy_kv_cache(self) -> None:
self.k_cache, self.v_cache = None, None
class FeedForward(nn.Module):
def __init__(
self,
dim: int,
hidden_dim: int,
multiple_of: int,
):
super().__init__()
hidden_dim = int(2 * hidden_dim / 3)
hidden_dim = multiple_of * \
((hidden_dim + multiple_of - 1) // multiple_of)
self.w1 = ColumnParallelLinear(
dim, hidden_dim, bias=False, gather_output=False, init_method=default_linear_init,
)
self.w2 = RowParallelLinear(
hidden_dim, dim, bias=False, input_is_parallel=True, init_method=default_linear_init
)
self.w3 = ColumnParallelLinear(
dim, hidden_dim, bias=False, gather_output=False, init_method=default_linear_init
)
def _silu_gating(self, x, y):
return F.silu(x) * y
def forward(self, x):
return self.w2(self._silu_gating(self.w1(x), self.w3(x)))
class TransformerBlock(nn.Module):
def __init__(self, layer_id: int, args: ModelArgs):
super().__init__()
self.n_heads = args.n_heads
self.dim = args.dim
self.head_dim = args.dim // args.n_heads
self.attention = Attention(args)
self.feed_forward = FeedForward(
dim=args.dim, hidden_dim=4 * args.dim, multiple_of=args.multiple_of
)
self.layer_id = layer_id
self.attention_norm = RMSNorm(args.dim, eps=args.norm_eps)
self.ffn_norm = RMSNorm(args.dim, eps=args.norm_eps)
def _forward_ffn(self, h):
return h + self.feed_forward(self.ffn_norm(h))
def _forward_attention(self, x, start_pos, freqs_cis, mask, prompt):
return x + self.attention.forward(self.attention_norm(x), start_pos, freqs_cis, mask, prompt)
def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor], prompt=None):
h = self._forward_attention(x, start_pos, freqs_cis, mask, prompt)
out = self._forward_ffn(h)
return out
class Mlp(nn.Module):
""" MLP as used in Vision Transformer, MLP-Mixer and related networks
"""
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.fc2(x)
return x
class Transformer(nn.Module):
def __init__(self, params: ModelArgs):
super().__init__()
self.params = params
self.vocab_size = params.vocab_size
self.n_layers = params.n_layers
self.tok_embeddings = ParallelEmbedding(
params.vocab_size, params.dim, init_method=nn.init.normal_,
)
self.layers = torch.nn.ModuleList()
for layer_id in range(params.n_layers):
self.layers.append(TransformerBlock(layer_id, params))
self.norm = RMSNorm(params.dim, eps=params.norm_eps)
self.output = ColumnParallelLinear(
params.dim, params.vocab_size, bias=False, init_method=default_linear_init,
)
self.freqs_cis = precompute_freqs_cis(
self.params.dim // self.params.n_heads, self.params.max_seq_len * 2
)
# load clip
self.clip, _, _ = open_clip.create_model_and_transforms(
'ViT-L-14', pretrained='openai')
for param in self.clip.parameters():
param.requires_grad = False
param.data = param.data.half()
self.clip.transformer = None
self.image_words = 30
self.cache_image_words = 0 # for inference
clip_width = self.clip.visual.conv1.out_channels
# create modal shared modules
self.resample_layers = nn.ModuleDict()
self.num_experts = 3
self.num_resample_layers = 8
for expert in range(self.num_experts):
expert = str(expert)
self.resample_layers[expert] = nn.ModuleList()
resampler_params = copy.deepcopy(params)
resampler_params.n_heads = 16
resampler_params.dim = clip_width
for layer_id in range(self.num_resample_layers):
self.resample_layers[expert].append(
TransformerBlock(layer_id, resampler_params))
self.conv1 = nn.ModuleDict()
self.positional_embedding = nn.ParameterDict()
self.resample_tokens = nn.ParameterDict()
self.clip_proj1 = nn.ModuleDict()
self.clip_proj2 = nn.ModuleDict()
self.routers = nn.ModuleDict()
self.start_tag = nn.ParameterDict()
self.end_tag = nn.ParameterDict()
self.modals = ['image', 'audio', 'point', 'video', 'rgbd', 'rgbn', 'fmri', 'imu']
for modal in self.modals:
if modal in ['image', 'video', 'rgbn', 'rgbn']:
modal_tokens = 256 + 1
pass
elif modal == 'audio':
self.conv1[modal] = nn.Conv2d(
1, clip_width, kernel_size=(16, 16), stride=(10, 10))
modal_tokens = 1212 + 1
self.positional_embedding[modal] = nn.Parameter(
torch.empty([modal_tokens, clip_width]))
nn.init.normal_(self.positional_embedding[modal], std=0.02)
elif modal == 'point':
from model.lib.point_utils import PointPatchEmbed
self.conv1[modal] = PointPatchEmbed(
in_channels=6, channels=clip_width)
modal_tokens = 1024 + 1
self.positional_embedding[modal] = nn.Parameter(
torch.empty([modal_tokens, clip_width]))
nn.init.normal_(self.positional_embedding[modal], std=0.02)
elif modal == 'fmri':
self.conv1[modal] = nn.Linear(15724, 8192)
self.positional_embedding[modal] = nn.Parameter(
torch.empty([8+1, clip_width]))
nn.init.normal_(self.positional_embedding[modal], std=0.02)
elif modal == 'imu':
self.conv1[modal] = nn.Conv1d(
in_channels=6, out_channels=clip_width, kernel_size=10, bias=False)
self.positional_embedding[modal] = nn.Parameter(
torch.empty([391+1, clip_width]))
nn.init.normal_(self.positional_embedding[modal], std=0.02)
self.routers[modal] = Mlp(
clip_width, clip_width * 4, self.num_experts)
self.resample_tokens[modal] = nn.Parameter(
torch.empty([1, 30, resampler_params.dim]))
nn.init.normal_(self.resample_tokens[modal], std=0.02)
self.clip_proj1[modal] = nn.Sequential(
nn.Linear(clip_width, resampler_params.dim),
nn.LayerNorm(resampler_params.dim))
self.clip_proj2[modal] = nn.Sequential(
nn.Linear(resampler_params.dim, params.dim),
nn.LayerNorm(params.dim))
self.start_tag[modal] = nn.Parameter(torch.rand(1, 1, params.dim))
self.end_tag[modal] = nn.Parameter(torch.rand(1, 1, params.dim))
# @torch.no_grad()
def clip_encode_image(self, x, modal='image'):
# shape = [*, width, grid ** 2]
x = x.reshape(x.shape[0], x.shape[1], -1)
x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
x = torch.cat([self.clip.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1,
x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
# use pretrained pos embeding for rest modalities
pos_embedding = self.clip.visual.positional_embedding
if modal in ['audio', 'point', 'fmri', 'imu']:
pos_embedding = self.positional_embedding[modal]
x = x + pos_embedding.to(x.dtype)
x = self.clip.visual.ln_pre(x)
x = x.permute(1, 0, 2) # NLD -> LND
x = self.clip.visual.transformer(x)
x = x.permute(1, 0, 2) # LND -> NLD
# preserve all spatial tokens
x = self.clip.visual.ln_post(x[:, :, :])
# if self.clip.visual.proj is not None:
# x = x @ self.clip.visual.proj
return x
def encode_image(self, x, modal='image'):
bsz = x.size(0)
T = 1
if modal in ['image']:
# modified from CLIP
x = self.clip.visual.conv1(x) # shape = [*, width, grid, grid]
elif modal in ['audio', 'imu']:
x = self.conv1[modal](x)
elif modal == 'point':
# [B, 16384, 6] -> [B, 1024, 1024, 1]
x = self.conv1[modal](x.float()).to(x.dtype)
elif modal in ['video', 'rgbd', 'rgbn']:
# [B, 15, 3, 224, 224]
B, T = x.shape[:2]
bsz = B * T
x = x.reshape(bsz, *x.shape[2:])
x = self.clip.visual.conv1(x)
elif modal == 'fmri':
x = self.conv1[modal](x)
# [B, 1, 8196] -> [B, 1024, 8]
x = x.reshape(x.size(0), self.clip.visual.conv1.out_channels, -1)
image_feats = self.clip_encode_image(x, modal=modal)
# take mean on time dimension
# all inputs are reduced to [B, L, D]
bsz = int(bsz / T)
image_feats = image_feats.reshape(
bsz, T, *image_feats.shape[1:]).mean(dim=1)
image_feats = self.clip_proj1[modal](image_feats)
image_feats = torch.cat(
[self.resample_tokens[modal].repeat(bsz, 1, 1), image_feats], dim=1)
# routing modalites
# [B, L, D]->[B, L, N]
routing_weights = self.routers[modal](image_feats).sigmoid()
routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)
image_feats_experts = []
for expert_id in range(self.num_experts):
image_feats_expert = image_feats
for layer in self.resample_layers[str(expert_id)]:
image_feats_expert = layer(image_feats_expert, 0, None, None)
image_feats_expert = image_feats_expert[:, :self.resample_tokens[modal].size(1)]
routing_weight = routing_weights[:, :self.resample_tokens[modal].size(
1), expert_id]
# [B, L, D] * [B, L, 1]
image_feats_expert = image_feats_expert * routing_weight[:, :, None]
image_feats_experts.append(image_feats_expert)
image_feats = sum(image_feats_experts)
image_feats = self.clip_proj2[modal](image_feats)
return image_feats
def forward(self, examples, image=None, modal='image'):
self._destroy_kv_cache() # training always disables kv cache
modal = modal[0]
_bsz, seqlen = examples.shape
h = self.tok_embeddings(examples)
self.freqs_cis = self.freqs_cis.to(h.device)
start_pos = 0
prefix_len = 0
if image is not None:
h_bos, h_caption = h[:, :1], h[:, 1:]
image_tokens = self.encode_image(image, modal)
h = torch.cat((h_bos, self.start_tag[modal].expand(
_bsz, -1, -1), image_tokens, self.end_tag[modal].expand(_bsz, -1, -1), h_caption), dim=1)
# bos + image token + start_tag[modal], end_tag[modal] is used for caption generation
prefix_len = image_tokens.shape[1] + 1 + 1
seqlen = h.shape[1]
freqs_cis = self.freqs_cis[start_pos:start_pos + seqlen]
mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=h.device)
mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)
for layer in self.layers:
h = layer(h, start_pos, freqs_cis, mask)
h = self.norm(h)
output = self.output(h[:, prefix_len:, :])
return output
@torch.inference_mode()
def forward_inference(self, tokens: torch.Tensor, start_pos: int, image=None, modal='image'):
modal = modal[0] if isinstance(modal, list) else modal
_bsz, seqlen = tokens.shape
if start_pos == 0:
# kv cache will not re-allocate if size is unchanged
self._allocate_kv_cache(_bsz)
h = self.tok_embeddings(tokens)
self.freqs_cis = self.freqs_cis.to(h.device)
if image is not None:
h_bos, h_caption = h[:, :1], h[:, 1:]
image_tokens = self.encode_image(image, modal)
self.cache_image_words = image_tokens.shape[1]
h = torch.cat((h_bos, self.start_tag[modal].repeat(_bsz, 1, 1), image_tokens, self.end_tag[modal].repeat(_bsz, 1, 1), h_caption), dim=1)
seqlen = h.shape[1]
freqs_cis = self.freqs_cis[0: seqlen]
else:
if start_pos == 0:
self.cache_image_words = 0
freqs_cis = self.freqs_cis[0: seqlen]
else:
# if image was not None when start_pos=0,
# the offset should be added to start_pos within later forward_inference calls
start_pos = start_pos + self.cache_image_words
freqs_cis = self.freqs_cis[start_pos: start_pos + seqlen]
# freqs_cis = self.freqs_cis[start_pos : start_pos + seqlen]
mask = None
if seqlen > 1:
mask = torch.full((1, 1, seqlen, seqlen), float("-inf"), device=tokens.device)
mask = torch.triu(mask, diagonal=start_pos + 1).type_as(h)
for layer in self.layers:
h = layer(h, start_pos, freqs_cis, mask)
h = self.norm(h)
output = self.output(h[:, -1, :]) # only compute last logits
return output.float()
def _allocate_kv_cache(self, max_batch_size: int) -> None:
for layer in self.layers:
layer.attention.allocate_kv_cache(
max_batch_size, self.params.max_seq_len)
def _destroy_kv_cache(self) -> None:
for layer in self.layers:
layer.attention.destroy_kv_cache()