Spaces:
Runtime error
Runtime error
File size: 4,967 Bytes
12bfd03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""A streamable transformer."""
import typing as tp
import torch
import torch.nn as nn
import torch.nn.functional as F
def create_sin_embedding(positions: torch.Tensor,
dim: int,
max_period: float=10000):
"""Create time embedding for the given positions, target dimension `dim`.
"""
# We aim for BTC format
assert dim % 2 == 0
half_dim = dim // 2
adim = torch.arange(half_dim, device=positions.device).view(1, 1, -1)
phase = positions / (max_period**(adim / (half_dim - 1)))
return torch.cat(
[
torch.cos(phase),
torch.sin(phase),
], dim=-1)
class StreamingTransformerEncoderLayer(nn.TransformerEncoderLayer):
def forward(self, x: torch.Tensor, x_past: torch.Tensor,
past_context: int): # type: ignore
if self.norm_first:
sa_input = self.norm1(x)
x = x + self._sa_block(sa_input, x_past, past_context)
x = x + self._ff_block(self.norm2(x))
else:
sa_input = x
x = self.norm1(x + self._sa_block(sa_input, x_past, past_context))
x = self.norm2(x + self._ff_block(x))
return x, sa_input
# self-attention block
def _sa_block(self,
x: torch.Tensor,
x_past: torch.Tensor,
past_context: int): # type: ignore
_, T, _ = x.shape
_, H, _ = x_past.shape
queries = x
keys = torch.cat([x_past, x], dim=1)
values = keys
queries_pos = torch.arange(H, T + H, device=x.device).view(-1, 1)
keys_pos = torch.arange(T + H, device=x.device).view(1, -1)
delta = queries_pos - keys_pos
valid_access = (delta >= 0) & (delta <= past_context)
x = self.self_attn(
queries, keys, values, attn_mask=~valid_access,
need_weights=False)[0]
return self.dropout1(x)
class StreamingTransformerEncoder(nn.Module):
"""TransformerEncoder with streaming support.
Args:
dim (int): dimension of the data.
hidden_scale (int): intermediate dimension of FF module is this times the dimension.
num_heads (int): number of heads.
num_layers (int): number of layers.
max_period (float): maxium period of cosines in the positional embedding.
past_context (int or None): receptive field for the causal mask, infinite if None.
gelu (bool): if true uses GeLUs, otherwise use ReLUs.
norm_in (bool): normalize the input.
dropout (float): dropout probability.
**kwargs: See `nn.TransformerEncoderLayer`.
"""
def __init__(self,
dim,
hidden_scale: float=4.,
num_heads: int=8,
num_layers: int=5,
max_period: float=10000,
past_context: int=1000,
gelu: bool=True,
norm_in: bool=True,
dropout: float=0.,
**kwargs):
super().__init__()
assert dim % num_heads == 0
hidden_dim = int(dim * hidden_scale)
self.max_period = max_period
self.past_context = past_context
activation: tp.Any = F.gelu if gelu else F.relu
self.norm_in: nn.Module
if norm_in:
self.norm_in = nn.LayerNorm(dim)
else:
self.norm_in = nn.Identity()
self.layers = nn.ModuleList()
for idx in range(num_layers):
self.layers.append(
StreamingTransformerEncoderLayer(
dim,
num_heads,
hidden_dim,
activation=activation,
batch_first=True,
dropout=dropout,
**kwargs))
def forward(self,
x: torch.Tensor,
states: tp.Optional[tp.List[torch.Tensor]]=None,
offset: tp.Union[int, torch.Tensor]=0):
B, T, C = x.shape
if states is None:
states = [
torch.zeros_like(x[:, :1]) for _ in range(1 + len(self.layers))
]
positions = torch.arange(T, device=x.device).view(1, -1, 1) + offset
pos_emb = create_sin_embedding(positions, C, max_period=self.max_period)
new_state: tp.List[torch.Tensor] = []
x = self.norm_in(x)
x = x + pos_emb
for layer_state, layer in zip(states, self.layers):
x, new_layer_state = layer(x, layer_state, self.past_context)
new_layer_state = torch.cat([layer_state, new_layer_state], dim=1)
new_state.append(new_layer_state[:, -self.past_context:, :])
return x, new_state, offset + T
|