import math import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from einops import rearrange from torch.autograd import Function class ReverseLayerF(Function): @staticmethod def forward(ctx, input_, alpha): ctx.alpha = alpha return input_ @staticmethod def backward(ctx, grad_output): output = grad_output.neg() * ctx.alpha return output, None class Attention(nn.Module): def __init__(self, dim, heads = 2, dim_head = 64, dropout = 0.): super().__init__() inner_dim = dim_head * heads project_out = not (heads == 1 and dim_head == dim) self.heads = heads self.scale = dim_head ** -0.5 self.attend = nn.Softmax(dim = -1) self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False) self.to_out = nn.Sequential( nn.Linear(inner_dim, dim), nn.Dropout(dropout) ) if project_out else nn.Identity() def forward(self, x): qkv = self.to_qkv(x).chunk(3, dim = -1) q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv) dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale attn = self.attend(dots) out = torch.matmul(attn, v) out = rearrange(out, 'b h n d -> b n (h d)') return self.to_out(out)