# multi30k / attention.py
# Author: msarmi9 -- initial commit (8c7a320)
from typing import *
import torch
import torch.nn as nn
import torch.nn.functional as F
# Short alias so the tensor type annotations below stay compact.
Tensor = torch.Tensor
class Attention(nn.Module):
    """Container for applying an attention scoring function.

    Args:
        score: Callable module mapping ``(decoder_state, encoder_state)`` to
            raw attention scores of shape ``(B, L, T)``.
        dropout: Optional module applied to the normalized attention weights.
            ``None`` disables dropout.
    """

    def __init__(self, score: nn.Module, dropout: Optional[nn.Module] = None):
        super().__init__()
        self.score = score
        self.dropout = dropout

    def forward(
        self,
        decoder_state: Tensor,
        encoder_state: Tensor,
        source_mask: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor]:
        """Return context and attention weights.

        Args:
            decoder_state: Query states, shape ``(B, L, D)``.
            encoder_state: Source states, shape ``(B, T, E)``; they serve both
                as the scoring keys and as the values that get averaged.
            source_mask: Optional boolean mask of shape ``(B, T)`` in which
                ``True`` marks padding positions in the source sequence.

        Returns:
            A ``(context, weights)`` tuple with shapes ``(B, L, E)`` and
            ``(B, L, T)``. If dropout is configured, the returned weights are
            the post-dropout ones.
        """
        B, T, _ = encoder_state.shape
        scores = self.score(decoder_state, encoder_state)  # (B, L, T)
        if source_mask is not None:  # (B, T)
            # Out-of-place on purpose: an in-place fill would mutate the
            # scorer's output, which breaks backward whenever the scorer's
            # final op saves that output for its gradient (tanh, sigmoid, ...)
            # and would corrupt any tensor the scorer still holds on to.
            # The fill value is finite, so a fully masked row degrades to
            # uniform weights instead of NaN.
            scores = scores.masked_fill(source_mask.view(B, 1, T), -1e4)
        weights = F.softmax(scores, dim=-1)  # (B, L, T)
        if self.dropout is not None:
            weights = self.dropout(weights)
        context = weights @ encoder_state  # (B, L, E)
        return context, weights
class ConcatScore(nn.Module):
    """Two-layer feed-forward network used as an attention scoring function.

    Every (decoder step, encoder step) pair is concatenated, projected to
    ``d`` units, squashed with tanh and reduced to a single scalar score.
    The first layer takes ``3 * d`` inputs, i.e. it expects encoder states
    twice as wide as decoder states (bidirectional encoder).
    """

    def __init__(self, d: int):
        super().__init__()
        concat_width = 3 * d
        self.w = nn.Linear(concat_width, d)
        self.v = nn.Linear(d, 1, bias=False)
        self.initialize_parameters()

    def forward(self, decoder_state: Tensor, encoder_state: Tensor) -> Tensor:
        """Return attention scores of shape ``(B, L, T)``.

        ``decoder_state`` is ``(B, L, D)`` and ``encoder_state`` is
        ``(B, T, 2*D)``.
        """
        _, tgt_len, _ = decoder_state.shape
        batch, src_len, _ = encoder_state.shape

        # Lay out all L*T pairs along dim 1: each decoder step is held for
        # src_len consecutive slots while the encoder sequence cycles.
        queries = torch.repeat_interleave(decoder_state, src_len, dim=1)  # (B, L*T, D)
        keys = encoder_state.repeat(1, tgt_len, 1)  # (B, L*T, 2*D)

        pairs = torch.cat((queries, keys), dim=-1)  # (B, L*T, 3*D)
        hidden = torch.tanh(self.w(pairs))  # (B, L*T, D)
        flat_scores = self.v(hidden)  # (B, L*T, 1)
        return flat_scores.view(batch, tgt_len, src_len)

    def initialize_parameters(self):
        """Xavier-initialize both weight matrices and zero the hidden bias."""
        with torch.no_grad():
            tanh_gain = nn.init.calculate_gain("tanh")
            nn.init.xavier_uniform_(self.w.weight)
            nn.init.xavier_uniform_(self.v.weight, gain=tanh_gain)
            nn.init.zeros_(self.w.bias)