import torch
import torch.nn as nn
from tencentpretrain.utils.constants import *


class LmTarget(nn.Module):
    """
    Language Model Target
    """

    def __init__(self, args, vocab_size):
        super(LmTarget, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = args.hidden_size

        # Label smoothing and the ignore index are optional; both fall back to None.
        # When ignore_index is enabled, it is set to the padding token id.
        if "label_smoothing" in args:
            self.label_smoothing = args.label_smoothing
        else:
            self.label_smoothing = None

        if "ignore_index" in args and args.ignore_index:
            self.ignore_index = args.tokenizer.vocab.get(PAD_TOKEN)
        else:
            self.ignore_index = None

        # Output head: project hidden states to vocabulary logits, then log-softmax + NLL loss.
        self.output_layer = nn.Linear(self.hidden_size, self.vocab_size, bias=args.has_lmtarget_bias)
        self.softmax = nn.LogSoftmax(dim=-1)
        self.criterion = nn.NLLLoss()

    def lm(self, memory_bank, tgt_lm):
        # Language modeling (LM) with full softmax prediction.
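        # Flatten to token level and keep only positions with a non-zero target id
        # (id 0, typically the padding id, is skipped).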
        tgt_lm = tgt_lm.contiguous().view(-1)
        memory_bank = memory_bank.contiguous().view(-1, self.hidden_size)
        memory_bank = memory_bank[tgt_lm > 0, :]
        tgt_lm = tgt_lm[tgt_lm > 0]

        output = self.output_layer(memory_bank)
        output = self.softmax(output)

        # The small epsilon keeps the denominator non-zero when every position is filtered out.
        denominator = torch.tensor(output.size(0) + 1e-6)
        if output.size(0) == 0:
            correct = torch.tensor(0.0)
        else:
            correct = torch.sum((output.argmax(dim=-1).eq(tgt_lm)).float())
        if self.label_smoothing is None:
            loss = self.criterion(output, tgt_lm)
        else:
            if tgt_lm.dim() == output.dim() - 1:
                tgt_lm = tgt_lm.unsqueeze(-1)
            nll_loss = -output.gather(dim=-1, index=tgt_lm)
            smooth_loss = -output.sum(dim=-1, keepdim=True)
            if self.ignore_index is not None:
                pad_mask = tgt_lm.eq(self.ignore_index)
                nll_loss.masked_fill_(pad_mask, 0.0)
                smooth_loss.masked_fill_(pad_mask, 0.0)
            else:
                nll_loss = nll_loss.squeeze(-1)
                smooth_loss = smooth_loss.squeeze(-1)
            nll_loss = nll_loss.mean()
            smooth_loss = smooth_loss.mean()
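            # Spread the smoothing mass uniformly over the remaining vocab_size - 1
            # classes (the same scheme as fairseq's label_smoothed_nll_loss).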
            eps_i = self.label_smoothing / (output.size(-1) - 1)
            loss = (1.0 - self.label_smoothing - eps_i) * nll_loss + eps_i * smooth_loss

        return loss, correct, denominator

    def forward(self, memory_bank, tgt, seg):
        """
        Args:
            memory_bank: [batch_size x seq_length x hidden_size]
            tgt: [batch_size x seq_length]

        Returns:
            loss: Language modeling loss.
            correct: Number of words that are predicted correctly.
            denominator: Number of predicted words.
        """
        # Language modeling (LM) with full softmax prediction.
        loss, correct, denominator = self.lm(memory_bank, tgt)

        return loss, correct, denominator
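

if __name__ == "__main__":
    # Minimal smoke-test sketch (not part of the original target module): it builds a
    # small hypothetical `args` namespace containing only the fields LmTarget reads and
    # runs one forward pass on random inputs. The field values below are illustrative
    # assumptions, not TencentPretrain defaults.
    from argparse import Namespace

    args = Namespace(
        hidden_size=32,
        has_lmtarget_bias=False,
        # label_smoothing and ignore_index are left unset, so both default to None.
    )

    target = LmTarget(args, vocab_size=100)
    memory_bank = torch.randn(2, 8, args.hidden_size)  # [batch_size x seq_length x hidden_size]
    tgt = torch.randint(1, 100, (2, 8))                # non-zero ids, so no position is filtered out
    seg = torch.ones(2, 8, dtype=torch.long)           # unused by LmTarget.forward, kept for the API

    loss, correct, denominator = target(memory_bank, tgt, seg)
    print(loss.item(), correct.item(), denominator.item())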