train20_last_layers_2ep_1e-6 / dependency_classifier.py

Model save

f947cc2 verified 12 months ago

10.6 kB


	from copy import deepcopy

	import numpy as np

	import torch
	from torch import nn
	from torch import Tensor, FloatTensor, BoolTensor, LongTensor
	import torch.nn.functional as F

	from transformers.activations import ACT2FN

	from cobald_parser.bilinear_matrix_attention import BilinearMatrixAttention
	from cobald_parser.chu_liu_edmonds import decode_mst
	from cobald_parser.utils import pairwise_mask, replace_masked_values


	class DependencyHeadBase(nn.Module):
	    """
	    Base class for scoring arcs and relations between tokens in a dependency tree/graph.

	    Subclasses supply `calc_arc_loss` and `predict_arcs`; relation scoring,
	    the relation loss and the merge of arcs with relations are shared here.
	    """

	    def __init__(self, hidden_size: int, n_rels: int):
	        """
	        Args:
	            hidden_size: Size of the head/dependent vectors given to both scorers.
	            n_rels: Number of relation labels scored for every token pair.
	        """
	        super().__init__()

	        # Arc scorer: a single "label" per token pair, i.e. one score per arc.
	        self.arc_attention = BilinearMatrixAttention(
	            hidden_size,
	            hidden_size,
	            use_input_biases=True,
	            n_labels=1
	        )
	        # Relation scorer: one score per relation label per token pair.
	        self.rel_attention = BilinearMatrixAttention(
	            hidden_size,
	            hidden_size,
	            use_input_biases=True,
	            n_labels=n_rels
	        )

	    def forward(
	        self,
	        h_arc_head: Tensor,        # [batch_size, seq_len, hidden_size]
	        h_arc_dep: Tensor,         # ...
	        h_rel_head: Tensor,        # ...
	        h_rel_dep: Tensor,         # ...
	        gold_arcs: LongTensor,     # [n_arcs, 4] or None
	        null_mask: BoolTensor,     # [batch_size, seq_len]
	        padding_mask: BoolTensor   # [batch_size, seq_len]
	    ) -> dict[str, Tensor]:
	        """
	        Score arcs and relations, compute the loss and decode predictions.

	        Args:
	            h_arc_head, h_arc_dep: Head/dependent representations for arc scoring.
	            h_rel_head, h_rel_dep: Head/dependent representations for relation scoring.
	            gold_arcs: Rows of (batch_idx, from_idx, to_idx, rel), or None to skip the loss.
	            null_mask, padding_mask: Token-level masks; they are AND-ed and expanded
	                to token pairs by `pairwise_mask`.

	        Returns:
	            Dict with 'preds' (rows of (batch_idx, from_idx, to_idx, rel) for every
	            predicted arc) and 'loss' (arc loss + relation loss, or the plain
	            float 0.0 when `gold_arcs` is None).
	        """
	        # Score arcs.
	        # s_arc[:, i, j] = score of edge i -> j.
	        s_arc = self.arc_attention(h_arc_head, h_arc_dep)
	        # Mask undesirable pairs (padding, nulls, etc.) with a large negative
	        # score (-1e8, not a true -inf).
	        # NOTE(review): the return value is discarded, so this relies on
	        # `replace_masked_values` modifying `s_arc` in place - confirm in cobald_parser.utils.
	        mask2d = pairwise_mask(null_mask & padding_mask)
	        replace_masked_values(s_arc, mask2d, replace_with=-1e8)
	        # Score arcs' relations and move the label axis last.
	        # [batch_size, seq_len, seq_len, num_labels]
	        s_rel = self.rel_attention(h_rel_head, h_rel_dep).permute(0, 2, 3, 1)

	        # Calculate loss (stays 0.0 when there is nothing to compare against).
	        loss = 0.0
	        if gold_arcs is not None:
	            loss = self.calc_arc_loss(s_arc, gold_arcs) + self.calc_rel_loss(s_rel, gold_arcs)

	        # Predict arcs based on the scores.
	        # [batch_size, seq_len, seq_len]
	        pred_arcs_matrix = self.predict_arcs(s_arc, null_mask, padding_mask)
	        # [batch_size, seq_len, seq_len]
	        pred_rels_matrix = self.predict_rels(s_rel)
	        # [n_pred_arcs, 4]
	        preds_combined = self.combine_arcs_rels(pred_arcs_matrix, pred_rels_matrix)
	        return {
	            'preds': preds_combined,
	            'loss': loss
	        }

	    @staticmethod
	    def calc_arc_loss(
	        s_arc: Tensor,         # [batch_size, seq_len, seq_len]
	        gold_arcs: LongTensor  # [n_arcs, 4]
	    ) -> Tensor:
	        """Calculate arc loss. Must be implemented by subclasses."""
	        raise NotImplementedError

	    @staticmethod
	    def calc_rel_loss(
	        s_rel: Tensor,         # [batch_size, seq_len, seq_len, num_labels]
	        gold_arcs: LongTensor  # [n_arcs, 4]
	    ) -> Tensor:
	        """
	        Cross-entropy between the relation scores at gold arc positions and
	        the gold relation labels.

	        Returns a zero loss when `gold_arcs` has no rows.
	        """
	        if gold_arcs.numel() == 0:
	            # Mean-reduced cross-entropy over zero samples is NaN and would
	            # poison the total loss. Return a zero that is still attached to
	            # `s_rel` so that backward() keeps working.
	            return s_rel.sum() * 0.0
	        batch_idxs, arcs_from, arcs_to, rels = gold_arcs.T
	        return F.cross_entropy(s_rel[batch_idxs, arcs_from, arcs_to], rels)

	    def predict_arcs(
	        self,
	        s_arc: Tensor,             # [batch_size, seq_len, seq_len]
	        null_mask: BoolTensor,     # [batch_size, seq_len]
	        padding_mask: BoolTensor   # [batch_size, seq_len]
	    ) -> LongTensor:
	        """Predict arcs from scores. Must be implemented by subclasses."""
	        raise NotImplementedError

	    def predict_rels(
	        self,
	        s_rel: FloatTensor
	    ) -> LongTensor:
	        """Pick the highest-scoring relation label along the last axis."""
	        return s_rel.argmax(dim=-1).long()

	    @staticmethod
	    def combine_arcs_rels(
	        pred_arcs: LongTensor,
	        pred_rels: LongTensor
	    ) -> LongTensor:
	        """
	        Select relations towards predicted arcs.

	        Returns one row (batch_idx, from_idx, to_idx, rel_type) for every
	        non-zero entry of `pred_arcs`.
	        """
	        assert pred_arcs.shape == pred_rels.shape
	        # Get indices where arcs exist.
	        batch_idxs, from_idxs, to_idxs = pred_arcs.nonzero(as_tuple=True)
	        # Get corresponding relation types.
	        rel_types = pred_rels[batch_idxs, from_idxs, to_idxs]
	        # Stack as [batch_idx, from_idx, to_idx, rel_type].
	        return torch.stack([batch_idxs, from_idxs, to_idxs, rel_types], dim=1)


	class DependencyHead(DependencyHeadBase):
	    """
	    Basic UD syntax specialization that predicts a single edge for each token.
	    """

	    def predict_arcs(
	        self,
	        s_arc: Tensor,             # [batch_size, seq_len, seq_len]
	        null_mask: BoolTensor,     # [batch_size, seq_len]
	        padding_mask: BoolTensor   # [batch_size, seq_len]
	    ) -> Tensor:
	        """
	        Greedily pick one head per token and return the arcs as a 0/1 matrix.

	        Returns:
	            Integer tensor [batch_size, seq_len, seq_len] where entry [b, i, j]
	            is 1 if arc i -> j is predicted, with masked pairs reset to 0.
	        """
	        # FIXME: at inference this is meant to decode a Maximum Spanning Tree
	        # (`self._mst_decode(s_arc, padding_mask)`), but that path is disabled,
	        # so fast greedy decoding serves both training and evaluation.
	        # - [batch_size, seq_len]: best-scoring head for every dependent.
	        head_idxs = s_arc.argmax(dim=1)

	        # Upscale arcs sequence of shape [batch_size, seq_len]
	        # to matrix of shape [batch_size, seq_len, seq_len].
	        # one_hot yields [batch, dependent, head] (already int64); the
	        # transpose restores the [batch, head, dependent] layout of s_arc.
	        seq_len = head_idxs.size(1)
	        pred_arcs = F.one_hot(head_idxs, num_classes=seq_len).transpose(1, 2)
	        # Apply mask one more time (even though s_arc is already masked),
	        # because argmax erases information about masked values.
	        mask2d = pairwise_mask(null_mask & padding_mask)
	        replace_masked_values(pred_arcs, mask2d, replace_with=0)
	        return pred_arcs

	    def _mst_decode(
	        self,
	        s_arc: Tensor,        # [batch_size, seq_len, seq_len]
	        padding_mask: Tensor  # presumably [batch_size, seq_len]; summed per sample to get the length
	    ) -> Tensor:
	        """
	        Decode one head per token with a Maximum Spanning Tree algorithm.

	        NOTE(review): currently not called (see the FIXME in `predict_arcs`).

	        Returns:
	            Long tensor [batch_size, seq_len] of head indices, on the device of `s_arc`.
	        """
	        batch_size = s_arc.size(0)
	        device = s_arc.device
	        s_arc = s_arc.cpu()

	        # Convert scores to probabilities, as `decode_mst` expects non-negative values.
	        arc_probs = F.softmax(s_arc, dim=1)

	        # `decode_mst` knows nothing about UD and ROOT, so we have to manually
	        # zero probabilities of arcs leading to ROOT to make sure ROOT is a source node
	        # of a graph.

	        # Decode ROOT positions from diagonals.
	        # shape: [batch_size]
	        root_idxs = arc_probs.diagonal(dim1=1, dim2=2).argmax(dim=-1)
	        # Zero out arcs leading to ROOTs.
	        arc_probs[torch.arange(batch_size), :, root_idxs] = 0.0

	        batch_heads = []
	        for sample_idx in range(batch_size):
	            energy = arc_probs[sample_idx]
	            length = padding_mask[sample_idx].sum()
	            # Assumes `decode_mst` returns an array with one head index per
	            # dependent - TODO confirm against cobald_parser.chu_liu_edmonds.
	            heads = decode_mst(energy, length)
	            # Some nodes may be isolated. Pick heads greedily in this case.
	            # The per-sample slice is [head, dependent], so the best head of
	            # every dependent is the argmax over dim 0 (the batched equivalent
	            # of `s_arc.argmax(dim=1)` used in `predict_arcs`).
	            isolated = heads <= 0
	            greedy_heads = s_arc[sample_idx].argmax(dim=0)
	            heads[isolated] = greedy_heads[isolated]
	            batch_heads.append(heads)

	        # shape: [batch_size, seq_len]
	        return torch.from_numpy(np.stack(batch_heads)).long().to(device)

	    @staticmethod
	    def calc_arc_loss(
	        s_arc: Tensor,         # [batch_size, seq_len, seq_len]
	        gold_arcs: LongTensor  # [n_arcs, 4]
	    ) -> Tensor:
	        """
	        Cross-entropy over candidate heads for every gold dependent.

	        Returns a zero loss when `gold_arcs` has no rows.
	        """
	        if gold_arcs.numel() == 0:
	            # Mean-reduced cross-entropy over zero samples is NaN; return a
	            # zero that is still attached to `s_arc` so backward() keeps working.
	            return s_arc.sum() * 0.0
	        batch_idxs, from_idxs, to_idxs, _ = gold_arcs.T
	        # [n_arcs, seq_len]: scores of all possible heads for each gold dependent.
	        head_scores = s_arc[batch_idxs, :, to_idxs]
	        return F.cross_entropy(head_scores, from_idxs)


	class MultiDependencyHead(DependencyHeadBase):
	    """
	    Enhanced UD syntax specialization that predicts multiple edges for each token.
	    """

	    def predict_arcs(
	        self,
	        s_arc: Tensor,             # [batch_size, seq_len, seq_len]
	        null_mask: BoolTensor,     # [batch_size, seq_len]
	        padding_mask: BoolTensor   # [batch_size, seq_len]
	    ) -> Tensor:
	        """
	        Keep every arc whose sigmoid probability rounds to 1.

	        The masks are accepted for interface compatibility but are not read
	        here; only the scores in `s_arc` decide the outcome.
	        """
	        # Scores -> probabilities -> 0/1 decisions, in one chain.
	        return torch.sigmoid(s_arc).round().long()

	    @staticmethod
	    def calc_arc_loss(
	        s_arc: Tensor,         # [batch_size, seq_len, seq_len]
	        gold_arcs: LongTensor  # [n_arcs, 4]
	    ) -> Tensor:
	        """
	        Binary cross-entropy between arc logits and a dense 0/1 gold matrix.

	        NOTE(review): masked pairs are expected to arrive with huge negative
	        logits (set by the caller), so their individual loss terms are
	        practically zero - confirm the masking is applied in place upstream.
	        """
	        b_idx, head_idx, dep_idx, _ = gold_arcs.unbind(dim=1)
	        # Dense target: target[b, from, to] = 1.0 wherever a gold arc exists.
	        target = torch.zeros_like(s_arc)
	        target[b_idx, head_idx, dep_idx] = 1.0
	        return F.binary_cross_entropy_with_logits(s_arc, target)


	class DependencyClassifier(nn.Module):
	    """
	    Dozat and Manning's biaffine dependency classifier.

	    Four MLPs project the token embeddings into arc/relation, head/dependent
	    spaces; the projections are shared by a UD head and an E-UD head.
	    """

	    def __init__(
	        self,
	        input_size: int,
	        hidden_size: int,
	        n_rels_ud: int,
	        n_rels_eud: int,
	        activation: str,
	        dropout: float,
	    ):
	        """
	        Args:
	            input_size: Size of the incoming token embeddings.
	            hidden_size: Output size of every projection MLP.
	            n_rels_ud: Number of relation labels for the UD head.
	            n_rels_eud: Number of relation labels for the E-UD head.
	            activation: Key into `ACT2FN` selecting the MLP activation.
	            dropout: Dropout probability used before and after the linear layer.
	        """
	        super().__init__()

	        mlp_template = nn.Sequential(
	            nn.Dropout(dropout),
	            nn.Linear(input_size, hidden_size),
	            ACT2FN[activation],
	            nn.Dropout(dropout)
	        )
	        # All four MLPs start as exact copies of the same template.
	        self.arc_dep_mlp = mlp_template
	        self.arc_head_mlp = deepcopy(mlp_template)
	        self.rel_dep_mlp = deepcopy(mlp_template)
	        self.rel_head_mlp = deepcopy(mlp_template)

	        self.dependency_head_ud = DependencyHead(hidden_size, n_rels_ud)
	        self.dependency_head_eud = MultiDependencyHead(hidden_size, n_rels_eud)

	    def forward(
	        self,
	        embeddings: Tensor,    # [batch_size, seq_len, embedding_size]
	        gold_ud: Tensor,       # [n_ud_arcs, 4]
	        gold_eud: Tensor,      # [n_eud_arcs, 4]
	        null_mask: Tensor,     # [batch_size, seq_len]
	        padding_mask: Tensor   # [batch_size, seq_len]
	    ) -> dict[str, Tensor]:
	        """
	        Run both dependency heads on the same projected representations.

	        Returns:
	            Dict with the predictions and losses of the UD head
	            ('preds_ud', 'loss_ud') and of the E-UD head ('preds_eud', 'loss_eud').
	        """
	        # Each projection is [batch_size, seq_len, hidden_size]; the tuple is
	        # ordered like the positional parameters of the heads.
	        shared_h = (
	            self.arc_head_mlp(embeddings),
	            self.arc_dep_mlp(embeddings),
	            self.rel_head_mlp(embeddings),
	            self.rel_dep_mlp(embeddings),
	        )

	        # Both heads consume the very same h vectors.
	        ud_out = self.dependency_head_ud(
	            *shared_h,
	            gold_arcs=gold_ud,
	            null_mask=null_mask,
	            padding_mask=padding_mask
	        )
	        # E-UD ignores the null mask: an all-ones mask is passed instead.
	        eud_null_mask = torch.ones_like(padding_mask)
	        eud_out = self.dependency_head_eud(
	            *shared_h,
	            gold_arcs=gold_eud,
	            null_mask=eud_null_mask,
	            padding_mask=padding_mask
	        )

	        return {
	            'preds_ud': ud_out["preds"],
	            'preds_eud': eud_out["preds"],
	            'loss_ud': ud_out["loss"],
	            'loss_eud': eud_out["loss"]
	        }