|
"""Module containing the loss functions for the GANs.""" |
|
from typing import Any, Dict |
|
|
|
import torch |
|
from torch import nn |
|
|
|
|
|
|
|
|
|
|
|
def generator_loss(
    logits: Dict[str, Dict[str, torch.Tensor]],
    local_fake_incept_feat: torch.Tensor,
    global_fake_incept_feat: torch.Tensor,
    real_labels: torch.Tensor,
    words_emb: torch.Tensor,
    sent_emb: torch.Tensor,
    match_labels: torch.Tensor,
    cap_lens: torch.Tensor,
    class_ids: torch.Tensor,
    real_vgg_feat: torch.Tensor,
    fake_vgg_feat: torch.Tensor,
    const_dict: Dict[str, float],
) -> Any:
"""Calculate the loss for the generator. |
|
|
|
Args: |
|
logits: Dictionary with fake/real and word-level/uncond/cond logits |
|
|
|
local_fake_incept_feat: The local inception features for the fake images. |
|
|
|
global_fake_incept_feat: The global inception features for the fake images. |
|
|
|
real_labels: Label for "real" image as predicted by discriminator, |
|
this is a tensor of ones. [shape: (batch_size, 1)]. |
|
|
|
word_labels: POS tagged word labels for the captions. [shape: (batch_size, L)] |
|
|
|
words_emb: The embeddings for all the words in the captions. |
|
shape: (batch_size, embedding_size, max_caption_length) |
|
|
|
sent_emb: The embeddings for the sentences. |
|
shape: (batch_size, embedding_size) |
|
|
|
match_labels: Tensor of shape: (batch_size, 1). |
|
This is of the form torch.tensor([0, 1, 2, ..., batch-1]) |
|
|
|
cap_lens: The length of the 'actual' captions in the batch [without padding] |
|
shape: (batch_size, 1) |
|
|
|
class_ids: The class ids for the instance. shape: (batch_size, 1) |
|
|
|
real_vgg_feat: The vgg features for the real images. shape: (batch_size, 128, 128, 128) |
|
fake_vgg_feat: The vgg features for the fake images. shape: (batch_size, 128, 128, 128) |
|
|
|
const_dict: The dictionary containing the constants. |
|
""" |
|
    lambda1 = const_dict["lambda1"]
    total_error_g = 0.0

    # Conditional adversarial loss: fake images conditioned on the sentence
    # embedding should be classified as real.
    cond_logits = logits["fake"]["cond"]
    cond_err_g = nn.BCEWithLogitsLoss()(cond_logits, real_labels)

    # Unconditional adversarial loss: fake images alone should look real.
    uncond_logits = logits["fake"]["uncond"]
    uncond_err_g = nn.BCEWithLogitsLoss()(uncond_logits, real_labels)

    loss_g = cond_err_g + uncond_err_g
    total_error_g += loss_g

    # Text-image matching (DAMSM) loss on the fake images.
    loss_damsm = damsm_loss(
        local_fake_incept_feat,
        global_fake_incept_feat,
        words_emb,
        sent_emb,
        match_labels,
        cap_lens,
        class_ids,
        const_dict,
    )
    total_error_g += loss_damsm

    # Perceptual loss: match the VGG features of real and fake images.
    loss_per = 0.5 * nn.MSELoss()(real_vgg_feat, fake_vgg_feat)
    total_error_g += lambda1 * loss_per

    return total_error_g
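

# A minimal usage sketch for generator_loss (not part of the training code).
# All sizes and constant values below are illustrative assumptions, and the
# VGG features are shrunk from the documented (batch, 128, 128, 128) to keep
# the example cheap to run.
def _example_generator_loss() -> None:
    batch, emb_dim, max_len = 4, 32, 12
    logits = {
        "fake": {
            "cond": torch.randn(batch, 1),
            "uncond": torch.randn(batch, 1),
        }
    }
    const_dict = {
        "lambda1": 1.0,
        "lambda3": 1.0,
        "gamma1": 4.0,
        "gamma2": 5.0,
        "gamma3": 10.0,
    }
    loss = generator_loss(
        logits,
        local_fake_incept_feat=torch.randn(batch, emb_dim, 17, 17),
        global_fake_incept_feat=torch.randn(batch, emb_dim),
        real_labels=torch.ones(batch, 1),
        words_emb=torch.randn(batch, emb_dim, max_len),
        sent_emb=torch.randn(batch, emb_dim),
        match_labels=torch.arange(batch),
        cap_lens=torch.randint(3, max_len + 1, (batch, 1)),
        class_ids=torch.randint(0, 10, (batch, 1)),
        real_vgg_feat=torch.randn(batch, 8, 8, 8),
        fake_vgg_feat=torch.randn(batch, 8, 8, 8),
        const_dict=const_dict,
    )
    assert loss.dim() == 0  # a single scalar generator objective

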
def damsm_loss(
    local_incept_feat: torch.Tensor,
    global_incept_feat: torch.Tensor,
    words_emb: torch.Tensor,
    sent_emb: torch.Tensor,
    match_labels: torch.Tensor,
    cap_lens: torch.Tensor,
    class_ids: torch.Tensor,
    const_dict: Dict[str, float],
) -> Any:
"""Calculate the DAMSM loss from the attnGAN paper. |
|
|
|
Args: |
|
local_incept_feat: The local inception features. [shape: (batch, D, 17, 17)] |
|
|
|
global_incept_feat: The global inception features. [shape: (batch, D)] |
|
|
|
words_emb: The embeddings for all the words in the captions. |
|
|
|
shape: (batch, D, max_caption_length) |
|
|
|
sent_emb: The embeddings for the sentences. shape: (batch_size, D) |
|
|
|
match_labels: Tensor of shape: (batch_size, 1). |
|
This is of the form torch.tensor([0, 1, 2, ..., batch-1]) |
|
|
|
cap_lens: The length of the 'actual' captions in the batch [without padding] |
|
shape: (batch_size, 1) |
|
|
|
class_ids: The class ids for the instance. shape: (batch, 1) |
|
|
|
const_dict: The dictionary containing the constants. |
|
""" |
|
    batch_size = match_labels.size(0)

    masks = []
    match_scores = []
    gamma1 = const_dict["gamma1"]
    gamma2 = const_dict["gamma2"]
    gamma3 = const_dict["gamma3"]
    lambda3 = const_dict["lambda3"]

    for i in range(batch_size):
        # Mask out other instances of the same class, so they are not
        # treated as mismatched pairs for instance i.
        mask = (class_ids == class_ids[i]).int()
        mask[i] = 0
        masks.append(mask.reshape(1, -1))

        numb_words = int(cap_lens[i])

        # Pair the words of caption i against every image in the batch.
        query_words = words_emb[i, :, :numb_words].unsqueeze(0)
        query_words = query_words.repeat(batch_size, 1, 1)

        c_i = compute_region_context_vector(local_incept_feat, query_words, gamma1)

        # Flatten both to (batch * L, D) for the cosine similarity.
        query_words = query_words.transpose(1, 2).reshape(batch_size * numb_words, -1)
        c_i = c_i.transpose(1, 2).reshape(batch_size * numb_words, -1)

        r_i = compute_relevance(c_i, query_words)

        # Aggregate the word-level relevances into a caption-image match
        # score via a smooth maximum, as in the AttnGAN paper.
        r_i = r_i.view(batch_size, numb_words)
        r_i = torch.exp(r_i * gamma2).sum(dim=1, keepdim=True)
        r_i = torch.log(r_i)
        match_scores.append(r_i)

    masks = torch.cat(masks, dim=0).bool()
    match_scores = torch.cat(match_scores, dim=1)

    # Word-level loss: each image should match its own caption against all
    # mismatched captions in the batch, and vice versa.
    match_scores = gamma3 * match_scores
    match_scores.data.masked_fill_(masks, -float("inf"))
    match_scores_t = match_scores.transpose(0, 1)

    l1_w = nn.CrossEntropyLoss()(match_scores, match_labels)
    l2_w = nn.CrossEntropyLoss()(match_scores_t, match_labels)

    # Sentence-level loss: cosine similarity between the global image
    # features and the sentence embeddings.
    incept_feat_norm = torch.linalg.norm(global_incept_feat, dim=1)
    sent_emb_norm = torch.linalg.norm(sent_emb, dim=1)

    global_match_score = global_incept_feat @ sent_emb.T
    # Clamp the norm product (not the similarity itself), so that negative
    # cosine similarities are preserved.
    global_match_score = global_match_score / torch.outer(
        incept_feat_norm, sent_emb_norm
    ).clamp(min=1e-8)
    global_match_score = gamma3 * global_match_score
    global_match_score.data.masked_fill_(masks, -float("inf"))

    global_match_t = global_match_score.T

    l1_s = nn.CrossEntropyLoss()(global_match_score, match_labels)
    l2_s = nn.CrossEntropyLoss()(global_match_t, match_labels)

    loss_damsm = lambda3 * (l1_w + l2_w + l1_s + l2_s)
    return loss_damsm
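

# A minimal usage sketch for damsm_loss (not part of the training code); the
# tensor sizes and gamma/lambda constants are illustrative assumptions, not
# values taken from a config.
def _example_damsm_loss() -> None:
    batch, emb_dim, max_len = 4, 32, 12
    const_dict = {"gamma1": 4.0, "gamma2": 5.0, "gamma3": 10.0, "lambda3": 1.0}
    loss = damsm_loss(
        local_incept_feat=torch.randn(batch, emb_dim, 17, 17),
        global_incept_feat=torch.randn(batch, emb_dim),
        words_emb=torch.randn(batch, emb_dim, max_len),
        sent_emb=torch.randn(batch, emb_dim),
        match_labels=torch.arange(batch),
        cap_lens=torch.randint(3, max_len + 1, (batch, 1)),
        class_ids=torch.randint(0, 10, (batch, 1)),
        const_dict=const_dict,
    )
    assert loss.dim() == 0

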
def compute_relevance(c_i: torch.Tensor, query_words: torch.Tensor) -> Any:
    """Compute the cosine similarity between the region context vectors and the query words.

    Args:
        c_i: The region context vectors. shape: (batch * L, D)
        query_words: The query words. shape: (batch * L, D)
    """
    numr = torch.sum(c_i * query_words, dim=1)
    norm_c = torch.linalg.norm(c_i, ord=2, dim=1)
    norm_q = torch.linalg.norm(query_words, ord=2, dim=1)
    # Clamp the denominator (not the similarity itself), so that negative
    # cosine similarities are preserved.
    r_i = numr / (norm_c * norm_q).clamp(min=1e-8)
    return r_i
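

# Quick sanity check for compute_relevance (illustrative sizes): the
# relevance of a vector with itself is its cosine similarity, i.e. 1.
def _example_compute_relevance() -> None:
    vecs = torch.randn(6, 32)  # stands in for the (batch * L, D) inputs
    r_i = compute_relevance(vecs, vecs)
    assert torch.allclose(r_i, torch.ones(6), atol=1e-5)

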
def compute_region_context_vector(
    local_incept_feat: torch.Tensor, query_words: torch.Tensor, gamma1: float
) -> Any:
    """Compute the region context vector (c_i) from the AttnGAN paper.

    Args:
        local_incept_feat: The local inception features. [shape: (batch, D, 17, 17)]
        query_words: The embeddings for all the words in the captions. shape: (batch, D, L)
        gamma1: The gamma1 value from the AttnGAN paper.
    """
    batch, L = query_words.size(0), query_words.size(2)

    feat_height, feat_width = local_incept_feat.size(2), local_incept_feat.size(3)
    N = feat_height * feat_width

    # Flatten the spatial grid: (batch, D, N), where N = 17 * 17 regions.
    local_incept_feat = local_incept_feat.view(batch, -1, N)
    incept_feat_t = local_incept_feat.transpose(1, 2)

    # Similarity of every region with every word: (batch, N, L).
    sim_matrix = incept_feat_t @ query_words

    # Normalize over the words for each region.
    sim_matrix = nn.Softmax(dim=1)(sim_matrix.view(batch * N, L))
    sim_matrix = sim_matrix.view(batch, N, L)

    # Attention over the regions for each word, sharpened by gamma1.
    sim_matrix = torch.transpose(sim_matrix, 1, 2).reshape(batch * L, N)
    alpha_j = nn.Softmax(dim=1)(gamma1 * sim_matrix)
    alpha_j = alpha_j.view(batch, L, N)
    alpha_j_t = torch.transpose(alpha_j, 1, 2)

    # Attention-weighted sum of the region features per word: (batch, D, L).
    c_i = local_incept_feat @ alpha_j_t
    return c_i
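

# Shape sanity check for compute_region_context_vector (illustrative sizes):
# each of the L words gets a D-dimensional context vector per image.
def _example_region_context_shapes() -> None:
    batch, emb_dim, num_words = 2, 32, 7
    feat = torch.randn(batch, emb_dim, 17, 17)  # local inception features
    words = torch.randn(batch, emb_dim, num_words)  # word embeddings
    c_i = compute_region_context_vector(feat, words, gamma1=4.0)
    assert c_i.shape == (batch, emb_dim, num_words)

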
def discriminator_loss(
    logits: Dict[str, Dict[str, torch.Tensor]],
    labels: Dict[str, Dict[str, torch.Tensor]],
) -> Any:
""" |
|
Calculate discriminator objective |
|
|
|
:param dict[str, dict[str, torch.Tensor]] logits: |
|
Dictionary with fake/real and word-level/uncond/cond logits |
|
|
|
Example: |
|
|
|
logits = { |
|
"fake": { |
|
"word_level": torch.Tensor (BxL) |
|
"uncond": torch.Tensor (Bx1) |
|
"cond": torch.Tensor (Bx1) |
|
}, |
|
"real": { |
|
"word_level": torch.Tensor (BxL) |
|
"uncond": torch.Tensor (Bx1) |
|
"cond": torch.Tensor (Bx1) |
|
}, |
|
} |
|
:param dict[str, dict[str, torch.Tensor]] labels: |
|
Dictionary with fake/real and word-level/image labels |
|
|
|
Example: |
|
|
|
labels = { |
|
"fake": { |
|
"word_level": torch.Tensor (BxL) |
|
"image": torch.Tensor (Bx1) |
|
}, |
|
"real": { |
|
"word_level": torch.Tensor (BxL) |
|
"image": torch.Tensor (Bx1) |
|
}, |
|
} |
|
:param float lambda_4: Hyperparameter for word loss in paper |
|
:return: Discriminator objective loss |
|
:rtype: Any |
|
""" |
|
|
|
    bce_logits = nn.BCEWithLogitsLoss()
    bce = nn.BCELoss()

    # Word-level loss; these outputs are already probabilities, hence BCELoss.
    word_loss = bce(logits["real"]["word_level"], labels["real"]["word_level"])

    # Real images should be classified as real, both unconditionally and
    # conditioned on the caption.
    uncond_loss = bce_logits(logits["real"]["uncond"], labels["real"]["image"])
    cond_loss = bce_logits(logits["real"]["cond"], labels["real"]["image"])
    tot_loss = (uncond_loss + cond_loss) / 2.0

    # Fake images should be classified as fake; averaged over the two terms
    # to mirror the real-image branch.
    fake_uncond_loss = bce_logits(logits["fake"]["uncond"], labels["fake"]["image"])
    fake_cond_loss = bce_logits(logits["fake"]["cond"], labels["fake"]["image"])
    tot_loss += (fake_uncond_loss + fake_cond_loss) / 2.0

    tot_loss += word_loss
    return tot_loss
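

# A minimal usage sketch for discriminator_loss (not part of the training
# code). Sizes and label values are illustrative; note the word-level logits
# go through nn.BCELoss, so they must already be probabilities in [0, 1].
def _example_discriminator_loss() -> None:
    batch, seq_len = 4, 12
    logits = {
        "real": {
            "word_level": torch.rand(batch, seq_len),  # probabilities
            "uncond": torch.randn(batch, 1),
            "cond": torch.randn(batch, 1),
        },
        "fake": {
            "uncond": torch.randn(batch, 1),
            "cond": torch.randn(batch, 1),
        },
    }
    labels = {
        "real": {
            "word_level": torch.ones(batch, seq_len),
            "image": torch.ones(batch, 1),
        },
        "fake": {"image": torch.zeros(batch, 1)},
    }
    loss = discriminator_loss(logits, labels)
    assert loss.dim() == 0

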
def kl_loss(mu_tensor: torch.Tensor, logvar: torch.Tensor) -> Any:
    """
    Calculate the KL divergence from the standard normal prior

    :param torch.Tensor mu_tensor: Mean of latent distribution
    :param torch.Tensor logvar: Log variance of latent distribution
    :return: KL loss [-0.5 * (1 + log(sigma^2) - mu^2 - sigma^2)]
    :rtype: Any
    """
    return torch.mean(-0.5 * (1 + logvar - mu_tensor.pow(2) - torch.exp(logvar)))
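

# Sanity check for kl_loss (illustrative sizes): the closed form should agree
# with torch.distributions for a diagonal Gaussian against a N(0, I) prior.
def _example_kl_loss() -> None:
    mu = torch.randn(4, 100)
    logvar = torch.randn(4, 100)
    posterior = torch.distributions.Normal(mu, torch.exp(0.5 * logvar))
    prior = torch.distributions.Normal(torch.zeros_like(mu), torch.ones_like(mu))
    reference = torch.distributions.kl_divergence(posterior, prior).mean()
    assert torch.allclose(kl_loss(mu, logvar), reference, atol=1e-5)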