import argparse
import os.path as osp
import json
import torch
import clip
from lib import *
from lib import GENFORCE_MODELS, STYLEGAN_LAYERS, SEMANTIC_DIPOLES_CORPORA
from models.load_generator import load_generator


def main():
"""ContraCLIP -- Training script.
Options:
===[ GAN Generator (G) ]========================================================================================
--gan : set pre-trained GAN generator (see GENFORCE_MODELS in lib/config.py)
--stylegan-space : set StyleGAN's latent space (Z, W, or W+) in which to look for interpretable paths
TODO: add style space S
--stylegan-layer : set the last StyleGAN layer to be used for learning latent paths
E.g., if --stylegan-layer=11, then interpretable paths will be learnt in a
(12 * 512)-dimensional latent space.
--truncation : set W-space truncation parameter. If set, W-space codes will be truncated
===[ Corpus Support Sets (CSS) ]================================================================================
--corpus : choose corpus of semantic dipoles (see SEMANTIC_DIPOLES_CORPORA in lib/config.py).
The number of elements of the tuple SEMANTIC_DIPOLES_CORPORA[args.corpus] defines
the number of latent support sets; i.e., the number of warping functions -- the
number of interpretable latent paths to be optimised
TODO: read corpus from input file
--css-beta : set beta parameter for fixing CLIP space RBFs' gamma parameters
(0.25 <= css_beta < 1.0)
--styleclip : use StyleCLIP approach for calculating image-text similarity
--linear : use the vector connecting the poles of the dipole for calculating image-text
similarity
===[ Latent Support Sets (LSS) ]================================================================================
--num-latent-support-dipoles : set number of support dipoles per support set
--lss-beta : set beta parameter for initializing latent space RBFs' gamma parameters
(0.0 < lss_beta < 1.0)
--lr : set learning rate for learning the latent support sets LSS (with Adam optimizer)
--min-shift-magnitude : set minimum latent shift magnitude
--max-shift-magnitude : set maximum latent shift magnitude
===[ CLIP ]=====================================================================================================
(no options -- the pre-trained ViT-B/32 CLIP model is always used)
===[ Training ]=================================================================================================
--max-iter : set maximum number of training iterations
--batch-size : set training batch size
--loss : set loss function ('cossim', 'contrastive')
--temperature : set contrastive loss temperature
--log-freq : set number of iterations per log
--ckp-freq : set number of iterations per model checkpoint
===[ CUDA ]=====================================================================================================
--cuda : use CUDA during training (default)
--no-cuda : do NOT use CUDA during training
================================================================================================================
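Example (illustrative invocation -- valid --gan and --corpus values are those defined in
GENFORCE_MODELS and SEMANTIC_DIPOLES_CORPORA in lib/config.py):
python train.py --gan=stylegan2_ffhq1024 --truncation=0.7 --stylegan-space=W+ --stylegan-layer=11 \
--corpus=expressions3 --num-latent-support-dipoles=32 --loss=contrastive --temperature=0.5 \
--batch-size=3 --max-iter=10000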
"""
parser = argparse.ArgumentParser(description="ContraCLIP training script")
# === Experiment ID ============================================================================================== #
parser.add_argument('--exp-id', type=str, default='', help="set optional experiment ID")
# === Pre-trained GAN Generator (G) ============================================================================== #
parser.add_argument('--gan', type=str, choices=GENFORCE_MODELS.keys(), help='GAN generator model')
parser.add_argument('--stylegan-space', type=str, default='Z', choices=('Z', 'W', 'W+'),
help="StyleGAN's latent space")
parser.add_argument('--stylegan-layer', type=int, default=11, choices=range(18),
help="choose up to which StyleGAN's layer to use for learning latent paths")
parser.add_argument('--truncation', type=float, help="latent code sampling truncation parameter")
# === Corpus Support Sets (CSS) ================================================================================== #
parser.add_argument('--corpus', type=str, required=True, choices=SEMANTIC_DIPOLES_CORPORA.keys(),
help="choose corpus of semantic dipoles")
parser.add_argument('--css-beta', type=float, default=0.5,
help="set beta parameter for initializing CLIP space RBFs' gamma parameters "
"(0.25 <= css_beta < 1.0)")
parser.add_argument('--styleclip', action='store_true',
help="use StyleCLIP approach for calculating image-text similarity")
parser.add_argument('--linear', action='store_true',
help="use the vector connecting the poles of the dipole for calculating image-text similarity")
# === Latent Support Sets (LSS) ================================================================================== #
parser.add_argument('--num-latent-support-dipoles', type=int, help="number of latent support dipoles per support set")
parser.add_argument('--lss-beta', type=float, default=0.1,
help="set beta parameter for initializing latent space RBFs' gamma parameters "
"(0.25 < css_beta < 1.0)")
parser.add_argument('--lr', type=float, default=1e-4, help="latent support sets LSS learning rate")
parser.add_argument('--min-shift-magnitude', type=float, default=0.25, help="minimum latent shift magnitude")
parser.add_argument('--max-shift-magnitude', type=float, default=0.45, help="maximum latent shift magnitude")
# === Training =================================================================================================== #
parser.add_argument('--max-iter', type=int, default=10000, help="maximum number of training iterations")
parser.add_argument('--batch-size', type=int, required=True, help="training batch size -- this should be less than "
"or equal to the size of the given corpus")
parser.add_argument('--loss', type=str, default='cossim', choices=('cossim', 'contrastive'),
help="loss function")
parser.add_argument('--temperature', type=float, default=1.0, help="contrastive temperature")
parser.add_argument('--log-freq', default=10, type=int, help='number of iterations per log')
parser.add_argument('--ckp-freq', default=1000, type=int, help='number of iterations per model checkpoint')
# === CUDA ======================================================================================================= #
parser.add_argument('--cuda', dest='cuda', action='store_true', help="use CUDA during training")
parser.add_argument('--no-cuda', dest='cuda', action='store_false', help="do NOT use CUDA during training")
parser.set_defaults(cuda=True)
# ================================================================================================================ #
# Parse given arguments
args = parser.parse_args()
# Check given batch size
if args.batch_size > len(SEMANTIC_DIPOLES_CORPORA[args.corpus]):
print("*** WARNING ***: Given batch size ({}) is greater than the size of the given corpus ({})\n"
" Set batch size to {}".format(
args.batch_size, len(SEMANTIC_DIPOLES_CORPORA[args.corpus]),
len(SEMANTIC_DIPOLES_CORPORA[args.corpus])))
args.batch_size = len(SEMANTIC_DIPOLES_CORPORA[args.corpus])
# Check StyleGAN's layer
if 'stylegan' in args.gan:
if (args.stylegan_layer < 0) or (args.stylegan_layer > STYLEGAN_LAYERS[args.gan]-1):
raise ValueError("Invalid stylegan_layer for given GAN ({}). Choose between 0 and {}".format(
args.gan, STYLEGAN_LAYERS[args.gan]-1))
# Create output dir and save current arguments
exp_dir = create_exp_dir(args)
# CUDA
use_cuda = False
multi_gpu = False
if torch.cuda.is_available():
if args.cuda:
use_cuda = True
# NOTE: torch.set_default_tensor_type() is deprecated in recent PyTorch releases; it is
# kept here since this codebase was written against the older PyTorch API.
torch.set_default_tensor_type('torch.cuda.FloatTensor')
if torch.cuda.device_count() > 1:
multi_gpu = True
else:
print("*** WARNING ***: It looks like you have a CUDA device, but aren't using CUDA.\n"
" Run with --cuda for optimal training speed.")
torch.set_default_tensor_type('torch.FloatTensor')
else:
torch.set_default_tensor_type('torch.FloatTensor')
# Build GAN generator model and load with pre-trained weights
print("#. Build GAN generator model G and load with pre-trained weights...")
print(" \\__GAN generator : {} (res: {})".format(args.gan, GENFORCE_MODELS[args.gan][1]))
print(" \\__Pre-trained weights: {}".format(GENFORCE_MODELS[args.gan][0]))
G = load_generator(model_name=args.gan,
latent_is_w=('stylegan' in args.gan) and ('W' in args.stylegan_space),
verbose=True).eval()
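# With latent_is_w=True, the generator is (presumably) loaded so that it directly consumes
# latent codes from StyleGAN's W/W+ space, i.e., codes taken after the mapping network.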
# Upload GAN generator model to GPU
if use_cuda:
G = G.cuda()
# Build pretrained CLIP model
print("#. Build pretrained CLIP model...")
clip_model, _ = clip.load("ViT-B/32", device='cuda' if use_cuda else 'cpu', jit=False)
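# clip.load() returns the model in half precision when loaded on a CUDA device; cast it
# to fp32 so that subsequent computations through CLIP are done in full precision.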
clip_model.float()
clip_model.eval()
# Get CLIP (non-normalized) text features for the prompts of the given corpus
prompt_f = PromptFeatures(prompt_corpus=SEMANTIC_DIPOLES_CORPORA[args.corpus], clip_model=clip_model)
prompt_features = prompt_f.get_prompt_features()
# Build Corpus Support Sets model CSS
print("#. Build Corpus Support Sets CSS...")
print(" \\__Number of corpus support sets : {}".format(prompt_f.num_prompts))
print(" \\__Number of corpus support dipoles : {}".format(1))
print(" \\__Prompt features dim : {}".format(prompt_f.prompt_features_dim))
print(" \\__Text RBF beta param : {}".format(args.css_beta))
CSS = SupportSets(prompt_features=prompt_features, css_beta=args.css_beta)
# Count number of trainable parameters
CSS_trainable_parameters = sum(p.numel() for p in CSS.parameters() if p.requires_grad)
print(" \\__Trainable parameters: {:,}".format(CSS_trainable_parameters))
# Set support vector dimensionality and initial gamma param
support_vectors_dim = G.dim_z
if ('stylegan' in args.gan) and (args.stylegan_space == 'W+'):
support_vectors_dim *= (args.stylegan_layer + 1)
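# e.g., for dim_z = 512 and --stylegan-layer=11, support vectors live in a
# (12 * 512) = 6144-dimensional space (cf. the docstring above)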
# Get Jung radii
with open(osp.join('models', 'jung_radii.json'), 'r') as f:
jung_radii_dict = json.load(f)
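# Note: 'W+' must be tested before 'W' below, since 'W' is a substring of 'W+'. For StyleGAN
# generators, the Jung radius is given by a linear model lm = (slope, intercept) in the
# truncation parameter.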
if 'stylegan' in args.gan:
if 'W+' in args.stylegan_space:
lm = jung_radii_dict[args.gan]['W']['{}'.format(args.stylegan_layer)]
elif 'W' in args.stylegan_space:
lm = jung_radii_dict[args.gan]['W']['0']
else:
lm = jung_radii_dict[args.gan]['Z']
# If --truncation is not given, assume no truncation (psi = 1.0) so that the linear model
# can still be evaluated instead of failing on a None value.
truncation = args.truncation if args.truncation is not None else 1.0
jung_radius = lm[0] * truncation + lm[1]
else:
jung_radius = jung_radii_dict[args.gan]['Z'][1]
# Build Latent Support Sets model LSS
print("#. Build Latent Support Sets LSS...")
print(" \\__Number of latent support sets : {}".format(prompt_f.num_prompts))
print(" \\__Number of latent support dipoles : {}".format(args.num_latent_support_dipoles))
print(" \\__Support Vectors dim : {}".format(support_vectors_dim))
print(" \\__Latent RBF beta param (lss-beta) : {}".format(args.lss_beta))
print(" \\__Jung radius : {}".format(jung_radius))
LSS = SupportSets(num_support_sets=prompt_f.num_prompts,
num_support_dipoles=args.num_latent_support_dipoles,
support_vectors_dim=support_vectors_dim,
lss_beta=args.lss_beta,
jung_radius=jung_radius)
# Count number of trainable parameters
LSS_trainable_parameters = sum(p.numel() for p in LSS.parameters() if p.requires_grad)
print(" \\__Trainable parameters: {:,}".format(LSS_trainable_parameters))
# Set up trainer
print("#. Experiment: {}".format(exp_dir))
t = Trainer(params=args, exp_dir=exp_dir, use_cuda=use_cuda, multi_gpu=multi_gpu)
# Train
t.train(generator=G, latent_support_sets=LSS, corpus_support_sets=CSS, clip_model=clip_model)


if __name__ == '__main__':
main()