|
import logging |
|
import os |
|
from pathlib import Path |
|
from time import sleep, time |
|
|
|
import hydra |
|
import yaml |
|
from addict import Dict |
|
from comet_ml import ExistingExperiment, Experiment |
|
from omegaconf import OmegaConf |
|
|
|
from climategan.trainer import Trainer |
|
from climategan.utils import ( |
|
comet_kwargs, |
|
copy_run_files, |
|
env_to_path, |
|
find_existing_training, |
|
flatten_opts, |
|
get_existing_comet_id, |
|
get_git_branch, |
|
get_git_revision_hash, |
|
get_increased_path, |
|
kill_job, |
|
load_opts, |
|
pprint, |
|
) |
|
|
|
logging.basicConfig() |
|
logging.getLogger().setLevel(logging.ERROR) |
|
|
|
hydra_config_path = Path(__file__).resolve().parent / "shared/trainer/config.yaml" |
|
|
|
|
|
|
|
@hydra.main(config_path=hydra_config_path, strict=False)
def main(opts):
    """
    Train (or resume training of) a ClimateGAN model.

    Opts prevalence:
    1. Load file specified in args.default (or shared/trainer/defaults.yaml
        if none is provided)
    2. Update with file specified in args.config (or no update if none is provided)
    3. Update with parsed command-line arguments

    e.g.
    `python train.py args.config=config/large-lr.yaml data.loaders.batch_size=10`
    loads defaults, overrides with values in large-lr.yaml and sets batch_size to 10
    """
    # Hydra hands us an OmegaConf object; convert it to an addict.Dict for
    # attribute-style access, and split off the meta "args" namespace
    # (config/default/resume/no_comet/note/comet_tags) from the model opts.
    hydra_opts = Dict(OmegaConf.to_container(opts))
    args = hydra_opts.pop("args", None)
    auto_resumed = {}  # bookkeeping, logged to comet if auto-resume kicks in

    config_path = args.config

    # Explicit resume: reuse the opts.yaml previously stored in the run's
    # output dir so the run continues with the exact options it started with.
    if hydra_opts.train.resume:
        out_ = str(env_to_path(hydra_opts.output_path))
        config_path = Path(out_) / "opts.yaml"
        if not config_path.exists():
            config_path = None
            print("WARNING: could not reuse the opts in {}".format(out_))

    default = args.default or Path(__file__).parent / "shared/trainer/defaults.yaml"

    # Merge order (lowest to highest precedence): defaults file, config
    # file, command-line overrides — see docstring.
    opts = load_opts(config_path, default=default, commandline_opts=hydra_opts)
    if args.resume:
        opts.train.resume = True

    # Record SLURM context (both are None outside a SLURM job) and expand
    # environment variables (e.g. $SLURM_TMPDIR) in the output path.
    opts.jobID = os.environ.get("SLURM_JOBID")
    opts.slurm_partition = os.environ.get("SLURM_JOB_PARTITION")
    opts.output_path = str(env_to_path(opts.output_path))
    print("Config output_path:", opts.output_path)

    exp = comet_previous_id = None

    # Auto-resume: when not explicitly resuming but auto_resume is enabled,
    # look for a matching previous run and, if found, continue it in place.
    if not opts.train.resume and opts.train.auto_resume:
        print("\n\nTrying to auto-resume...")
        existing_path = find_existing_training(opts)
        if existing_path is not None and existing_path.exists():
            auto_resumed["original output_path"] = str(opts.output_path)
            auto_resumed["existing_path"] = str(existing_path)
            opts.train.resume = True
            opts.output_path = str(existing_path)

    # Fresh run: pick a non-clobbering output dir (presumably a suffix is
    # incremented if the path already exists — see get_increased_path).
    if not opts.train.resume:
        opts.output_path = str(get_increased_path(opts.output_path))
    Path(opts.output_path).mkdir(parents=True, exist_ok=True)

    # Snapshot run files into the output dir for reproducibility.
    copy_run_files(opts)

    # Record the exact code version used for this run.
    opts.git_hash = get_git_revision_hash()
    opts.git_branch = get_git_branch()

    # Comet.ml experiment setup (skipped entirely with args.no_comet=True).
    if not args.no_comet:

        # Resuming: re-attach to the comet experiment whose id was stored
        # alongside the run's outputs, if it can be recovered.
        if opts.train.resume:

            assert Path(opts.output_path).exists(), "Output_path does not exist"

            comet_previous_id = get_existing_comet_id(opts.output_path)

            if comet_previous_id is None:
                print("WARNING could not retreive previous comet id")
                print(f"from {opts.output_path}")
            else:
                print("Continuing previous experiment", comet_previous_id)
                auto_resumed["continuing exp id"] = comet_previous_id
                exp = ExistingExperiment(
                    previous_experiment=comet_previous_id, **comet_kwargs
                )
                print("Comet Experiment resumed")

        # No previous experiment found (or fresh run): start a new one and
        # upload the code folder and this script as assets.
        if exp is None:

            print("Starting new experiment")
            exp = Experiment(project_name="climategan", **comet_kwargs)
            exp.log_asset_folder(
                str(Path(__file__).parent / "climategan"),
                recursive=True,
                log_file_name=True,
            )
            exp.log_asset(str(Path(__file__)))

        if args.note:
            exp.log_parameter("note", args.note)

        # Merge CLI tags with config tags, always tagging the git branch.
        if args.comet_tags or opts.comet.tags:
            tags = set([f"branch:{opts.git_branch}"])
            if args.comet_tags:
                tags.update(args.comet_tags)
            if opts.comet.tags:
                tags.update(opts.comet.tags)
            opts.comet.tags = list(tags)
            print("Logging to comet.ml with tags", opts.comet.tags)
            exp.add_tags(opts.comet.tags)

        # Log the flattened opts, plus the auto-resume trail if any.
        exp.log_parameters(flatten_opts(opts))
        if auto_resumed:
            exp.log_text("\n".join(f"{k:20}: {v}" for k, v in auto_resumed.items()))

        # NOTE(review): presumably gives comet a moment to register the
        # experiment before its url is read below — confirm.
        sleep(1)

        # Persist the experiment url next to the run outputs (path is
        # incremented so resumed runs keep a history of urls).
        url_path = get_increased_path(Path(opts.output_path) / "comet_url.txt")
        with open(url_path, "w") as f:
            f.write(exp.url)

    # Persist the final merged opts so the run can be resumed later
    # (consumed by the train.resume branch above on the next invocation).
    opts_path = get_increased_path(Path(opts.output_path) / "opts.yaml")
    with (opts_path).open("w") as f:
        yaml.safe_dump(opts.to_dict(), f)

    pprint("Running model in", opts.output_path)

    # Build the trainer (exp may be None when comet is disabled), stamp the
    # start time, then set up and run training.
    trainer = Trainer(opts, comet_exp=exp, verbose=1)
    trainer.logger.time.start_time = time()
    trainer.setup()
    trainer.train()

    pprint("Done training")
    # Release the SLURM allocation once training completes (jobID is None
    # outside SLURM — TODO confirm kill_job is a no-op in that case).
    kill_job(opts.jobID)
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: Hydra parses the config file and CLI overrides into
    # `opts` before invoking main().
    main()
|
|