Spaces:

NimaBoscarino
/

climategan

Runtime error

App Files Files Community

climategan / train.py

NimaBoscarino

copy the climategan repo in here

6e601ed over 3 years ago

raw

history blame contribute delete

6.37 kB

	import logging
	import os
	from pathlib import Path
	from time import sleep, time

	import hydra
	import yaml
	from addict import Dict
	from comet_ml import ExistingExperiment, Experiment
	from omegaconf import OmegaConf

	from climategan.trainer import Trainer
	from climategan.utils import (
	comet_kwargs,
	copy_run_files,
	env_to_path,
	find_existing_training,
	flatten_opts,
	get_existing_comet_id,
	get_git_branch,
	get_git_revision_hash,
	get_increased_path,
	kill_job,
	load_opts,
	pprint,
	)

	# Set up logging with the library defaults, then restrict the root logger
	# to ERROR and above.
	logging.basicConfig()
	logging.getLogger().setLevel(logging.ERROR)

	# Absolute path of the Hydra configuration file, located relative to this
	# script: <script dir>/shared/trainer/config.yaml
	hydra_config_path = Path(__file__).resolve().parent.joinpath(
	    "shared/trainer/config.yaml"
	)


	# requires hydra-core==0.11.3 and omegaconf==1.4.1
	@hydra.main(config_path=hydra_config_path, strict=False)
	def main(opts):
	    """
	    Build the training options, set up experiment tracking and run a Trainer.

	    Opts precedence (each step overrides the previous one):
	        1. Load file specified in args.default (or shared/trainer/defaults.yaml
	           if none is provided)
	        2. Update with file specified in args.config (or no update if none is
	           provided). When train.resume is set on the command line, the
	           opts.yaml stored in the output path is used instead, if it exists.
	        3. Update with parsed command-line arguments

	    e.g.
	    `python train.py args.config=config/large-lr.yaml data.loaders.batch_size=10`
	    loads defaults, overrides with values in large-lr.yaml and sets batch_size to 10

	    Args:
	        opts: configuration object passed in by the `hydra.main` decorator. Its
	            `args` section (config, default, resume, no_comet, note,
	            comet_tags) drives this script and is removed before the remaining
	            keys are used as command-line overrides.

	    Side effects:
	        Creates the output directory, copies the run files into it, writes
	        `opts.yaml` (and `comet_url.txt` unless args.no_comet is set) there,
	        trains the model and finally calls `kill_job` with the SLURM job id.
	    """

	    # -----------------------------
	    # -----  Parse arguments  -----
	    # -----------------------------

	    hydra_opts = Dict(OmegaConf.to_container(opts))
	    # Fall back to an empty Dict when the config has no `args` section:
	    # a bare None would make every `args.<key>` read below raise
	    # AttributeError, whereas an addict Dict returns an empty (falsy) Dict
	    # for missing keys.
	    args = hydra_opts.pop("args", None) or Dict()
	    auto_resumed = {}

	    config_path = args.config

	    if hydra_opts.train.resume:
	        # Resuming: prefer the options saved by the run being resumed
	        out_ = str(env_to_path(hydra_opts.output_path))
	        config_path = Path(out_) / "opts.yaml"
	        if not config_path.exists():
	            config_path = None
	            print("WARNING: could not reuse the opts in {}".format(out_))

	    # Anchor the fallback on the module-level, already-resolved config path
	    # instead of a fresh Path(__file__): Hydra may change the working
	    # directory before main() runs, which would break a relative __file__.
	    default = args.default or hydra_config_path.parent / "defaults.yaml"

	    # -----------------------
	    # -----  Load opts  -----
	    # -----------------------

	    opts = load_opts(config_path, default=default, commandline_opts=hydra_opts)
	    if args.resume:
	        opts.train.resume = True

	    opts.jobID = os.environ.get("SLURM_JOBID")
	    opts.slurm_partition = os.environ.get("SLURM_JOB_PARTITION")
	    opts.output_path = str(env_to_path(opts.output_path))
	    print("Config output_path:", opts.output_path)

	    exp = comet_previous_id = None

	    # -------------------------------
	    # -----  Check output_path  -----
	    # -------------------------------

	    # Auto-continue if same slurm job ID (=job was requeued)
	    if not opts.train.resume and opts.train.auto_resume:
	        print("\n\nTrying to auto-resume...")
	        existing_path = find_existing_training(opts)
	        if existing_path is not None and existing_path.exists():
	            # Keep track of what was swapped so it can be logged to comet
	            auto_resumed["original output_path"] = str(opts.output_path)
	            auto_resumed["existing_path"] = str(existing_path)
	            opts.train.resume = True
	            opts.output_path = str(existing_path)

	    # Still not resuming: creating new output path
	    if not opts.train.resume:
	        opts.output_path = str(get_increased_path(opts.output_path))
	    # exist_ok: a resumed run re-uses a directory that is already there
	    Path(opts.output_path).mkdir(parents=True, exist_ok=True)

	    # Copy the opts's sbatch_file to output_path
	    copy_run_files(opts)
	    # store git hash
	    opts.git_hash = get_git_revision_hash()
	    opts.git_branch = get_git_branch()

	    if not args.no_comet:
	        # ----------------------------------
	        # -----  Set Comet Experiment  -----
	        # ----------------------------------

	        if opts.train.resume:
	            # Is resuming: get existing comet exp id
	            assert Path(opts.output_path).exists(), "Output_path does not exist"

	            comet_previous_id = get_existing_comet_id(opts.output_path)
	            # Continue existing experiment
	            if comet_previous_id is None:
	                # No id found: `exp` stays None and a new experiment is
	                # created below
	                print("WARNING could not retreive previous comet id")
	                print(f"from {opts.output_path}")
	            else:
	                print("Continuing previous experiment", comet_previous_id)
	                auto_resumed["continuing exp id"] = comet_previous_id
	                exp = ExistingExperiment(
	                    previous_experiment=comet_previous_id, **comet_kwargs
	                )
	                print("Comet Experiment resumed")

	        if exp is None:
	            # Create new experiment
	            print("Starting new experiment")
	            exp = Experiment(project_name="climategan", **comet_kwargs)
	            # Upload the package sources and this script with the experiment.
	            # NOTE(review): these paths still derive from a bare __file__;
	            # confirm they resolve as intended if Hydra changes the cwd.
	            exp.log_asset_folder(
	                str(Path(__file__).parent / "climategan"),
	                recursive=True,
	                log_file_name=True,
	            )
	            exp.log_asset(str(Path(__file__)))

	        # Log note
	        if args.note:
	            exp.log_parameter("note", args.note)

	        # Merge and log tags
	        if args.comet_tags or opts.comet.tags:
	            # A set removes duplicates between command-line and config tags;
	            # the git branch is always added as a tag of its own.
	            tags = {f"branch:{opts.git_branch}"}
	            if args.comet_tags:
	                tags.update(args.comet_tags)
	            if opts.comet.tags:
	                tags.update(opts.comet.tags)
	            opts.comet.tags = list(tags)
	            print("Logging to comet.ml with tags", opts.comet.tags)
	            exp.add_tags(opts.comet.tags)

	        # Log all opts
	        exp.log_parameters(flatten_opts(opts))
	        if auto_resumed:
	            exp.log_text("\n".join(f"{k:20}: {v}" for k, v in auto_resumed.items()))

	        # allow some time for comet to get its url
	        sleep(1)

	        # Save comet exp url
	        url_path = get_increased_path(Path(opts.output_path) / "comet_url.txt")
	        with open(url_path, "w") as f:
	            f.write(exp.url)

	    # Save config file (get_increased_path is applied to the target first,
	    # as for the comet url file)
	    opts_path = get_increased_path(Path(opts.output_path) / "opts.yaml")
	    with opts_path.open("w") as f:
	        yaml.safe_dump(opts.to_dict(), f)

	    pprint("Running model in", opts.output_path)

	    # -------------------
	    # -----  Train  -----
	    # -------------------

	    # `exp` is None here when comet logging was disabled with args.no_comet
	    trainer = Trainer(opts, comet_exp=exp, verbose=1)
	    trainer.logger.time.start_time = time()
	    trainer.setup()
	    trainer.train()

	    # -----------------------------
	    # -----  End of training  -----
	    # -----------------------------

	    pprint("Done training")
	    kill_job(opts.jobID)


	# Script entry point. main() is called without arguments: it is wrapped by
	# the hydra.main decorator, which is expected to supply `opts`.
	if __name__ == "__main__":
	    main()