climategan / train.py
NimaBoscarino's picture
copy the climategan repo in here
6e601ed
raw history blame
No virus
6.37 kB
import logging
import os
from pathlib import Path
from time import sleep, time
import hydra
import yaml
from addict import Dict
from comet_ml import ExistingExperiment, Experiment
from omegaconf import OmegaConf
from climategan.trainer import Trainer
from climategan.utils import (
comet_kwargs,
copy_run_files,
env_to_path,
find_existing_training,
flatten_opts,
get_existing_comet_id,
get_git_branch,
get_git_revision_hash,
get_increased_path,
kill_job,
load_opts,
pprint,
)
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)
hydra_config_path = Path(__file__).resolve().parent / "shared/trainer/config.yaml"
# requires hydra-core==0.11.3 and omegaconf==1.4.1
@hydra.main(config_path=hydra_config_path, strict=False)
def main(opts):
"""
Opts prevalence:
1. Load file specified in args.default (or shared/trainer/defaults.yaml
if none is provided)
2. Update with file specified in args.config (or no update if none is provided)
3. Update with parsed command-line arguments
e.g.
`python train.py args.config=config/large-lr.yaml data.loaders.batch_size=10`
loads defaults, overrides with values in large-lr.yaml and sets batch_size to 10
"""
# -----------------------------
# ----- Parse arguments -----
# -----------------------------
hydra_opts = Dict(OmegaConf.to_container(opts))
args = hydra_opts.pop("args", None)
auto_resumed = {}
config_path = args.config
if hydra_opts.train.resume:
out_ = str(env_to_path(hydra_opts.output_path))
config_path = Path(out_) / "opts.yaml"
if not config_path.exists():
config_path = None
print("WARNING: could not reuse the opts in {}".format(out_))
default = args.default or Path(__file__).parent / "shared/trainer/defaults.yaml"
# -----------------------
# ----- Load opts -----
# -----------------------
opts = load_opts(config_path, default=default, commandline_opts=hydra_opts)
if args.resume:
opts.train.resume = True
opts.jobID = os.environ.get("SLURM_JOBID")
opts.slurm_partition = os.environ.get("SLURM_JOB_PARTITION")
opts.output_path = str(env_to_path(opts.output_path))
print("Config output_path:", opts.output_path)
exp = comet_previous_id = None
# -------------------------------
# ----- Check output_path -----
# -------------------------------
# Auto-continue if same slurm job ID (=job was requeued)
if not opts.train.resume and opts.train.auto_resume:
print("\n\nTrying to auto-resume...")
existing_path = find_existing_training(opts)
if existing_path is not None and existing_path.exists():
auto_resumed["original output_path"] = str(opts.output_path)
auto_resumed["existing_path"] = str(existing_path)
opts.train.resume = True
opts.output_path = str(existing_path)
# Still not resuming: creating new output path
if not opts.train.resume:
opts.output_path = str(get_increased_path(opts.output_path))
Path(opts.output_path).mkdir(parents=True, exist_ok=True)
# Copy the opts's sbatch_file to output_path
copy_run_files(opts)
# store git hash
opts.git_hash = get_git_revision_hash()
opts.git_branch = get_git_branch()
if not args.no_comet:
# ----------------------------------
# ----- Set Comet Experiment -----
# ----------------------------------
if opts.train.resume:
# Is resuming: get existing comet exp id
assert Path(opts.output_path).exists(), "Output_path does not exist"
comet_previous_id = get_existing_comet_id(opts.output_path)
# Continue existing experiment
if comet_previous_id is None:
print("WARNING could not retreive previous comet id")
print(f"from {opts.output_path}")
else:
print("Continuing previous experiment", comet_previous_id)
auto_resumed["continuing exp id"] = comet_previous_id
exp = ExistingExperiment(
previous_experiment=comet_previous_id, **comet_kwargs
)
print("Comet Experiment resumed")
if exp is None:
# Create new experiment
print("Starting new experiment")
exp = Experiment(project_name="climategan", **comet_kwargs)
exp.log_asset_folder(
str(Path(__file__).parent / "climategan"),
recursive=True,
log_file_name=True,
)
exp.log_asset(str(Path(__file__)))
# Log note
if args.note:
exp.log_parameter("note", args.note)
# Merge and log tags
if args.comet_tags or opts.comet.tags:
tags = set([f"branch:{opts.git_branch}"])
if args.comet_tags:
tags.update(args.comet_tags)
if opts.comet.tags:
tags.update(opts.comet.tags)
opts.comet.tags = list(tags)
print("Logging to comet.ml with tags", opts.comet.tags)
exp.add_tags(opts.comet.tags)
# Log all opts
exp.log_parameters(flatten_opts(opts))
if auto_resumed:
exp.log_text("\n".join(f"{k:20}: {v}" for k, v in auto_resumed.items()))
# allow some time for comet to get its url
sleep(1)
# Save comet exp url
url_path = get_increased_path(Path(opts.output_path) / "comet_url.txt")
with open(url_path, "w") as f:
f.write(exp.url)
# Save config file
opts_path = get_increased_path(Path(opts.output_path) / "opts.yaml")
with (opts_path).open("w") as f:
yaml.safe_dump(opts.to_dict(), f)
pprint("Running model in", opts.output_path)
# -------------------
# ----- Train -----
# -------------------
trainer = Trainer(opts, comet_exp=exp, verbose=1)
trainer.logger.time.start_time = time()
trainer.setup()
trainer.train()
# -----------------------------
# ----- End of training -----
# -----------------------------
pprint("Done training")
kill_job(opts.jobID)
if __name__ == "__main__":
main()