diff --git a/train_enc_dc_f32c32_EqM/.hydra/config.yaml b/train_enc_dc_f32c32_EqM/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b86341adfde2c3e25fbcd849ac0bc9f926acb55c --- /dev/null +++ b/train_enc_dc_f32c32_EqM/.hydra/config.yaml @@ -0,0 +1,52 @@ +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_EqM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 60 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 diff --git a/train_enc_dc_f32c32_EqM/.hydra/hydra.yaml b/train_enc_dc_f32c32_EqM/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..50aef4d052a1b83f72b4c0af43e91c0923f87255 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/.hydra/hydra.yaml @@ -0,0 +1,172 @@ +hydra: + run: + dir: ${run_dir} + sweep: + dir: ${run_dir} + subdir: multirun_${hydra:job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + colorlog: + (): colorlog.ColoredFormatter + format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] + - %(message)s' + log_colors: + DEBUG: purple + INFO: green + WARNING: yellow + ERROR: red + CRITICAL: red + handlers: + console: + class: logging.StreamHandler + formatter: colorlog + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra:runtime.output_dir}/${hydra:job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - run_name=train_enc_dc_f32c32_EqM + - dataset.im_size=128 + - dataset.aug_scale=2 + - training.epochs=60 + - dc_ssdae.encoder_train=true + job: + name: main + chdir: null + override_dirname: dataset.aug_scale=2,dataset.im_size=128,dc_ssdae.encoder_train=true,run_name=train_enc_dc_f32c32_EqM,training.epochs=60 + id: ??? + num: ??? + config_name: dc_f32c32_EqM + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /workspace/DC_SSDAE + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /workspace/DC_SSDAE/config + schema: file + provider: main + - path: hydra_plugins.hydra_colorlog.conf + schema: pkg + provider: hydra-colorlog + - path: '' + schema: structured + provider: schema + output_dir: /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: colorlog + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/train_enc_dc_f32c32_EqM/.hydra/overrides.yaml b/train_enc_dc_f32c32_EqM/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7d8a182062cd4caf54f478a8db6e9d66cf5dc00c --- /dev/null +++ b/train_enc_dc_f32c32_EqM/.hydra/overrides.yaml @@ -0,0 +1,5 @@ +- run_name=train_enc_dc_f32c32_EqM +- dataset.im_size=128 +- dataset.aug_scale=2 +- training.epochs=60 +- dc_ssdae.encoder_train=true diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/custom_checkpoint_0.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/custom_checkpoint_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..7936b53c26c9c1c5e207d297cc10556b899c26a7 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/custom_checkpoint_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:726176b2bceed93bdc4062c280fc8c3909ab0fa49448ac117e8c666b885644bf +size 2613 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/model.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/best/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..797e509d0e45f4cc195fbbc384f78a19b1978819 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f97dec3f6a847ed655e06cece14b31d9a302b44e873cee0bb4e646213ef807d +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/model_1.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/best/model_1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cad3c7b2ebee60b6d9f3f501f7898b825e4e07a5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/model_1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac56b70c949f519090239c149624309f2547b4071420c958ea5ac77a6654aca +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/model_2.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/best/model_2.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3011c561291585ae9ca12cd1c3683e6fa478a063 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/model_2.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bbab4c62906aa403b7b13d37c8ad279f930263ddb049d895cf49c6e0d26d760 +size 598032 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/model_ae.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/best/model_ae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..797e509d0e45f4cc195fbbc384f78a19b1978819 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/model_ae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f97dec3f6a847ed655e06cece14b31d9a302b44e873cee0bb4e646213ef807d +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/model_ae_ema.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/best/model_ae_ema.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cad3c7b2ebee60b6d9f3f501f7898b825e4e07a5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/model_ae_ema.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac56b70c949f519090239c149624309f2547b4071420c958ea5ac77a6654aca +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/model_aux_losses.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/best/model_aux_losses.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3011c561291585ae9ca12cd1c3683e6fa478a063 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/model_aux_losses.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bbab4c62906aa403b7b13d37c8ad279f930263ddb049d895cf49c6e0d26d760 +size 598032 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/optimizer.bin b/train_enc_dc_f32c32_EqM/checkpoints/best/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd5c24c00c35a326d1f64341166f3aa3dc16a855 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d355d619e7076fe417e23f82105fdeffb5a5ed5c96793b6d9ecebe39f18adeb +size 1938294667 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_0.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d1352a7bf899f01149b2963b0215d3d44e1657a5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4da74ebe113e8d4fabdb5ff2d4f7adb86212961727e1ef1b0e51a500f3128980 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_1.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ad8616e6d208d1501c5359cda0240c993ca2915d --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a638e16e35b2644ee8480e1e6018a28aa825be4236df698a206317d91ec5a5 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_2.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cd78b6f7c2b23f1e8bf3c9e22247e4376bd85d68 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa728dc461456dfca7829ef28594484d6fc259c19d4ba5b3f653e2b29255bc13 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_3.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..09f4ed2be978e5055b8220c7b60be7ba600d7880 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d0bc9c84ddd3eaa7aa38a00a62879a644f28a488db5044414c322d0493ec058 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_4.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cc79300ae99d3bbe61db4a568375c65cca300c8f --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68021225260db210aa55b53d8bea3c5cddff57c2a777d98f05250fbb3e220e69 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_5.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..92f0fdacb18dc4ca2bc6bf899767799b56e73c05 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aab7b9f9e7f8875e76f1a5f1c0c25f3329b6389239a4c806630e3cfcf22b7ed +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_6.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b5a9a061fa5c7e95999220480ad7382fc4e3f435 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8d5e9206b3a5e76a7f063d58911be1b35b7c5816f79a9f4aab1620c2c192f6 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_7.pkl b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b3801eee67da2e2064d77a068c95caebfbb3caf5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/best/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03fc4df777e844870cd3b8520830d35d70be3b5580abc9f2db62bdb0b1f84e26 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/custom_checkpoint_0.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/custom_checkpoint_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c4fbadeda6c17d2f4b658859f52df954976ad62d --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/custom_checkpoint_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e385120cf4f6bf4eb26b6dfc294a83606c14f52551cf3478273c70e1a3c8c9 +size 2613 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/model.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/last/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..797e509d0e45f4cc195fbbc384f78a19b1978819 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f97dec3f6a847ed655e06cece14b31d9a302b44e873cee0bb4e646213ef807d +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/model_1.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/last/model_1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cad3c7b2ebee60b6d9f3f501f7898b825e4e07a5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/model_1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac56b70c949f519090239c149624309f2547b4071420c958ea5ac77a6654aca +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/model_2.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/last/model_2.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3011c561291585ae9ca12cd1c3683e6fa478a063 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/model_2.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bbab4c62906aa403b7b13d37c8ad279f930263ddb049d895cf49c6e0d26d760 +size 598032 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/model_ae.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/last/model_ae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..797e509d0e45f4cc195fbbc384f78a19b1978819 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/model_ae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f97dec3f6a847ed655e06cece14b31d9a302b44e873cee0bb4e646213ef807d +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/model_ae_ema.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/last/model_ae_ema.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..cad3c7b2ebee60b6d9f3f501f7898b825e4e07a5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/model_ae_ema.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ac56b70c949f519090239c149624309f2547b4071420c958ea5ac77a6654aca +size 968466492 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/model_aux_losses.safetensors b/train_enc_dc_f32c32_EqM/checkpoints/last/model_aux_losses.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3011c561291585ae9ca12cd1c3683e6fa478a063 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/model_aux_losses.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bbab4c62906aa403b7b13d37c8ad279f930263ddb049d895cf49c6e0d26d760 +size 598032 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/optimizer.bin b/train_enc_dc_f32c32_EqM/checkpoints/last/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd5c24c00c35a326d1f64341166f3aa3dc16a855 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d355d619e7076fe417e23f82105fdeffb5a5ed5c96793b6d9ecebe39f18adeb +size 1938294667 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_0.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..d1352a7bf899f01149b2963b0215d3d44e1657a5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4da74ebe113e8d4fabdb5ff2d4f7adb86212961727e1ef1b0e51a500f3128980 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_1.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ad8616e6d208d1501c5359cda0240c993ca2915d --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09a638e16e35b2644ee8480e1e6018a28aa825be4236df698a206317d91ec5a5 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_2.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cd78b6f7c2b23f1e8bf3c9e22247e4376bd85d68 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa728dc461456dfca7829ef28594484d6fc259c19d4ba5b3f653e2b29255bc13 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_3.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..09f4ed2be978e5055b8220c7b60be7ba600d7880 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d0bc9c84ddd3eaa7aa38a00a62879a644f28a488db5044414c322d0493ec058 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_4.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..cc79300ae99d3bbe61db4a568375c65cca300c8f --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68021225260db210aa55b53d8bea3c5cddff57c2a777d98f05250fbb3e220e69 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_5.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..92f0fdacb18dc4ca2bc6bf899767799b56e73c05 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aab7b9f9e7f8875e76f1a5f1c0c25f3329b6389239a4c806630e3cfcf22b7ed +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_6.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b5a9a061fa5c7e95999220480ad7382fc4e3f435 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b8d5e9206b3a5e76a7f063d58911be1b35b7c5816f79a9f4aab1620c2c192f6 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_7.pkl b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..b3801eee67da2e2064d77a068c95caebfbb3caf5 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/checkpoints/last/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03fc4df777e844870cd3b8520830d35d70be3b5580abc9f2db62bdb0b1f84e26 +size 16449 diff --git a/train_enc_dc_f32c32_EqM/config.yaml b/train_enc_dc_f32c32_EqM/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b86341adfde2c3e25fbcd849ac0bc9f926acb55c --- /dev/null +++ b/train_enc_dc_f32c32_EqM/config.yaml @@ -0,0 +1,52 @@ +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_EqM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 60 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 diff --git a/train_enc_dc_f32c32_EqM/main.log b/train_enc_dc_f32c32_EqM/main.log new file mode 100644 index 0000000000000000000000000000000000000000..44ea52f93fc1e49ced8d3a6efb22541b267f8f99 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/main.log @@ -0,0 +1,1153 @@ +[2025-10-26 11:19:20,467][main][INFO] - Will write tensorboard logs inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/tensorboard_logs +[2025-10-26 11:19:20,470][main][INFO] - Runtime at /workspace/DC_SSDAE +[2025-10-26 11:19:20,472][main][INFO] - Running inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM +[2025-10-26 11:19:20,472][main][INFO] - Running args: ['main.py', 'run_name=train_enc_dc_f32c32_EqM', 'dataset.im_size=128', 'dataset.aug_scale=2', 'training.epochs=60', 'dc_ssdae.encoder_train=true'] +[2025-10-26 11:19:20,473][main][INFO] - Command: 'main.py' 'run_name=train_enc_dc_f32c32_EqM' 'dataset.im_size=128' 'dataset.aug_scale=2' 'training.epochs=60' 'dc_ssdae.encoder_train=true' +[2025-10-26 11:19:20,473][main][INFO] - Accelerator with 8 processes, running on cuda:0 +[2025-10-26 11:19:20,478][main][INFO] - Hydra configuration: +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_EqM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 60 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 + + + +[2025-10-26 11:19:33,933][main][INFO] - Loaded ImageNet dataset: {'train': Dataset ImageNet + Number of datapoints: 1279867 + Root location: ../../../imagenet_data + Split: train + StandardTransform +Transform: Compose( + RandomResize(min_size=128, max_size=256, interpolation=InterpolationMode.LANCZOS, antialias=True) + RandomCrop(size=(128, 128), pad_if_needed=False, fill=0, padding_mode=constant) + RandomHorizontalFlip(p=0.5) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + ), 'test': Dataset ImageNet + Number of datapoints: 49950 + Root location: ../../../imagenet_data + Split: validation + StandardTransform +Transform: Compose( + Resize(size=[128], interpolation=InterpolationMode.BILINEAR, antialias=True) + CenterCrop(size=(128, 128)) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + )} +[2025-10-26 11:19:49,801][main][INFO] - ae parameters count: +[2025-10-26 11:19:49,807][main][INFO] - Total: #230.9M (trainable: #230.9M) +[2025-10-26 11:19:49,808][main][INFO] - - encoder: #217.4M (trainable: #217.4M) +[2025-10-26 11:19:49,809][main][INFO] - - project_in: #1.8K (trainable: #1.8K) +[2025-10-26 11:19:49,810][main][INFO] - - stages: #216.9M (trainable: #216.9M) +[2025-10-26 11:19:49,811][main][INFO] - - project_out: #576.1K (trainable: #576.1K) +[2025-10-26 11:19:49,813][main][INFO] - - decoder: #13.5M (trainable: #13.5M) +[2025-10-26 11:19:49,813][main][INFO] - - conv_in_img: #896 (trainable: #896) +[2025-10-26 11:19:49,814][main][INFO] - - conv_in_z: #9.0K (trainable: #9.0K) +[2025-10-26 11:19:49,814][main][INFO] - - conv_in: #36.1K (trainable: #36.1K) +[2025-10-26 11:19:49,815][main][INFO] - - batch_norm_z: #64 (trainable: #64) +[2025-10-26 11:19:49,815][main][INFO] - - time_proj: #0 (trainable: #0) +[2025-10-26 11:19:49,817][main][INFO] - - time_embedding: #80.5K (trainable: #80.5K) +[2025-10-26 11:19:49,818][main][INFO] - - ada_ctx_proj: #54.1K (trainable: #54.1K) +[2025-10-26 11:19:49,819][main][INFO] - - down_blocks: #3.0M (trainable: #3.0M) +[2025-10-26 11:19:49,820][main][INFO] - - mid_block: #3.4M (trainable: #3.4M) +[2025-10-26 11:19:49,820][main][INFO] - - up_blocks: #6.9M (trainable: #6.9M) +[2025-10-26 11:19:49,821][main][INFO] - - conv_norm_out: #128 (trainable: #128) +[2025-10-26 11:19:49,821][main][INFO] - - conv_out_act: #0 (trainable: #0) +[2025-10-26 11:19:49,822][main][INFO] - - conv_out: #1.7K (trainable: #1.7K) +[2025-10-26 11:19:49,825][main][INFO] - ae: EMAWrapper( + (model): DistributedDataParallel( + (module): DC_SSDAE( + (encoder): DCEncoder( + (project_in): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (stages): ModuleList( + (0): OpSequential( + (op_list): ModuleList() + ) + (1): OpSequential( + (op_list): ModuleList( + (0-4): 5 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (5): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (2): OpSequential( + (op_list): ModuleList( + (0-9): 10 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (10): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(512, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (3): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (4): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (4): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (4): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (5): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + ) + ) + ) + (project_out): OpSequential( + (op_list): ModuleList( + (0): ConvLayer( + (conv): Conv2d(1024, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + ) + (decoder): UViTDecoder( + (conv_in_img): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in_z): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (batch_norm_z): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (time_proj): Timesteps() + (time_embedding): TimestepEmbedding( + (linear_1): Linear(in_features=64, out_features=256, bias=True) + (act): SiLU() + (linear_2): Linear(in_features=256, out_features=256, bias=True) + ) + (ada_ctx_proj): Sequential( + (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): SiLU() + (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (down_blocks): ModuleList( + (0): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (1): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(64, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (2): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(96, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (3): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + ) + ) + (mid_block): UViTMiddleTransformer( + (proj_in): Linear(in_features=160, out_features=160, bias=True) + (transformer_blocks): ModuleList( + (0-7): 8 x TransformerBlock( + (norm1): AdaLayerNorm( + (silu): SiLU() + (linear): Linear(in_features=64, out_features=320, bias=True) + (norm): LayerNorm((160,), eps=1e-05, elementwise_affine=False) + ) + (attn1): Attention( + (to_q): Linear(in_features=160, out_features=160, bias=False) + (to_k): Linear(in_features=160, out_features=160, bias=False) + (to_v): Linear(in_features=160, out_features=160, bias=False) + (out_proj): Linear(in_features=160, out_features=160, bias=True) + (out_drop): Dropout(p=0.0, inplace=False) + ) + (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True) + (ff): FeedForward( + (proj_in_act): GEGLU( + (proj): Linear(in_features=160, out_features=1280, bias=True) + ) + (drop): Dropout(p=0.0, inplace=False) + (proj_out): Linear(in_features=640, out_features=160, bias=True) + ) + (relative_position_bias): RelativePositionBias() + ) + ) + (proj_out): Linear(in_features=160, out_features=160, bias=True) + (norm): GroupNorm(32, 160, eps=1e-06, affine=True) + ) + (up_blocks): ModuleList( + (0): UpBlock2D( + (resnets): ModuleList( + (0-2): 3 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (1): UpBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (2): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (3): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + (1-2): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + ) + (conv_norm_out): GroupNorm(32, 64, eps=1e-05, affine=True) + (conv_out_act): SiLU() + (conv_out): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (ema): EMA(ema_model=DC_SSDAE, decay=0.999, start_iter=50000) +) +[2025-10-26 11:19:49,825][main][INFO] - aux_losses parameters count: +[2025-10-26 11:19:49,826][main][INFO] - Total: #96.7M (trainable: #145.9K) +[2025-10-26 11:19:49,827][main][INFO] - - repa_loss: #82.7M (trainable: #145.9K) +[2025-10-26 11:19:49,828][main][INFO] - - lpips_loss: #14.0M (trainable: #0) +[2025-10-26 11:19:49,828][main][INFO] - aux_losses: DistributedDataParallel( + (module): SSDDLosses( + (repa_loss): REPALoss( + (features_extractor): Frozen(DinoEncoder/Dinov2Model) + (repa_mlp): Sequential( + (0): Linear(in_features=160, out_features=160, bias=True) + (1): SiLU() + (2): Linear(in_features=160, out_features=768, bias=True) + ) + (repa_loss): CosineSimilarity() + ) + (lpips_loss): Frozen(LPIPS) + ) +) +[2025-10-26 11:19:49,833][main][INFO] - Optimizer for autoencoder: RAdamScheduleFree ( +Parameter Group 0 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.001 + weight_lr_power: 2.0 + weight_sum: 0.0 + +Parameter Group 1 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.0 + weight_lr_power: 2.0 + weight_sum: 0.0 +) +[2025-10-26 11:19:49,843][main][INFO] - No training state found to resume from None +[2025-10-26 11:19:49,844][main][INFO] - ====================== RUNNING TASK train +[2025-10-26 11:19:49,844][main][INFO] - Starting training +[2025-10-26 11:19:49,845][main][INFO] - Batch size of 192 (24 per GPU, 1 acumulation step(s) 8 process(es)) +[2025-10-26 11:19:49,853][main][INFO] - --- + + +[2025-10-26 11:19:49,854][main][INFO] - [T_total=00:00:29 | T_train=00:00:00] Start epoch 0 +[2025-10-26 14:25:01,522][main][INFO] - [T_total=03:05:41 | T_train=03:05:11 | T_epoch=03:05:11] End of epoch 0 (6666 steps) train loss 67151 +[2025-10-26 14:25:01,524][main][INFO] - [Epoch 0] All losses: [[diffusion=0.124278 ; kl=6.71505e+10 ; lpips=0.360362 ; repa=0.667823]] +[2025-10-26 14:28:30,738][main][INFO] - [Epoch 1] Test metrics: [[MSE=47.45 | MAE=0.161 | LPIPS=0.4364 | PSNR=13.24 | SSIM=0.2403 | dreamsim=0.6167 | FID=113.3]] +[2025-10-26 14:28:30,740][main][INFO] - [Epoch 1] Best metrics: [[min_MSE=47.45 | min_MAE=0.161 | min_LPIPS=0.4364 | max_PSNR=13.24 | max_SSIM=0.2403 | min_dreamsim=0.6167 | min_FID=113.3]] +[2025-10-26 14:28:30,741][main][DEBUG] - Writing images to disk... +[2025-10-26 14:28:31,622][main][DEBUG] - Image(s) saved on disk +[2025-10-26 14:28:31,831][main][INFO] - End of epoch timers: [T_train=03:05:11 | T_epoch=03:05:11 | T_eval=00:03:30 | T_total=03:09:11] +[2025-10-26 14:28:31,832][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-26 14:28:43,727][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-26 14:28:54,887][main][INFO] - --- + + +[2025-10-26 14:28:54,888][main][INFO] - [T_total=03:09:34 | T_train=03:05:11] Start epoch 1 +[2025-10-26 17:33:46,084][main][INFO] - [T_total=06:14:25 | T_train=06:10:02 | T_epoch=03:04:51] End of epoch 1 (13332 steps) train loss 4110.26 +[2025-10-26 17:33:46,086][main][INFO] - [Epoch 1] All losses: [[diffusion=0.0919295 ; kl=4.10988e+09 ; lpips=0.275692 ; repa=0.588433]] +[2025-10-26 17:37:12,979][main][INFO] - [Epoch 2] Test metrics: [[MSE=46.7 | MAE=0.1611 | LPIPS=0.3256 | PSNR=13.31 | SSIM=0.2891 | dreamsim=0.496 | FID=78.54]] +[2025-10-26 17:37:12,981][main][INFO] - [Epoch 2] Best metrics: [[min_MSE=46.7 | min_MAE=0.161 | min_LPIPS=0.3256 | max_PSNR=13.31 | max_SSIM=0.2891 | min_dreamsim=0.496 | min_FID=78.54]] +[2025-10-26 17:37:12,982][main][DEBUG] - Writing images to disk... +[2025-10-26 17:37:13,796][main][DEBUG] - Image(s) saved on disk +[2025-10-26 17:37:14,011][main][INFO] - End of epoch timers: [T_train=06:10:02 | T_epoch=03:04:51 | T_eval=00:06:58 | T_total=06:17:53] +[2025-10-26 17:37:14,012][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-26 17:37:25,273][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-26 17:37:35,581][main][INFO] - --- + + +[2025-10-26 17:37:35,582][main][INFO] - [T_total=06:18:15 | T_train=06:10:02] Start epoch 2 +[2025-10-26 20:42:34,608][main][INFO] - [T_total=09:23:14 | T_train=09:15:01 | T_epoch=03:04:59] End of epoch 2 (19998 steps) train loss 1112.41 +[2025-10-26 20:42:34,609][main][INFO] - [Epoch 2] All losses: [[diffusion=0.0875515 ; kl=1.11206e+09 ; lpips=0.238805 ; repa=0.559219]] +[2025-10-26 20:46:02,005][main][INFO] - [Epoch 3] Test metrics: [[MSE=39.34 | MAE=0.1462 | LPIPS=0.2609 | PSNR=14.05 | SSIM=0.3195 | dreamsim=0.4047 | FID=56.04]] +[2025-10-26 20:46:02,007][main][INFO] - [Epoch 3] Best metrics: [[min_MSE=39.34 | min_MAE=0.1462 | min_LPIPS=0.2609 | max_PSNR=14.05 | max_SSIM=0.3195 | min_dreamsim=0.4047 | min_FID=56.04]] +[2025-10-26 20:46:02,007][main][DEBUG] - Writing images to disk... +[2025-10-26 20:46:02,818][main][DEBUG] - Image(s) saved on disk +[2025-10-26 20:46:03,028][main][INFO] - End of epoch timers: [T_train=09:15:01 | T_epoch=03:04:59 | T_eval=00:10:26 | T_total=09:26:42] +[2025-10-26 20:46:03,029][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-26 20:46:14,286][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-26 20:46:24,572][main][INFO] - --- + + +[2025-10-26 20:46:24,573][main][INFO] - [T_total=09:27:04 | T_train=09:15:01] Start epoch 3 +[2025-10-26 23:51:05,689][main][INFO] - [T_total=12:31:45 | T_train=12:19:43 | T_epoch=03:04:41] End of epoch 3 (26664 steps) train loss 5.02755 +[2025-10-26 23:51:05,690][main][INFO] - [Epoch 3] All losses: [[diffusion=0.0849642 ; kl=4.69653e+06 ; lpips=0.22171 ; repa=0.540818]] +[2025-10-26 23:54:33,185][main][INFO] - [Epoch 4] Test metrics: [[MSE=35.97 | MAE=0.1387 | LPIPS=0.2313 | PSNR=14.44 | SSIM=0.3346 | dreamsim=0.3568 | FID=45.03]] +[2025-10-26 23:54:33,187][main][INFO] - [Epoch 4] Best metrics: [[min_MSE=35.97 | min_MAE=0.1387 | min_LPIPS=0.2313 | max_PSNR=14.44 | max_SSIM=0.3346 | min_dreamsim=0.3568 | min_FID=45.03]] +[2025-10-26 23:54:33,188][main][DEBUG] - Writing images to disk... +[2025-10-26 23:54:34,013][main][DEBUG] - Image(s) saved on disk +[2025-10-26 23:54:34,260][main][INFO] - End of epoch timers: [T_train=12:19:43 | T_epoch=03:04:41 | T_eval=00:13:54 | T_total=12:35:13] +[2025-10-26 23:54:34,261][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-26 23:54:45,885][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-26 23:54:56,831][main][INFO] - --- + + +[2025-10-26 23:54:56,832][main][INFO] - [T_total=12:35:36 | T_train=12:19:43] Start epoch 4 +[2025-10-27 03:00:25,624][main][INFO] - [T_total=15:41:05 | T_train=15:25:11 | T_epoch=03:05:28] End of epoch 4 (33330 steps) train loss 166.439 +[2025-10-27 03:00:25,626][main][INFO] - [Epoch 4] All losses: [[diffusion=0.0838747 ; kl=1.66118e+08 ; lpips=0.211539 ; repa=0.528085]] +[2025-10-27 03:03:52,781][main][INFO] - [Epoch 5] Test metrics: [[MSE=31.78 | MAE=0.129 | LPIPS=0.2131 | PSNR=14.98 | SSIM=0.3511 | dreamsim=0.3263 | FID=38.77]] +[2025-10-27 03:03:52,782][main][INFO] - [Epoch 5] Best metrics: [[min_MSE=31.78 | min_MAE=0.129 | min_LPIPS=0.2131 | max_PSNR=14.98 | max_SSIM=0.3511 | min_dreamsim=0.3263 | min_FID=38.77]] +[2025-10-27 03:03:52,783][main][DEBUG] - Writing images to disk... +[2025-10-27 03:03:53,617][main][DEBUG] - Image(s) saved on disk +[2025-10-27 03:03:53,821][main][INFO] - End of epoch timers: [T_train=15:25:11 | T_epoch=03:05:28 | T_eval=00:17:22 | T_total=15:44:33] +[2025-10-27 03:03:53,821][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 03:04:05,149][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 03:04:14,375][main][INFO] - --- + + +[2025-10-27 03:04:14,376][main][INFO] - [T_total=15:44:53 | T_train=15:25:11] Start epoch 5 +[2025-10-27 06:10:26,456][main][INFO] - [T_total=18:51:05 | T_train=18:31:23 | T_epoch=03:06:12] End of epoch 5 (39996 steps) train loss 39272 +[2025-10-27 06:10:26,457][main][INFO] - [Epoch 5] All losses: [[diffusion=0.0828694 ; kl=3.92717e+10 ; lpips=0.205352 ; repa=0.518688]] +[2025-10-27 06:13:53,521][main][INFO] - [Epoch 6] Test metrics: [[MSE=29.02 | MAE=0.1223 | LPIPS=0.2011 | PSNR=15.37 | SSIM=0.3643 | dreamsim=0.3048 | FID=34.65]] +[2025-10-27 06:13:53,524][main][INFO] - [Epoch 6] Best metrics: [[min_MSE=29.02 | min_MAE=0.1223 | min_LPIPS=0.2011 | max_PSNR=15.37 | max_SSIM=0.3643 | min_dreamsim=0.3048 | min_FID=34.65]] +[2025-10-27 06:13:53,525][main][DEBUG] - Writing images to disk... +[2025-10-27 06:13:54,357][main][DEBUG] - Image(s) saved on disk +[2025-10-27 06:13:54,564][main][INFO] - End of epoch timers: [T_train=18:31:23 | T_epoch=03:06:12 | T_eval=00:20:50 | T_total=18:54:34] +[2025-10-27 06:13:54,565][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 06:14:09,698][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 06:14:20,685][main][INFO] - --- + + +[2025-10-27 06:14:20,686][main][INFO] - [T_total=18:55:00 | T_train=18:31:23] Start epoch 6 +[2025-10-27 09:20:11,071][main][INFO] - [T_total=22:00:50 | T_train=21:37:14 | T_epoch=03:05:50] End of epoch 6 (46662 steps) train loss 38.9426 +[2025-10-27 09:20:11,072][main][INFO] - [Epoch 6] All losses: [[diffusion=0.0819753 ; kl=3.86332e+07 ; lpips=0.199326 ; repa=0.510736]] +[2025-10-27 09:23:38,395][main][INFO] - [Epoch 7] Test metrics: [[MSE=27.01 | MAE=0.1173 | LPIPS=0.191 | PSNR=15.68 | SSIM=0.3792 | dreamsim=0.2861 | FID=30.48]] +[2025-10-27 09:23:38,397][main][INFO] - [Epoch 7] Best metrics: [[min_MSE=27.01 | min_MAE=0.1173 | min_LPIPS=0.191 | max_PSNR=15.68 | max_SSIM=0.3792 | min_dreamsim=0.2861 | min_FID=30.48]] +[2025-10-27 09:23:38,398][main][DEBUG] - Writing images to disk... +[2025-10-27 09:23:39,236][main][DEBUG] - Image(s) saved on disk +[2025-10-27 09:23:39,501][main][INFO] - End of epoch timers: [T_train=21:37:14 | T_epoch=03:05:50 | T_eval=00:24:19 | T_total=22:04:19] +[2025-10-27 09:23:39,505][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 09:23:49,578][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 09:23:58,982][main][INFO] - --- + + +[2025-10-27 09:23:58,983][main][INFO] - [T_total=22:04:38 | T_train=21:37:14] Start epoch 7 +[2025-10-27 12:29:38,691][main][INFO] - [T_total=25:10:18 | T_train=24:42:53 | T_epoch=03:05:39] End of epoch 7 (53328 steps) train loss 21.8781 +[2025-10-27 12:29:38,692][main][INFO] - [Epoch 7] All losses: [[diffusion=0.0808239 ; kl=2.15753e+07 ; lpips=0.192459 ; repa=0.50294]] +[2025-10-27 12:33:06,379][main][INFO] - [Epoch 8] Test metrics: [[MSE=25.89 | MAE=0.1143 | LPIPS=0.1838 | PSNR=15.87 | SSIM=0.3878 | dreamsim=0.273 | FID=27.51]] +[2025-10-27 12:33:06,381][main][INFO] - [Epoch 8] Best metrics: [[min_MSE=25.89 | min_MAE=0.1143 | min_LPIPS=0.1838 | max_PSNR=15.87 | max_SSIM=0.3878 | min_dreamsim=0.273 | min_FID=27.51]] +[2025-10-27 12:33:06,382][main][DEBUG] - Writing images to disk... +[2025-10-27 12:33:07,208][main][DEBUG] - Image(s) saved on disk +[2025-10-27 12:33:07,412][main][INFO] - End of epoch timers: [T_train=24:42:53 | T_epoch=03:05:39 | T_eval=00:27:47 | T_total=25:13:46] +[2025-10-27 12:33:07,414][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 12:33:18,072][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 12:33:28,529][main][INFO] - --- + + +[2025-10-27 12:33:28,530][main][INFO] - [T_total=25:14:08 | T_train=24:42:53] Start epoch 8 +[2025-10-27 15:39:17,310][main][INFO] - [T_total=28:19:56 | T_train=27:48:42 | T_epoch=03:05:48] End of epoch 8 (59994 steps) train loss 56.025 +[2025-10-27 15:39:17,312][main][INFO] - [Epoch 8] All losses: [[diffusion=0.0821982 ; kl=5.5718e+07 ; lpips=0.19805 ; repa=0.503114]] +[2025-10-27 15:42:44,898][main][INFO] - [Epoch 9] Test metrics: [[MSE=25.28 | MAE=0.1125 | LPIPS=0.1792 | PSNR=15.97 | SSIM=0.3941 | dreamsim=0.2633 | FID=25.14]] +[2025-10-27 15:42:44,902][main][INFO] - [Epoch 9] Best metrics: [[min_MSE=25.28 | min_MAE=0.1125 | min_LPIPS=0.1792 | max_PSNR=15.97 | max_SSIM=0.3941 | min_dreamsim=0.2633 | min_FID=25.14]] +[2025-10-27 15:42:44,903][main][DEBUG] - Writing images to disk... +[2025-10-27 15:42:45,999][main][DEBUG] - Image(s) saved on disk +[2025-10-27 15:42:46,279][main][INFO] - End of epoch timers: [T_train=27:48:42 | T_epoch=03:05:48 | T_eval=00:31:16 | T_total=28:23:25] +[2025-10-27 15:42:46,281][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 15:42:57,387][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 15:43:08,322][main][INFO] - --- + + +[2025-10-27 15:43:08,323][main][INFO] - [T_total=28:23:47 | T_train=27:48:42] Start epoch 9 +[2025-10-27 18:48:50,025][main][INFO] - [T_total=31:29:29 | T_train=30:54:24 | T_epoch=03:05:41] End of epoch 9 (66660 steps) train loss 66.697 +[2025-10-27 18:48:50,026][main][INFO] - [Epoch 9] All losses: [[diffusion=0.0810491 ; kl=6.63959e+07 ; lpips=0.191886 ; repa=0.496548]] +[2025-10-27 18:52:17,550][main][INFO] - [Epoch 10] Test metrics: [[MSE=24.51 | MAE=0.1103 | LPIPS=0.1742 | PSNR=16.11 | SSIM=0.4006 | dreamsim=0.2544 | FID=23.1]] +[2025-10-27 18:52:17,551][main][INFO] - [Epoch 10] Best metrics: [[min_MSE=24.51 | min_MAE=0.1103 | min_LPIPS=0.1742 | max_PSNR=16.11 | max_SSIM=0.4006 | min_dreamsim=0.2544 | min_FID=23.1]] +[2025-10-27 18:52:17,552][main][DEBUG] - Writing images to disk... +[2025-10-27 18:52:18,382][main][DEBUG] - Image(s) saved on disk +[2025-10-27 18:52:18,587][main][INFO] - End of epoch timers: [T_train=30:54:24 | T_epoch=03:05:41 | T_eval=00:34:44 | T_total=31:32:58] +[2025-10-27 18:52:18,589][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 18:52:30,619][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 18:52:40,962][main][INFO] - --- + + +[2025-10-27 18:52:40,963][main][INFO] - [T_total=31:33:20 | T_train=30:54:24] Start epoch 10 +[2025-10-27 21:58:04,023][main][INFO] - [T_total=34:38:43 | T_train=33:59:47 | T_epoch=03:05:23] End of epoch 10 (73326 steps) train loss 5.94657 +[2025-10-27 21:58:04,024][main][INFO] - [Epoch 10] All losses: [[diffusion=0.0795436 ; kl=5.65143e+06 ; lpips=0.186013 ; repa=0.490351]] +[2025-10-27 22:01:31,334][main][INFO] - [Epoch 11] Test metrics: [[MSE=24.04 | MAE=0.109 | LPIPS=0.1708 | PSNR=16.19 | SSIM=0.4055 | dreamsim=0.2477 | FID=21.54]] +[2025-10-27 22:01:31,336][main][INFO] - [Epoch 11] Best metrics: [[min_MSE=24.04 | min_MAE=0.109 | min_LPIPS=0.1708 | max_PSNR=16.19 | max_SSIM=0.4055 | min_dreamsim=0.2477 | min_FID=21.54]] +[2025-10-27 22:01:31,337][main][DEBUG] - Writing images to disk... +[2025-10-27 22:01:32,162][main][DEBUG] - Image(s) saved on disk +[2025-10-27 22:01:32,367][main][INFO] - End of epoch timers: [T_train=33:59:47 | T_epoch=03:05:23 | T_eval=00:38:12 | T_total=34:42:11] +[2025-10-27 22:01:32,368][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-27 22:01:42,990][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-27 22:01:52,731][main][INFO] - --- + + +[2025-10-27 22:01:52,731][main][INFO] - [T_total=34:42:32 | T_train=33:59:47] Start epoch 11 +[2025-10-28 01:06:41,641][main][INFO] - [T_total=37:47:21 | T_train=37:04:36 | T_epoch=03:04:48] End of epoch 11 (79992 steps) train loss 794.458 +[2025-10-28 01:06:41,643][main][INFO] - [Epoch 11] All losses: [[diffusion=0.0796192 ; kl=7.94164e+08 ; lpips=0.1859 ; repa=0.488066]] +[2025-10-28 01:10:08,942][main][INFO] - [Epoch 12] Test metrics: [[MSE=23.46 | MAE=0.1073 | LPIPS=0.1673 | PSNR=16.3 | SSIM=0.4107 | dreamsim=0.2413 | FID=20.23]] +[2025-10-28 01:10:08,944][main][INFO] - [Epoch 12] Best metrics: [[min_MSE=23.46 | min_MAE=0.1073 | min_LPIPS=0.1673 | max_PSNR=16.3 | max_SSIM=0.4107 | min_dreamsim=0.2413 | min_FID=20.23]] +[2025-10-28 01:10:08,945][main][DEBUG] - Writing images to disk... +[2025-10-28 01:10:09,790][main][DEBUG] - Image(s) saved on disk +[2025-10-28 01:10:09,998][main][INFO] - End of epoch timers: [T_train=37:04:36 | T_epoch=03:04:48 | T_eval=00:41:41 | T_total=37:50:49] +[2025-10-28 01:10:09,999][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 01:10:20,640][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 01:10:31,337][main][INFO] - --- + + +[2025-10-28 01:10:31,338][main][INFO] - [T_total=37:51:10 | T_train=37:04:36] Start epoch 12 +[2025-10-28 04:15:54,812][main][INFO] - [T_total=40:56:34 | T_train=40:09:59 | T_epoch=03:05:23] End of epoch 12 (86658 steps) train loss 4.51982 +[2025-10-28 04:15:54,814][main][INFO] - [Epoch 12] All losses: [[diffusion=0.0793437 ; kl=4.22649e+06 ; lpips=0.185086 ; repa=0.485754]] +[2025-10-28 04:19:22,090][main][INFO] - [Epoch 13] Test metrics: [[MSE=23.14 | MAE=0.1062 | LPIPS=0.1647 | PSNR=16.36 | SSIM=0.414 | dreamsim=0.2363 | FID=19.18]] +[2025-10-28 04:19:22,092][main][INFO] - [Epoch 13] Best metrics: [[min_MSE=23.14 | min_MAE=0.1062 | min_LPIPS=0.1647 | max_PSNR=16.36 | max_SSIM=0.414 | min_dreamsim=0.2363 | min_FID=19.18]] +[2025-10-28 04:19:22,093][main][DEBUG] - Writing images to disk... +[2025-10-28 04:19:22,937][main][DEBUG] - Image(s) saved on disk +[2025-10-28 04:19:23,137][main][INFO] - End of epoch timers: [T_train=40:09:59 | T_epoch=03:05:23 | T_eval=00:45:09 | T_total=41:00:02] +[2025-10-28 04:19:23,138][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 04:19:35,064][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 04:19:46,497][main][INFO] - --- + + +[2025-10-28 04:19:46,498][main][INFO] - [T_total=41:00:26 | T_train=40:09:59] Start epoch 13 +[2025-10-28 07:25:31,821][main][INFO] - [T_total=44:06:11 | T_train=43:15:45 | T_epoch=03:05:45] End of epoch 13 (93324 steps) train loss 5.05172 +[2025-10-28 07:25:31,823][main][INFO] - [Epoch 13] All losses: [[diffusion=0.0796848 ; kl=4.75808e+06 ; lpips=0.185526 ; repa=0.484762]] +[2025-10-28 07:28:58,913][main][INFO] - [Epoch 14] Test metrics: [[MSE=23.03 | MAE=0.106 | LPIPS=0.1633 | PSNR=16.38 | SSIM=0.4158 | dreamsim=0.2328 | FID=18.39]] +[2025-10-28 07:28:58,915][main][INFO] - [Epoch 14] Best metrics: [[min_MSE=23.03 | min_MAE=0.106 | min_LPIPS=0.1633 | max_PSNR=16.38 | max_SSIM=0.4158 | min_dreamsim=0.2328 | min_FID=18.39]] +[2025-10-28 07:28:58,916][main][DEBUG] - Writing images to disk... +[2025-10-28 07:28:59,747][main][DEBUG] - Image(s) saved on disk +[2025-10-28 07:28:59,993][main][INFO] - End of epoch timers: [T_train=43:15:45 | T_epoch=03:05:45 | T_eval=00:48:37 | T_total=44:09:39] +[2025-10-28 07:28:59,994][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 07:29:10,392][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 07:29:20,783][main][INFO] - --- + + +[2025-10-28 07:29:20,784][main][INFO] - [T_total=44:10:00 | T_train=43:15:45] Start epoch 14 +[2025-10-28 10:35:12,763][main][INFO] - [T_total=47:15:52 | T_train=46:21:37 | T_epoch=03:05:51] End of epoch 14 (99990 steps) train loss 2.56252 +[2025-10-28 10:35:12,765][main][INFO] - [Epoch 14] All losses: [[diffusion=0.0787443 ; kl=2.27203e+06 ; lpips=0.182851 ; repa=0.481282]] +[2025-10-28 10:38:39,849][main][INFO] - [Epoch 15] Test metrics: [[MSE=22.95 | MAE=0.1058 | LPIPS=0.1619 | PSNR=16.39 | SSIM=0.4198 | dreamsim=0.2294 | FID=17.64]] +[2025-10-28 10:38:39,851][main][INFO] - [Epoch 15] Best metrics: [[min_MSE=22.95 | min_MAE=0.1058 | min_LPIPS=0.1619 | max_PSNR=16.39 | max_SSIM=0.4198 | min_dreamsim=0.2294 | min_FID=17.64]] +[2025-10-28 10:38:39,853][main][DEBUG] - Writing images to disk... +[2025-10-28 10:38:40,690][main][DEBUG] - Image(s) saved on disk +[2025-10-28 10:38:40,939][main][INFO] - End of epoch timers: [T_train=46:21:37 | T_epoch=03:05:51 | T_eval=00:52:05 | T_total=47:19:20] +[2025-10-28 10:38:40,940][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 10:38:52,080][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 10:39:03,800][main][INFO] - --- + + +[2025-10-28 10:39:03,801][main][INFO] - [T_total=47:19:43 | T_train=46:21:37] Start epoch 15 +[2025-10-28 13:44:56,004][main][INFO] - [T_total=50:25:35 | T_train=49:27:29 | T_epoch=03:05:52] End of epoch 15 (106656 steps) train loss 150.527 +[2025-10-28 13:44:56,005][main][INFO] - [Epoch 15] All losses: [[diffusion=0.0778382 ; kl=1.50242e+08 ; lpips=0.177149 ; repa=0.475895]] +[2025-10-28 13:48:23,438][main][INFO] - [Epoch 16] Test metrics: [[MSE=22.91 | MAE=0.1058 | LPIPS=0.1609 | PSNR=16.4 | SSIM=0.4219 | dreamsim=0.2269 | FID=17.15]] +[2025-10-28 13:48:23,439][main][INFO] - [Epoch 16] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1609 | max_PSNR=16.4 | max_SSIM=0.4219 | min_dreamsim=0.2269 | min_FID=17.15]] +[2025-10-28 13:48:23,440][main][DEBUG] - Writing images to disk... +[2025-10-28 13:48:24,270][main][DEBUG] - Image(s) saved on disk +[2025-10-28 13:48:24,506][main][INFO] - End of epoch timers: [T_train=49:27:29 | T_epoch=03:05:52 | T_eval=00:55:33 | T_total=50:29:04] +[2025-10-28 13:48:24,507][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 13:48:36,131][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 13:48:46,875][main][INFO] - --- + + +[2025-10-28 13:48:46,876][main][INFO] - [T_total=50:29:26 | T_train=49:27:29] Start epoch 16 +[2025-10-28 16:55:33,548][main][INFO] - [T_total=53:36:13 | T_train=52:34:16 | T_epoch=03:06:46] End of epoch 16 (113322 steps) train loss 254.554 +[2025-10-28 16:55:33,549][main][INFO] - [Epoch 16] All losses: [[diffusion=0.0785965 ; kl=2.54265e+08 ; lpips=0.18152 ; repa=0.477904]] +[2025-10-28 16:59:00,697][main][INFO] - [Epoch 17] Test metrics: [[MSE=23.1 | MAE=0.1065 | LPIPS=0.16 | PSNR=16.36 | SSIM=0.425 | dreamsim=0.2245 | FID=16.63]] +[2025-10-28 16:59:00,699][main][INFO] - [Epoch 17] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.16 | max_PSNR=16.4 | max_SSIM=0.425 | min_dreamsim=0.2245 | min_FID=16.63]] +[2025-10-28 16:59:00,703][main][DEBUG] - Writing images to disk... +[2025-10-28 16:59:01,784][main][DEBUG] - Image(s) saved on disk +[2025-10-28 16:59:02,034][main][INFO] - End of epoch timers: [T_train=52:34:16 | T_epoch=03:06:46 | T_eval=00:59:02 | T_total=53:39:41] +[2025-10-28 16:59:02,035][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 16:59:13,349][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 16:59:24,416][main][INFO] - --- + + +[2025-10-28 16:59:24,417][main][INFO] - [T_total=53:40:03 | T_train=52:34:16] Start epoch 17 +[2025-10-28 20:04:55,554][main][INFO] - [T_total=56:45:35 | T_train=55:39:47 | T_epoch=03:05:31] End of epoch 17 (119988 steps) train loss 3.54179 +[2025-10-28 20:04:55,556][main][INFO] - [Epoch 17] All losses: [[diffusion=0.0778652 ; kl=3.25619e+06 ; lpips=0.178335 ; repa=0.474268]] +[2025-10-28 20:08:22,775][main][INFO] - [Epoch 18] Test metrics: [[MSE=23.09 | MAE=0.1066 | LPIPS=0.159 | PSNR=16.37 | SSIM=0.4268 | dreamsim=0.2221 | FID=16.09]] +[2025-10-28 20:08:22,777][main][INFO] - [Epoch 18] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.159 | max_PSNR=16.4 | max_SSIM=0.4268 | min_dreamsim=0.2221 | min_FID=16.09]] +[2025-10-28 20:08:22,778][main][DEBUG] - Writing images to disk... +[2025-10-28 20:08:23,864][main][DEBUG] - Image(s) saved on disk +[2025-10-28 20:08:24,074][main][INFO] - End of epoch timers: [T_train=55:39:47 | T_epoch=03:05:31 | T_eval=01:02:30 | T_total=56:49:03] +[2025-10-28 20:08:24,075][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 20:08:35,565][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 20:08:47,122][main][INFO] - --- + + +[2025-10-28 20:08:47,124][main][INFO] - [T_total=56:49:26 | T_train=55:39:47] Start epoch 18 +[2025-10-28 23:13:47,257][main][INFO] - [T_total=59:54:26 | T_train=58:44:47 | T_epoch=03:05:00] End of epoch 18 (126654 steps) train loss 8.77081 +[2025-10-28 23:13:47,258][main][INFO] - [Epoch 18] All losses: [[diffusion=0.0783209 ; kl=8.4835e+06 ; lpips=0.180506 ; repa=0.474953]] +[2025-10-28 23:17:14,615][main][INFO] - [Epoch 19] Test metrics: [[MSE=23.25 | MAE=0.1072 | LPIPS=0.1586 | PSNR=16.34 | SSIM=0.4269 | dreamsim=0.2204 | FID=15.71]] +[2025-10-28 23:17:14,617][main][INFO] - [Epoch 19] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1586 | max_PSNR=16.4 | max_SSIM=0.4269 | min_dreamsim=0.2204 | min_FID=15.71]] +[2025-10-28 23:17:14,618][main][DEBUG] - Writing images to disk... +[2025-10-28 23:17:15,450][main][DEBUG] - Image(s) saved on disk +[2025-10-28 23:17:15,693][main][INFO] - End of epoch timers: [T_train=58:44:47 | T_epoch=03:05:00 | T_eval=01:05:58 | T_total=59:57:55] +[2025-10-28 23:17:15,694][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-28 23:17:25,883][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-28 23:17:35,331][main][INFO] - --- + + +[2025-10-28 23:17:35,332][main][INFO] - [T_total=59:58:14 | T_train=58:44:47] Start epoch 19 +[2025-10-29 02:22:52,012][main][INFO] - [T_total=63:03:31 | T_train=61:50:04 | T_epoch=03:05:16] End of epoch 19 (133320 steps) train loss 65.3624 +[2025-10-29 02:22:52,014][main][INFO] - [Epoch 19] All losses: [[diffusion=0.0772616 ; kl=6.50794e+07 ; lpips=0.176332 ; repa=0.470315]] +[2025-10-29 02:26:19,233][main][INFO] - [Epoch 20] Test metrics: [[MSE=23.28 | MAE=0.1074 | LPIPS=0.1579 | PSNR=16.33 | SSIM=0.4276 | dreamsim=0.2187 | FID=15.33]] +[2025-10-29 02:26:19,250][main][INFO] - [Epoch 20] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1579 | max_PSNR=16.4 | max_SSIM=0.4276 | min_dreamsim=0.2187 | min_FID=15.33]] +[2025-10-29 02:26:19,251][main][DEBUG] - Writing images to disk... +[2025-10-29 02:26:20,086][main][DEBUG] - Image(s) saved on disk +[2025-10-29 02:26:20,336][main][INFO] - End of epoch timers: [T_train=61:50:04 | T_epoch=03:05:16 | T_eval=01:09:26 | T_total=63:06:59] +[2025-10-29 02:26:20,337][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 02:26:31,555][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 02:26:42,308][main][INFO] - --- + + +[2025-10-29 02:26:42,309][main][INFO] - [T_total=63:07:21 | T_train=61:50:04] Start epoch 20 +[2025-10-29 05:32:23,347][main][INFO] - [T_total=66:13:02 | T_train=64:55:45 | T_epoch=03:05:41] End of epoch 20 (139986 steps) train loss 0.949199 +[2025-10-29 05:32:23,349][main][INFO] - [Epoch 20] All losses: [[diffusion=0.07719 ; kl=667540 ; lpips=0.174758 ; repa=0.468358]] +[2025-10-29 05:35:50,507][main][INFO] - [Epoch 21] Test metrics: [[MSE=23.35 | MAE=0.1078 | LPIPS=0.1576 | PSNR=16.32 | SSIM=0.4284 | dreamsim=0.2173 | FID=15.02]] +[2025-10-29 05:35:50,509][main][INFO] - [Epoch 21] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1576 | max_PSNR=16.4 | max_SSIM=0.4284 | min_dreamsim=0.2173 | min_FID=15.02]] +[2025-10-29 05:35:50,510][main][DEBUG] - Writing images to disk... +[2025-10-29 05:35:51,347][main][DEBUG] - Image(s) saved on disk +[2025-10-29 05:35:51,557][main][INFO] - End of epoch timers: [T_train=64:55:45 | T_epoch=03:05:41 | T_eval=01:12:55 | T_total=66:16:31] +[2025-10-29 05:35:51,558][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 05:36:02,284][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 05:36:12,666][main][INFO] - --- + + +[2025-10-29 05:36:12,667][main][INFO] - [T_total=66:16:52 | T_train=64:55:45] Start epoch 21 +[2025-10-29 08:42:54,495][main][INFO] - [T_total=69:23:34 | T_train=68:02:26 | T_epoch=03:06:41] End of epoch 21 (146652 steps) train loss 289.216 +[2025-10-29 08:42:54,496][main][INFO] - [Epoch 21] All losses: [[diffusion=0.0776409 ; kl=2.88933e+08 ; lpips=0.176763 ; repa=0.469242]] +[2025-10-29 08:46:22,241][main][INFO] - [Epoch 22] Test metrics: [[MSE=23.57 | MAE=0.1086 | LPIPS=0.1573 | PSNR=16.28 | SSIM=0.4288 | dreamsim=0.2159 | FID=14.7]] +[2025-10-29 08:46:22,243][main][INFO] - [Epoch 22] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1573 | max_PSNR=16.4 | max_SSIM=0.4288 | min_dreamsim=0.2159 | min_FID=14.7]] +[2025-10-29 08:46:22,244][main][DEBUG] - Writing images to disk... +[2025-10-29 08:46:23,102][main][DEBUG] - Image(s) saved on disk +[2025-10-29 08:46:23,340][main][INFO] - End of epoch timers: [T_train=68:02:26 | T_epoch=03:06:41 | T_eval=01:16:23 | T_total=69:27:02] +[2025-10-29 08:46:23,341][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 08:46:34,825][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 08:46:45,943][main][INFO] - --- + + +[2025-10-29 08:46:45,944][main][INFO] - [T_total=69:27:25 | T_train=68:02:26] Start epoch 22 +[2025-10-29 11:52:25,547][main][INFO] - [T_total=72:33:05 | T_train=71:08:06 | T_epoch=03:05:39] End of epoch 22 (153318 steps) train loss 6.70619e+06 +[2025-10-29 11:52:25,549][main][INFO] - [Epoch 22] All losses: [[diffusion=0.0782495 ; kl=6.70619e+12 ; lpips=0.180401 ; repa=0.471485]] +[2025-10-29 11:55:52,869][main][INFO] - [Epoch 23] Test metrics: [[MSE=23.7 | MAE=0.109 | LPIPS=0.157 | PSNR=16.25 | SSIM=0.4303 | dreamsim=0.2147 | FID=14.49]] +[2025-10-29 11:55:52,871][main][INFO] - [Epoch 23] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.157 | max_PSNR=16.4 | max_SSIM=0.4303 | min_dreamsim=0.2147 | min_FID=14.49]] +[2025-10-29 11:55:52,872][main][DEBUG] - Writing images to disk... +[2025-10-29 11:55:53,703][main][DEBUG] - Image(s) saved on disk +[2025-10-29 11:55:53,941][main][INFO] - End of epoch timers: [T_train=71:08:06 | T_epoch=03:05:39 | T_eval=01:19:51 | T_total=72:36:33] +[2025-10-29 11:55:53,943][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 11:56:05,150][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 11:56:16,440][main][INFO] - --- + + +[2025-10-29 11:56:16,441][main][INFO] - [T_total=72:36:55 | T_train=71:08:06] Start epoch 23 +[2025-10-29 15:01:26,171][main][INFO] - [T_total=75:42:05 | T_train=74:13:16 | T_epoch=03:05:09] End of epoch 23 (159984 steps) train loss 168.462 +[2025-10-29 15:01:26,172][main][INFO] - [Epoch 23] All losses: [[diffusion=0.0781783 ; kl=1.68175e+08 ; lpips=0.18157 ; repa=0.471573]] +[2025-10-29 15:04:53,505][main][INFO] - [Epoch 24] Test metrics: [[MSE=23.89 | MAE=0.1096 | LPIPS=0.1566 | PSNR=16.22 | SSIM=0.4302 | dreamsim=0.2137 | FID=14.27]] +[2025-10-29 15:04:53,507][main][INFO] - [Epoch 24] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1566 | max_PSNR=16.4 | max_SSIM=0.4303 | min_dreamsim=0.2137 | min_FID=14.27]] +[2025-10-29 15:04:53,508][main][DEBUG] - Writing images to disk... +[2025-10-29 15:04:54,339][main][DEBUG] - Image(s) saved on disk +[2025-10-29 15:04:54,543][main][INFO] - End of epoch timers: [T_train=74:13:16 | T_epoch=03:05:09 | T_eval=01:23:20 | T_total=75:45:34] +[2025-10-29 15:04:54,544][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 15:05:05,591][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 15:05:16,848][main][INFO] - --- + + +[2025-10-29 15:05:16,849][main][INFO] - [T_total=75:45:56 | T_train=74:13:16] Start epoch 24 +[2025-10-29 18:10:51,689][main][INFO] - [T_total=78:51:31 | T_train=77:18:51 | T_epoch=03:05:34] End of epoch 24 (166650 steps) train loss 1.72129 +[2025-10-29 18:10:51,690][main][INFO] - [Epoch 24] All losses: [[diffusion=0.0766599 ; kl=1.44158e+06 ; lpips=0.173953 ; repa=0.464302]] +[2025-10-29 18:14:18,759][main][INFO] - [Epoch 25] Test metrics: [[MSE=24.04 | MAE=0.1101 | LPIPS=0.1564 | PSNR=16.19 | SSIM=0.4306 | dreamsim=0.2126 | FID=13.99]] +[2025-10-29 18:14:18,761][main][INFO] - [Epoch 25] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1564 | max_PSNR=16.4 | max_SSIM=0.4306 | min_dreamsim=0.2126 | min_FID=13.99]] +[2025-10-29 18:14:18,762][main][DEBUG] - Writing images to disk... +[2025-10-29 18:14:19,594][main][DEBUG] - Image(s) saved on disk +[2025-10-29 18:14:19,796][main][INFO] - End of epoch timers: [T_train=77:18:51 | T_epoch=03:05:34 | T_eval=01:26:48 | T_total=78:54:59] +[2025-10-29 18:14:19,797][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 18:14:31,104][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 18:14:42,095][main][INFO] - --- + + +[2025-10-29 18:14:42,095][main][INFO] - [T_total=78:55:21 | T_train=77:18:51] Start epoch 25 +[2025-10-29 21:19:56,417][main][INFO] - [T_total=82:00:35 | T_train=80:24:05 | T_epoch=03:05:14] End of epoch 25 (173316 steps) train loss 2.83654e+07 +[2025-10-29 21:19:56,418][main][INFO] - [Epoch 25] All losses: [[diffusion=0.0773898 ; kl=2.83654e+13 ; lpips=0.17699 ; repa=0.466391]] +[2025-10-29 21:23:23,926][main][INFO] - [Epoch 26] Test metrics: [[MSE=24.22 | MAE=0.1107 | LPIPS=0.1564 | PSNR=16.16 | SSIM=0.4312 | dreamsim=0.2119 | FID=13.81]] +[2025-10-29 21:23:23,928][main][INFO] - [Epoch 26] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1564 | max_PSNR=16.4 | max_SSIM=0.4312 | min_dreamsim=0.2119 | min_FID=13.81]] +[2025-10-29 21:23:23,929][main][DEBUG] - Writing images to disk... +[2025-10-29 21:23:25,021][main][DEBUG] - Image(s) saved on disk +[2025-10-29 21:23:25,253][main][INFO] - End of epoch timers: [T_train=80:24:05 | T_epoch=03:05:14 | T_eval=01:30:16 | T_total=82:04:04] +[2025-10-29 21:23:25,254][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-29 21:23:36,113][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-29 21:23:47,503][main][INFO] - --- + + +[2025-10-29 21:23:47,504][main][INFO] - [T_total=82:04:27 | T_train=80:24:05] Start epoch 26 +[2025-10-30 00:29:03,175][main][INFO] - [T_total=85:09:42 | T_train=83:29:21 | T_epoch=03:05:15] End of epoch 26 (179982 steps) train loss 2.92682 +[2025-10-30 00:29:03,176][main][INFO] - [Epoch 26] All losses: [[diffusion=0.076385 ; kl=2.6494e+06 ; lpips=0.171497 ; repa=0.46114]] +[2025-10-30 00:32:30,822][main][INFO] - [Epoch 27] Test metrics: [[MSE=24.25 | MAE=0.1109 | LPIPS=0.1556 | PSNR=16.15 | SSIM=0.4313 | dreamsim=0.2106 | FID=13.64]] +[2025-10-30 00:32:30,823][main][INFO] - [Epoch 27] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1556 | max_PSNR=16.4 | max_SSIM=0.4313 | min_dreamsim=0.2106 | min_FID=13.64]] +[2025-10-30 00:32:30,825][main][DEBUG] - Writing images to disk... +[2025-10-30 00:32:31,903][main][DEBUG] - Image(s) saved on disk +[2025-10-30 00:32:32,151][main][INFO] - End of epoch timers: [T_train=83:29:21 | T_epoch=03:05:15 | T_eval=01:33:45 | T_total=85:13:11] +[2025-10-30 00:32:32,152][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 00:32:42,864][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 00:32:54,008][main][INFO] - --- + + +[2025-10-30 00:32:54,009][main][INFO] - [T_total=85:13:33 | T_train=83:29:21] Start epoch 27 +[2025-10-30 03:37:54,148][main][INFO] - [T_total=88:18:33 | T_train=86:34:21 | T_epoch=03:05:00] End of epoch 27 (186648 steps) train loss 0.940592 +[2025-10-30 03:37:54,149][main][INFO] - [Epoch 27] All losses: [[diffusion=0.0759206 ; kl=665087 ; lpips=0.169737 ; repa=0.458863]] +[2025-10-30 03:41:21,260][main][INFO] - [Epoch 28] Test metrics: [[MSE=24.31 | MAE=0.1112 | LPIPS=0.1553 | PSNR=16.14 | SSIM=0.4322 | dreamsim=0.2096 | FID=13.42]] +[2025-10-30 03:41:21,262][main][INFO] - [Epoch 28] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1553 | max_PSNR=16.4 | max_SSIM=0.4322 | min_dreamsim=0.2096 | min_FID=13.42]] +[2025-10-30 03:41:21,263][main][DEBUG] - Writing images to disk... +[2025-10-30 03:41:22,104][main][DEBUG] - Image(s) saved on disk +[2025-10-30 03:41:22,371][main][INFO] - End of epoch timers: [T_train=86:34:21 | T_epoch=03:05:00 | T_eval=01:37:13 | T_total=88:22:01] +[2025-10-30 03:41:22,372][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 03:41:34,752][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 03:41:46,656][main][INFO] - --- + + +[2025-10-30 03:41:46,657][main][INFO] - [T_total=88:22:26 | T_train=86:34:21] Start epoch 28 +[2025-10-30 06:47:29,123][main][INFO] - [T_total=91:28:08 | T_train=89:40:03 | T_epoch=03:05:42] End of epoch 28 (193314 steps) train loss 37.7653 +[2025-10-30 06:47:29,124][main][INFO] - [Epoch 28] All losses: [[diffusion=0.0768294 ; kl=3.74862e+07 ; lpips=0.173765 ; repa=0.461866]] +[2025-10-30 06:50:55,982][main][INFO] - [Epoch 29] Test metrics: [[MSE=24.39 | MAE=0.1115 | LPIPS=0.1553 | PSNR=16.13 | SSIM=0.4326 | dreamsim=0.209 | FID=13.31]] +[2025-10-30 06:50:55,983][main][INFO] - [Epoch 29] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1553 | max_PSNR=16.4 | max_SSIM=0.4326 | min_dreamsim=0.209 | min_FID=13.31]] +[2025-10-30 06:50:55,984][main][DEBUG] - Writing images to disk... +[2025-10-30 06:50:56,817][main][DEBUG] - Image(s) saved on disk +[2025-10-30 06:50:57,019][main][INFO] - End of epoch timers: [T_train=89:40:03 | T_epoch=03:05:42 | T_eval=01:40:41 | T_total=91:31:36] +[2025-10-30 06:50:57,021][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 06:51:07,339][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 06:51:16,820][main][INFO] - --- + + +[2025-10-30 06:51:16,821][main][INFO] - [T_total=91:31:56 | T_train=89:40:03] Start epoch 29 +[2025-10-30 09:56:41,969][main][INFO] - [T_total=94:37:21 | T_train=92:45:28 | T_epoch=03:05:25] End of epoch 29 (199980 steps) train loss 422 +[2025-10-30 09:56:41,970][main][INFO] - [Epoch 29] All losses: [[diffusion=0.0763346 ; kl=4.21723e+08 ; lpips=0.171742 ; repa=0.459557]] +[2025-10-30 10:00:09,392][main][INFO] - [Epoch 30] Test metrics: [[MSE=24.44 | MAE=0.1117 | LPIPS=0.1547 | PSNR=16.12 | SSIM=0.4334 | dreamsim=0.2078 | FID=13.08]] +[2025-10-30 10:00:09,394][main][INFO] - [Epoch 30] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1547 | max_PSNR=16.4 | max_SSIM=0.4334 | min_dreamsim=0.2078 | min_FID=13.08]] +[2025-10-30 10:00:09,396][main][DEBUG] - Writing images to disk... +[2025-10-30 10:00:10,246][main][DEBUG] - Image(s) saved on disk +[2025-10-30 10:00:10,452][main][INFO] - End of epoch timers: [T_train=92:45:28 | T_epoch=03:05:25 | T_eval=01:44:09 | T_total=94:40:49] +[2025-10-30 10:00:10,453][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 10:00:21,119][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 10:00:31,248][main][INFO] - --- + + +[2025-10-30 10:00:31,249][main][INFO] - [T_total=94:41:10 | T_train=92:45:28] Start epoch 30 +[2025-10-30 13:06:42,974][main][INFO] - [T_total=97:47:22 | T_train=95:51:40 | T_epoch=03:06:11] End of epoch 30 (206646 steps) train loss 255193 +[2025-10-30 13:06:42,975][main][INFO] - [Epoch 30] All losses: [[diffusion=0.0769152 ; kl=2.55193e+11 ; lpips=0.174071 ; repa=0.460975]] +[2025-10-30 13:10:10,113][main][INFO] - [Epoch 31] Test metrics: [[MSE=24.51 | MAE=0.112 | LPIPS=0.1546 | PSNR=16.11 | SSIM=0.4338 | dreamsim=0.2072 | FID=12.95]] +[2025-10-30 13:10:10,116][main][INFO] - [Epoch 31] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1546 | max_PSNR=16.4 | max_SSIM=0.4338 | min_dreamsim=0.2072 | min_FID=12.95]] +[2025-10-30 13:10:10,117][main][DEBUG] - Writing images to disk... +[2025-10-30 13:10:10,957][main][DEBUG] - Image(s) saved on disk +[2025-10-30 13:10:11,195][main][INFO] - End of epoch timers: [T_train=95:51:40 | T_epoch=03:06:11 | T_eval=01:47:37 | T_total=97:50:50] +[2025-10-30 13:10:11,197][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 13:10:21,570][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 13:10:31,290][main][INFO] - --- + + +[2025-10-30 13:10:31,291][main][INFO] - [T_total=97:51:10 | T_train=95:51:40] Start epoch 31 +[2025-10-30 16:15:53,375][main][INFO] - [T_total=100:56:32 | T_train=98:57:02 | T_epoch=03:05:22] End of epoch 31 (213312 steps) train loss 29024 +[2025-10-30 16:15:53,376][main][INFO] - [Epoch 31] All losses: [[diffusion=0.0770869 ; kl=2.90237e+10 ; lpips=0.1765 ; repa=0.462709]] +[2025-10-30 16:19:20,850][main][INFO] - [Epoch 32] Test metrics: [[MSE=24.52 | MAE=0.1121 | LPIPS=0.1545 | PSNR=16.1 | SSIM=0.4351 | dreamsim=0.2065 | FID=12.85]] +[2025-10-30 16:19:20,852][main][INFO] - [Epoch 32] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1545 | max_PSNR=16.4 | max_SSIM=0.4351 | min_dreamsim=0.2065 | min_FID=12.85]] +[2025-10-30 16:19:20,854][main][DEBUG] - Writing images to disk... +[2025-10-30 16:19:21,697][main][DEBUG] - Image(s) saved on disk +[2025-10-30 16:19:21,942][main][INFO] - End of epoch timers: [T_train=98:57:02 | T_epoch=03:05:22 | T_eval=01:51:06 | T_total=101:00:01] +[2025-10-30 16:19:21,942][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 16:19:33,022][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 16:19:43,573][main][INFO] - --- + + +[2025-10-30 16:19:43,574][main][INFO] - [T_total=101:00:23 | T_train=98:57:02] Start epoch 32 +[2025-10-30 19:24:58,751][main][INFO] - [T_total=104:05:38 | T_train=102:02:17 | T_epoch=03:05:15] End of epoch 32 (219978 steps) train loss 20.2597 +[2025-10-30 19:24:58,752][main][INFO] - [Epoch 32] All losses: [[diffusion=0.0767827 ; kl=1.99806e+07 ; lpips=0.174456 ; repa=0.460265]] +[2025-10-30 19:28:26,469][main][INFO] - [Epoch 33] Test metrics: [[MSE=24.6 | MAE=0.1125 | LPIPS=0.1544 | PSNR=16.09 | SSIM=0.4346 | dreamsim=0.206 | FID=12.76]] +[2025-10-30 19:28:26,471][main][INFO] - [Epoch 33] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1544 | max_PSNR=16.4 | max_SSIM=0.4351 | min_dreamsim=0.206 | min_FID=12.76]] +[2025-10-30 19:28:26,473][main][DEBUG] - Writing images to disk... +[2025-10-30 19:28:27,304][main][DEBUG] - Image(s) saved on disk +[2025-10-30 19:28:27,556][main][INFO] - End of epoch timers: [T_train=102:02:17 | T_epoch=03:05:15 | T_eval=01:54:34 | T_total=104:09:07] +[2025-10-30 19:28:27,558][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 19:28:38,944][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 19:28:50,181][main][INFO] - --- + + +[2025-10-30 19:28:50,182][main][INFO] - [T_total=104:09:29 | T_train=102:02:17] Start epoch 33 +[2025-10-30 22:34:36,461][main][INFO] - [T_total=107:15:16 | T_train=105:08:04 | T_epoch=03:05:46] End of epoch 33 (226644 steps) train loss 176.126 +[2025-10-30 22:34:36,462][main][INFO] - [Epoch 33] All losses: [[diffusion=0.0773723 ; kl=1.75845e+08 ; lpips=0.176956 ; repa=0.462503]] +[2025-10-30 22:38:03,690][main][INFO] - [Epoch 34] Test metrics: [[MSE=24.58 | MAE=0.1124 | LPIPS=0.1542 | PSNR=16.09 | SSIM=0.4358 | dreamsim=0.2053 | FID=12.61]] +[2025-10-30 22:38:03,692][main][INFO] - [Epoch 34] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1542 | max_PSNR=16.4 | max_SSIM=0.4358 | min_dreamsim=0.2053 | min_FID=12.61]] +[2025-10-30 22:38:03,692][main][DEBUG] - Writing images to disk... +[2025-10-30 22:38:04,540][main][DEBUG] - Image(s) saved on disk +[2025-10-30 22:38:04,752][main][INFO] - End of epoch timers: [T_train=105:08:04 | T_epoch=03:05:46 | T_eval=01:58:02 | T_total=107:18:44] +[2025-10-30 22:38:04,753][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-30 22:38:16,516][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-30 22:38:27,966][main][INFO] - --- + + +[2025-10-30 22:38:27,967][main][INFO] - [T_total=107:19:07 | T_train=105:08:04] Start epoch 34 +[2025-10-31 01:43:52,689][main][INFO] - [T_total=110:24:32 | T_train=108:13:28 | T_epoch=03:05:24] End of epoch 34 (233310 steps) train loss 3098.35 +[2025-10-31 01:43:52,690][main][INFO] - [Epoch 34] All losses: [[diffusion=0.0765349 ; kl=3.09807e+09 ; lpips=0.174148 ; repa=0.459381]] +[2025-10-31 01:47:20,181][main][INFO] - [Epoch 35] Test metrics: [[MSE=24.56 | MAE=0.1125 | LPIPS=0.1538 | PSNR=16.1 | SSIM=0.4369 | dreamsim=0.2045 | FID=12.45]] +[2025-10-31 01:47:20,183][main][INFO] - [Epoch 35] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1538 | max_PSNR=16.4 | max_SSIM=0.4369 | min_dreamsim=0.2045 | min_FID=12.45]] +[2025-10-31 01:47:20,184][main][DEBUG] - Writing images to disk... +[2025-10-31 01:47:21,284][main][DEBUG] - Image(s) saved on disk +[2025-10-31 01:47:21,496][main][INFO] - End of epoch timers: [T_train=108:13:28 | T_epoch=03:05:24 | T_eval=02:01:31 | T_total=110:28:01] +[2025-10-31 01:47:21,497][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 01:47:32,897][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 01:47:43,716][main][INFO] - --- + + +[2025-10-31 01:47:43,717][main][INFO] - [T_total=110:28:23 | T_train=108:13:28] Start epoch 35 +[2025-10-31 04:53:48,886][main][INFO] - [T_total=113:34:28 | T_train=111:19:33 | T_epoch=03:06:05] End of epoch 35 (239976 steps) train loss 44306.2 +[2025-10-31 04:53:48,887][main][INFO] - [Epoch 35] All losses: [[diffusion=0.0759655 ; kl=4.43059e+10 ; lpips=0.170242 ; repa=0.455204]] +[2025-10-31 04:57:16,489][main][INFO] - [Epoch 36] Test metrics: [[MSE=24.6 | MAE=0.1126 | LPIPS=0.1535 | PSNR=16.09 | SSIM=0.4367 | dreamsim=0.2038 | FID=12.31]] +[2025-10-31 04:57:16,491][main][INFO] - [Epoch 36] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1535 | max_PSNR=16.4 | max_SSIM=0.4369 | min_dreamsim=0.2038 | min_FID=12.31]] +[2025-10-31 04:57:16,493][main][DEBUG] - Writing images to disk... +[2025-10-31 04:57:17,591][main][DEBUG] - Image(s) saved on disk +[2025-10-31 04:57:17,835][main][INFO] - End of epoch timers: [T_train=111:19:33 | T_epoch=03:06:05 | T_eval=02:05:00 | T_total=113:37:57] +[2025-10-31 04:57:17,836][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 04:57:28,300][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 04:57:38,419][main][INFO] - --- + + +[2025-10-31 04:57:38,420][main][INFO] - [T_total=113:38:17 | T_train=111:19:33] Start epoch 36 +[2025-10-31 08:03:40,896][main][INFO] - [T_total=116:44:20 | T_train=114:25:36 | T_epoch=03:06:02] End of epoch 36 (246642 steps) train loss 13991 +[2025-10-31 08:03:40,897][main][INFO] - [Epoch 36] All losses: [[diffusion=0.0760433 ; kl=1.39907e+10 ; lpips=0.169438 ; repa=0.454577]] +[2025-10-31 08:07:07,981][main][INFO] - [Epoch 37] Test metrics: [[MSE=24.59 | MAE=0.1126 | LPIPS=0.1533 | PSNR=16.09 | SSIM=0.4386 | dreamsim=0.2031 | FID=12.18]] +[2025-10-31 08:07:07,982][main][INFO] - [Epoch 37] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1533 | max_PSNR=16.4 | max_SSIM=0.4386 | min_dreamsim=0.2031 | min_FID=12.18]] +[2025-10-31 08:07:07,983][main][DEBUG] - Writing images to disk... +[2025-10-31 08:07:08,817][main][DEBUG] - Image(s) saved on disk +[2025-10-31 08:07:09,048][main][INFO] - End of epoch timers: [T_train=114:25:36 | T_epoch=03:06:02 | T_eval=02:08:28 | T_total=116:47:48] +[2025-10-31 08:07:09,049][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 08:07:19,835][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 08:07:30,639][main][INFO] - --- + + +[2025-10-31 08:07:30,640][main][INFO] - [T_total=116:48:10 | T_train=114:25:36] Start epoch 37 +[2025-10-31 11:12:53,371][main][INFO] - [T_total=119:53:32 | T_train=117:30:59 | T_epoch=03:05:22] End of epoch 37 (253308 steps) train loss 13991.7 +[2025-10-31 11:12:53,372][main][INFO] - [Epoch 37] All losses: [[diffusion=0.0767062 ; kl=1.39915e+10 ; lpips=0.174108 ; repa=0.4581]] +[2025-10-31 11:16:19,970][main][INFO] - [Epoch 38] Test metrics: [[MSE=24.53 | MAE=0.1125 | LPIPS=0.1528 | PSNR=16.1 | SSIM=0.4386 | dreamsim=0.2024 | FID=12.07]] +[2025-10-31 11:16:19,972][main][INFO] - [Epoch 38] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1528 | max_PSNR=16.4 | max_SSIM=0.4386 | min_dreamsim=0.2024 | min_FID=12.07]] +[2025-10-31 11:16:19,973][main][DEBUG] - Writing images to disk... +[2025-10-31 11:16:20,809][main][DEBUG] - Image(s) saved on disk +[2025-10-31 11:16:21,014][main][INFO] - End of epoch timers: [T_train=117:30:59 | T_epoch=03:05:22 | T_eval=02:11:55 | T_total=119:57:00] +[2025-10-31 11:16:21,015][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 11:16:32,831][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 11:16:44,606][main][INFO] - --- + + +[2025-10-31 11:16:44,607][main][INFO] - [T_total=119:57:24 | T_train=117:30:59] Start epoch 38 +[2025-10-31 14:21:53,789][main][INFO] - [T_total=123:02:33 | T_train=120:36:08 | T_epoch=03:05:09] End of epoch 38 (259974 steps) train loss 957.097 +[2025-10-31 14:21:53,790][main][INFO] - [Epoch 38] All losses: [[diffusion=0.0761426 ; kl=9.56822e+08 ; lpips=0.170405 ; repa=0.454693]] +[2025-10-31 14:25:20,866][main][INFO] - [Epoch 39] Test metrics: [[MSE=24.69 | MAE=0.1129 | LPIPS=0.1528 | PSNR=16.07 | SSIM=0.4388 | dreamsim=0.2019 | FID=11.95]] +[2025-10-31 14:25:20,868][main][INFO] - [Epoch 39] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1528 | max_PSNR=16.4 | max_SSIM=0.4388 | min_dreamsim=0.2019 | min_FID=11.95]] +[2025-10-31 14:25:20,869][main][DEBUG] - Writing images to disk... +[2025-10-31 14:25:21,702][main][DEBUG] - Image(s) saved on disk +[2025-10-31 14:25:21,953][main][INFO] - End of epoch timers: [T_train=120:36:08 | T_epoch=03:05:09 | T_eval=02:15:23 | T_total=123:06:01] +[2025-10-31 14:25:21,955][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 14:25:32,079][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 14:25:43,132][main][INFO] - --- + + +[2025-10-31 14:25:43,136][main][INFO] - [T_total=123:06:22 | T_train=120:36:08] Start epoch 39 +[2025-10-31 17:30:32,200][main][INFO] - [T_total=126:11:11 | T_train=123:40:57 | T_epoch=03:04:49] End of epoch 39 (266640 steps) train loss 2134.48 +[2025-10-31 17:30:32,202][main][INFO] - [Epoch 39] All losses: [[diffusion=0.076013 ; kl=2.1342e+09 ; lpips=0.169987 ; repa=0.453857]] +[2025-10-31 17:33:59,741][main][INFO] - [Epoch 40] Test metrics: [[MSE=24.76 | MAE=0.1132 | LPIPS=0.1528 | PSNR=16.06 | SSIM=0.4394 | dreamsim=0.2017 | FID=11.91]] +[2025-10-31 17:33:59,743][main][INFO] - [Epoch 40] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1528 | max_PSNR=16.4 | max_SSIM=0.4394 | min_dreamsim=0.2017 | min_FID=11.91]] +[2025-10-31 17:33:59,744][main][DEBUG] - Writing images to disk... +[2025-10-31 17:34:00,581][main][DEBUG] - Image(s) saved on disk +[2025-10-31 17:34:00,789][main][INFO] - End of epoch timers: [T_train=123:40:57 | T_epoch=03:04:49 | T_eval=02:18:52 | T_total=126:14:40] +[2025-10-31 17:34:00,790][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 17:34:11,470][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 17:34:20,752][main][INFO] - --- + + +[2025-10-31 17:34:20,753][main][INFO] - [T_total=126:15:00 | T_train=123:40:57] Start epoch 40 +[2025-10-31 20:39:19,195][main][INFO] - [T_total=129:19:58 | T_train=126:45:55 | T_epoch=03:04:58] End of epoch 40 (273306 steps) train loss 257946 +[2025-10-31 20:39:19,196][main][INFO] - [Epoch 40] All losses: [[diffusion=0.0760113 ; kl=2.57946e+11 ; lpips=0.170735 ; repa=0.454041]] +[2025-10-31 20:42:46,266][main][INFO] - [Epoch 41] Test metrics: [[MSE=24.79 | MAE=0.1133 | LPIPS=0.1526 | PSNR=16.06 | SSIM=0.4406 | dreamsim=0.2009 | FID=11.78]] +[2025-10-31 20:42:46,268][main][INFO] - [Epoch 41] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1526 | max_PSNR=16.4 | max_SSIM=0.4406 | min_dreamsim=0.2009 | min_FID=11.78]] +[2025-10-31 20:42:46,269][main][DEBUG] - Writing images to disk... +[2025-10-31 20:42:47,095][main][DEBUG] - Image(s) saved on disk +[2025-10-31 20:42:47,343][main][INFO] - End of epoch timers: [T_train=126:45:55 | T_epoch=03:04:58 | T_eval=02:22:20 | T_total=129:23:26] +[2025-10-31 20:42:47,345][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 20:42:58,982][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 20:43:08,773][main][INFO] - --- + + +[2025-10-31 20:43:08,774][main][INFO] - [T_total=129:23:48 | T_train=126:45:55] Start epoch 41 +[2025-10-31 23:47:55,833][main][INFO] - [T_total=132:28:35 | T_train=129:50:42 | T_epoch=03:04:47] End of epoch 41 (279972 steps) train loss 14.3203 +[2025-10-31 23:47:55,839][main][INFO] - [Epoch 41] All losses: [[diffusion=0.0761964 ; kl=1.40445e+07 ; lpips=0.171818 ; repa=0.454763]] +[2025-10-31 23:51:24,059][main][INFO] - [Epoch 42] Test metrics: [[MSE=24.9 | MAE=0.1136 | LPIPS=0.1528 | PSNR=16.04 | SSIM=0.4402 | dreamsim=0.2009 | FID=11.76]] +[2025-10-31 23:51:24,061][main][INFO] - [Epoch 42] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1526 | max_PSNR=16.4 | max_SSIM=0.4406 | min_dreamsim=0.2009 | min_FID=11.76]] +[2025-10-31 23:51:24,063][main][DEBUG] - Writing images to disk... +[2025-10-31 23:51:24,900][main][DEBUG] - Image(s) saved on disk +[2025-10-31 23:51:25,140][main][INFO] - End of epoch timers: [T_train=129:50:42 | T_epoch=03:04:47 | T_eval=02:25:49 | T_total=132:32:04] +[2025-10-31 23:51:25,141][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-10-31 23:51:35,044][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-10-31 23:51:44,524][main][INFO] - --- + + +[2025-10-31 23:51:44,525][main][INFO] - [T_total=132:32:24 | T_train=129:50:42] Start epoch 42 +[2025-11-01 02:57:08,828][main][INFO] - [T_total=135:37:48 | T_train=132:56:07 | T_epoch=03:05:24] End of epoch 42 (286638 steps) train loss 27710.1 +[2025-11-01 02:57:08,829][main][INFO] - [Epoch 42] All losses: [[diffusion=0.0763487 ; kl=2.77098e+10 ; lpips=0.172949 ; repa=0.455514]] +[2025-11-01 03:00:36,341][main][INFO] - [Epoch 43] Test metrics: [[MSE=24.85 | MAE=0.1135 | LPIPS=0.1524 | PSNR=16.05 | SSIM=0.4412 | dreamsim=0.2003 | FID=11.67]] +[2025-11-01 03:00:36,343][main][INFO] - [Epoch 43] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1524 | max_PSNR=16.4 | max_SSIM=0.4412 | min_dreamsim=0.2003 | min_FID=11.67]] +[2025-11-01 03:00:36,344][main][DEBUG] - Writing images to disk... +[2025-11-01 03:00:37,429][main][DEBUG] - Image(s) saved on disk +[2025-11-01 03:00:37,669][main][INFO] - End of epoch timers: [T_train=132:56:07 | T_epoch=03:05:24 | T_eval=02:29:17 | T_total=135:41:17] +[2025-11-01 03:00:37,671][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-11-01 03:00:48,523][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-11-01 03:00:59,238][main][INFO] - --- + + +[2025-11-01 03:00:59,239][main][INFO] - [T_total=135:41:38 | T_train=132:56:07] Start epoch 43 +[2025-11-01 06:06:17,238][main][INFO] - [T_total=138:46:56 | T_train=136:01:25 | T_epoch=03:05:17] End of epoch 43 (293304 steps) train loss 321.888 +[2025-11-01 06:06:17,239][main][INFO] - [Epoch 43] All losses: [[diffusion=0.075849 ; kl=3.21614e+08 ; lpips=0.170029 ; repa=0.452597]] +[2025-11-01 06:09:44,706][main][INFO] - [Epoch 44] Test metrics: [[MSE=24.88 | MAE=0.1137 | LPIPS=0.1522 | PSNR=16.04 | SSIM=0.4413 | dreamsim=0.1998 | FID=11.54]] +[2025-11-01 06:09:44,708][main][INFO] - [Epoch 44] Best metrics: [[min_MSE=22.91 | min_MAE=0.1058 | min_LPIPS=0.1522 | max_PSNR=16.4 | max_SSIM=0.4413 | min_dreamsim=0.1998 | min_FID=11.54]] +[2025-11-01 06:09:44,709][main][DEBUG] - Writing images to disk... +[2025-11-01 06:09:45,799][main][DEBUG] - Image(s) saved on disk +[2025-11-01 06:09:46,004][main][INFO] - End of epoch timers: [T_train=136:01:25 | T_epoch=03:05:17 | T_eval=02:32:46 | T_total=138:50:25] +[2025-11-01 06:09:46,005][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/last +[2025-11-01 06:09:57,354][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_EqM/checkpoints/best +[2025-11-01 06:10:08,846][main][INFO] - --- + + +[2025-11-01 06:10:08,847][main][INFO] - [T_total=138:50:48 | T_train=136:01:25] Start epoch 44 diff --git a/train_enc_dc_f32c32_EqM/tensorboard_logs/events.out.tfevents.1761477560.98629b852e50.63738.0 b/train_enc_dc_f32c32_EqM/tensorboard_logs/events.out.tfevents.1761477560.98629b852e50.63738.0 new file mode 100644 index 0000000000000000000000000000000000000000..537b9c8e5592ffc2dca9ee5ce7e71183dffbe610 --- /dev/null +++ b/train_enc_dc_f32c32_EqM/tensorboard_logs/events.out.tfevents.1761477560.98629b852e50.63738.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ee5d832a2211ad3b54638b31a63f0844d071bf7cce753238c04aab283eb5f06 +size 154206671 diff --git a/train_enc_dc_f32c32_FM/.hydra/config.yaml b/train_enc_dc_f32c32_FM/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4fb6d77126fa268c4e1102bf9f26b48444f802 --- /dev/null +++ b/train_enc_dc_f32c32_FM/.hydra/config.yaml @@ -0,0 +1,52 @@ +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 diff --git a/train_enc_dc_f32c32_FM/.hydra/hydra.yaml b/train_enc_dc_f32c32_FM/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9b1ed6d8ffff04e6e286c5a3bd3b4ca12db84a72 --- /dev/null +++ b/train_enc_dc_f32c32_FM/.hydra/hydra.yaml @@ -0,0 +1,172 @@ +hydra: + run: + dir: ${run_dir} + sweep: + dir: ${run_dir} + subdir: multirun_${hydra:job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + colorlog: + (): colorlog.ColoredFormatter + format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] + - %(message)s' + log_colors: + DEBUG: purple + INFO: green + WARNING: yellow + ERROR: red + CRITICAL: red + handlers: + console: + class: logging.StreamHandler + formatter: colorlog + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra:runtime.output_dir}/${hydra:job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - run_name=train_enc_dc_f32c32_FM + - dataset.im_size=128 + - dataset.aug_scale=2 + - training.epochs=20 + - dc_ssdae.encoder_train=true + job: + name: main + chdir: null + override_dirname: dataset.aug_scale=2,dataset.im_size=128,dc_ssdae.encoder_train=true,run_name=train_enc_dc_f32c32_FM,training.epochs=20 + id: ??? + num: ??? + config_name: dc_f32c32_FM + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /workspace/DC_SSDAE + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /workspace/DC_SSDAE/config + schema: file + provider: main + - path: hydra_plugins.hydra_colorlog.conf + schema: pkg + provider: hydra-colorlog + - path: '' + schema: structured + provider: schema + output_dir: /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: colorlog + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/train_enc_dc_f32c32_FM/.hydra/overrides.yaml b/train_enc_dc_f32c32_FM/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2a3bef2787e52d3da6836f02c657b5ec8999d54d --- /dev/null +++ b/train_enc_dc_f32c32_FM/.hydra/overrides.yaml @@ -0,0 +1,5 @@ +- run_name=train_enc_dc_f32c32_FM +- dataset.im_size=128 +- dataset.aug_scale=2 +- training.epochs=20 +- dc_ssdae.encoder_train=true diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/custom_checkpoint_0.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/custom_checkpoint_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..47b27815a23b43d339515e5c9e0ecf1ba128aaa8 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/custom_checkpoint_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b430d34a7b81d4c51cd8965a0e39388bddf8bb8708f6294838efaa318bd0e92 +size 2293 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/model.safetensors b/train_enc_dc_f32c32_FM/checkpoints/best/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..846109fa327f4196137e5dfa0646d27b52eb89d7 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f46cd46064e01d1f0ba96ebc959bf9ddd42fd6a73fa54a3adec8fd09e3f8f5 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/model_1.safetensors b/train_enc_dc_f32c32_FM/checkpoints/best/model_1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52fd7642c420d6bc3445d7c475b13e8b44c2eb16 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/model_1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4bf720565d4c6dd1f5efaf8781daeaf13e7e9dc62abf8ae61401660badf4a2 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/model_2.safetensors b/train_enc_dc_f32c32_FM/checkpoints/best/model_2.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e2a596b895f2f4bd337e81e3ade5abb687cdd80 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/model_2.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22c0ac0a5c37e453f5fa4c25ec76fa5f961cb6d56f1c70a171f60154645aaa9 +size 598032 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/model_ae.safetensors b/train_enc_dc_f32c32_FM/checkpoints/best/model_ae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..846109fa327f4196137e5dfa0646d27b52eb89d7 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/model_ae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f46cd46064e01d1f0ba96ebc959bf9ddd42fd6a73fa54a3adec8fd09e3f8f5 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/model_ae_ema.safetensors b/train_enc_dc_f32c32_FM/checkpoints/best/model_ae_ema.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52fd7642c420d6bc3445d7c475b13e8b44c2eb16 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/model_ae_ema.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4bf720565d4c6dd1f5efaf8781daeaf13e7e9dc62abf8ae61401660badf4a2 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/model_aux_losses.safetensors b/train_enc_dc_f32c32_FM/checkpoints/best/model_aux_losses.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e2a596b895f2f4bd337e81e3ade5abb687cdd80 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/model_aux_losses.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22c0ac0a5c37e453f5fa4c25ec76fa5f961cb6d56f1c70a171f60154645aaa9 +size 598032 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/optimizer.bin b/train_enc_dc_f32c32_FM/checkpoints/best/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..46198a19956b0416d858392b67ac97b7863dd57d --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476824787533c6eb863844a3543bef24735cfeec45f2dc4d7e595082dce94312 +size 1938294667 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_0.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a11b426cf2bf6229568968ba0659a62ad2f69957 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80fc16be4fd72852da9ace45f98ab693699fb8c0dd89cb63f6aca56101ab1f85 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_1.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8eda0f1dd0191aeea970b25644a461eca914fd9f --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1396712d8218465c3eb172e9cfe3e0020585deaa39f56c79817c07a13102cf02 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_2.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e02c32d22a208305c2ff8af69fe20f54a9584ae0 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1428f12a249bb77bcd9baede6e64685171c05b0edd9b761f490e7aa37bd6a5d3 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_3.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ba37573a38350de17b24db75422c7c085c24a9d7 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058cf090b42ea0b5beea1ee1bd888235d67ef89afbb35d8a27718cc1f488d44b +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_4.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..000e58e9435d688051c439be14216712c96e0118 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677b31dcd8f22e3a1b208a2e8e0444b56e0ad26d236a32b5065411f67d6d45e5 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_5.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c17e5d1332b00e718df9d61792b77c71bc3279ea --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0dcdcf3388287b804fb5d588384fffd0be6ae06f2c09ea948b6f4dc26ea091b +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_6.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..851acd6913920f1929e3b514dc40be5a8b4d14bb --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95211b8a04aef62035dbf60fd400b37bf8b4d190db49b4f89fa5bbd9219fa3f +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/best/random_states_7.pkl b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9081ea4591eee7fa6a4f6a47f0812fad429dd66e --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/best/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e955adb2bba8e8b4dcf36e2f4bc2a273022e6bea1a50637ff991ccb2d4d445ae +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/custom_checkpoint_0.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/custom_checkpoint_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..3b4bfdb2adeae25532a7a9cab4f8d92bccb487ee --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/custom_checkpoint_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46c886730748015d356cfc7a1582f6c04bf4132fa12623de5663e23eb3864178 +size 2293 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/model.safetensors b/train_enc_dc_f32c32_FM/checkpoints/last/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..846109fa327f4196137e5dfa0646d27b52eb89d7 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f46cd46064e01d1f0ba96ebc959bf9ddd42fd6a73fa54a3adec8fd09e3f8f5 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/model_1.safetensors b/train_enc_dc_f32c32_FM/checkpoints/last/model_1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52fd7642c420d6bc3445d7c475b13e8b44c2eb16 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/model_1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4bf720565d4c6dd1f5efaf8781daeaf13e7e9dc62abf8ae61401660badf4a2 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/model_2.safetensors b/train_enc_dc_f32c32_FM/checkpoints/last/model_2.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e2a596b895f2f4bd337e81e3ade5abb687cdd80 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/model_2.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22c0ac0a5c37e453f5fa4c25ec76fa5f961cb6d56f1c70a171f60154645aaa9 +size 598032 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/model_ae.safetensors b/train_enc_dc_f32c32_FM/checkpoints/last/model_ae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..846109fa327f4196137e5dfa0646d27b52eb89d7 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/model_ae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f46cd46064e01d1f0ba96ebc959bf9ddd42fd6a73fa54a3adec8fd09e3f8f5 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/model_ae_ema.safetensors b/train_enc_dc_f32c32_FM/checkpoints/last/model_ae_ema.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..52fd7642c420d6bc3445d7c475b13e8b44c2eb16 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/model_ae_ema.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc4bf720565d4c6dd1f5efaf8781daeaf13e7e9dc62abf8ae61401660badf4a2 +size 968466492 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/model_aux_losses.safetensors b/train_enc_dc_f32c32_FM/checkpoints/last/model_aux_losses.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..9e2a596b895f2f4bd337e81e3ade5abb687cdd80 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/model_aux_losses.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22c0ac0a5c37e453f5fa4c25ec76fa5f961cb6d56f1c70a171f60154645aaa9 +size 598032 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/optimizer.bin b/train_enc_dc_f32c32_FM/checkpoints/last/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..46198a19956b0416d858392b67ac97b7863dd57d --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476824787533c6eb863844a3543bef24735cfeec45f2dc4d7e595082dce94312 +size 1938294667 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_0.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a11b426cf2bf6229568968ba0659a62ad2f69957 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80fc16be4fd72852da9ace45f98ab693699fb8c0dd89cb63f6aca56101ab1f85 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_1.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..8eda0f1dd0191aeea970b25644a461eca914fd9f --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1396712d8218465c3eb172e9cfe3e0020585deaa39f56c79817c07a13102cf02 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_2.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..e02c32d22a208305c2ff8af69fe20f54a9584ae0 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1428f12a249bb77bcd9baede6e64685171c05b0edd9b761f490e7aa37bd6a5d3 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_3.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ba37573a38350de17b24db75422c7c085c24a9d7 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058cf090b42ea0b5beea1ee1bd888235d67ef89afbb35d8a27718cc1f488d44b +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_4.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..000e58e9435d688051c439be14216712c96e0118 --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:677b31dcd8f22e3a1b208a2e8e0444b56e0ad26d236a32b5065411f67d6d45e5 +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_5.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c17e5d1332b00e718df9d61792b77c71bc3279ea --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0dcdcf3388287b804fb5d588384fffd0be6ae06f2c09ea948b6f4dc26ea091b +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_6.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..851acd6913920f1929e3b514dc40be5a8b4d14bb --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95211b8a04aef62035dbf60fd400b37bf8b4d190db49b4f89fa5bbd9219fa3f +size 16449 diff --git a/train_enc_dc_f32c32_FM/checkpoints/last/random_states_7.pkl b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9081ea4591eee7fa6a4f6a47f0812fad429dd66e --- /dev/null +++ b/train_enc_dc_f32c32_FM/checkpoints/last/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e955adb2bba8e8b4dcf36e2f4bc2a273022e6bea1a50637ff991ccb2d4d445ae +size 16449 diff --git a/train_enc_dc_f32c32_FM/config.yaml b/train_enc_dc_f32c32_FM/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4fb6d77126fa268c4e1102bf9f26b48444f802 --- /dev/null +++ b/train_enc_dc_f32c32_FM/config.yaml @@ -0,0 +1,52 @@ +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 diff --git a/train_enc_dc_f32c32_FM/main.log b/train_enc_dc_f32c32_FM/main.log new file mode 100644 index 0000000000000000000000000000000000000000..78a50123d4bccff886babb8c5551ff192bcaa4f6 --- /dev/null +++ b/train_enc_dc_f32c32_FM/main.log @@ -0,0 +1,711 @@ +[2025-10-25 04:11:21,158][main][INFO] - Will write tensorboard logs inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/tensorboard_logs +[2025-10-25 04:11:21,179][main][INFO] - Runtime at /workspace/DC_SSDAE +[2025-10-25 04:11:21,180][main][INFO] - Running inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM +[2025-10-25 04:11:21,181][main][INFO] - Running args: ['main.py', 'run_name=train_enc_dc_f32c32_FM', 'dataset.im_size=128', 'dataset.aug_scale=2', 'training.epochs=20', 'dc_ssdae.encoder_train=true'] +[2025-10-25 04:11:21,182][main][INFO] - Command: 'main.py' 'run_name=train_enc_dc_f32c32_FM' 'dataset.im_size=128' 'dataset.aug_scale=2' 'training.epochs=20' 'dc_ssdae.encoder_train=true' +[2025-10-25 04:11:21,182][main][INFO] - Accelerator with 8 processes, running on cuda:0 +[2025-10-25 04:11:21,186][main][INFO] - Hydra configuration: +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_dc_f32c32_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f32c32 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: dc + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 + + + +[2025-10-25 04:11:35,084][main][INFO] - Loaded ImageNet dataset: {'train': Dataset ImageNet + Number of datapoints: 1279867 + Root location: ../../../imagenet_data + Split: train + StandardTransform +Transform: Compose( + RandomResize(min_size=128, max_size=256, interpolation=InterpolationMode.LANCZOS, antialias=True) + RandomCrop(size=(128, 128), pad_if_needed=False, fill=0, padding_mode=constant) + RandomHorizontalFlip(p=0.5) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + ), 'test': Dataset ImageNet + Number of datapoints: 49950 + Root location: ../../../imagenet_data + Split: validation + StandardTransform +Transform: Compose( + Resize(size=[128], interpolation=InterpolationMode.BILINEAR, antialias=True) + CenterCrop(size=(128, 128)) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + )} +[2025-10-25 04:11:47,948][main][INFO] - ae parameters count: +[2025-10-25 04:11:47,953][main][INFO] - Total: #230.9M (trainable: #230.9M) +[2025-10-25 04:11:47,954][main][INFO] - - encoder: #217.4M (trainable: #217.4M) +[2025-10-25 04:11:47,955][main][INFO] - - project_in: #1.8K (trainable: #1.8K) +[2025-10-25 04:11:47,956][main][INFO] - - stages: #216.9M (trainable: #216.9M) +[2025-10-25 04:11:47,956][main][INFO] - - project_out: #576.1K (trainable: #576.1K) +[2025-10-25 04:11:47,958][main][INFO] - - decoder: #13.5M (trainable: #13.5M) +[2025-10-25 04:11:47,958][main][INFO] - - conv_in_img: #896 (trainable: #896) +[2025-10-25 04:11:47,959][main][INFO] - - conv_in_z: #9.0K (trainable: #9.0K) +[2025-10-25 04:11:47,959][main][INFO] - - conv_in: #36.1K (trainable: #36.1K) +[2025-10-25 04:11:47,959][main][INFO] - - batch_norm_z: #64 (trainable: #64) +[2025-10-25 04:11:47,960][main][INFO] - - time_proj: #0 (trainable: #0) +[2025-10-25 04:11:47,960][main][INFO] - - time_embedding: #80.5K (trainable: #80.5K) +[2025-10-25 04:11:47,960][main][INFO] - - ada_ctx_proj: #54.1K (trainable: #54.1K) +[2025-10-25 04:11:47,961][main][INFO] - - down_blocks: #3.0M (trainable: #3.0M) +[2025-10-25 04:11:47,962][main][INFO] - - mid_block: #3.4M (trainable: #3.4M) +[2025-10-25 04:11:47,963][main][INFO] - - up_blocks: #6.9M (trainable: #6.9M) +[2025-10-25 04:11:47,963][main][INFO] - - conv_norm_out: #128 (trainable: #128) +[2025-10-25 04:11:47,964][main][INFO] - - conv_out_act: #0 (trainable: #0) +[2025-10-25 04:11:47,964][main][INFO] - - conv_out: #1.7K (trainable: #1.7K) +[2025-10-25 04:11:47,969][main][INFO] - ae: EMAWrapper( + (model): DistributedDataParallel( + (module): DC_SSDAE( + (encoder): DCEncoder( + (project_in): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (stages): ModuleList( + (0): OpSequential( + (op_list): ModuleList() + ) + (1): OpSequential( + (op_list): ModuleList( + (0-4): 5 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (5): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (2): OpSequential( + (op_list): ModuleList( + (0-9): 10 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (10): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(512, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (3): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (4): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (4): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + (4): ResidualBlock( + (main): ConvPixelUnshuffleDownSampleLayer( + (conv): ConvLayer( + (conv): Conv2d(1024, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (shortcut): PixelUnshuffleChannelAveragingDownSampleLayer() + ) + ) + ) + (5): OpSequential( + (op_list): ModuleList( + (0-3): 4 x ResidualBlock( + (main): ResBlock( + (conv1): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (act): SiLU() + ) + (conv2): ConvLayer( + (conv): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False) + ) + ) + (shortcut): IdentityLayer() + ) + ) + ) + ) + (project_out): OpSequential( + (op_list): ModuleList( + (0): ConvLayer( + (conv): Conv2d(1024, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + ) + (decoder): UViTDecoder( + (conv_in_img): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in_z): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (batch_norm_z): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (time_proj): Timesteps() + (time_embedding): TimestepEmbedding( + (linear_1): Linear(in_features=64, out_features=256, bias=True) + (act): SiLU() + (linear_2): Linear(in_features=256, out_features=256, bias=True) + ) + (ada_ctx_proj): Sequential( + (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): SiLU() + (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (down_blocks): ModuleList( + (0): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (1): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(64, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (2): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(96, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (3): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + ) + ) + (mid_block): UViTMiddleTransformer( + (proj_in): Linear(in_features=160, out_features=160, bias=True) + (transformer_blocks): ModuleList( + (0-7): 8 x TransformerBlock( + (norm1): AdaLayerNorm( + (silu): SiLU() + (linear): Linear(in_features=64, out_features=320, bias=True) + (norm): LayerNorm((160,), eps=1e-05, elementwise_affine=False) + ) + (attn1): Attention( + (to_q): Linear(in_features=160, out_features=160, bias=False) + (to_k): Linear(in_features=160, out_features=160, bias=False) + (to_v): Linear(in_features=160, out_features=160, bias=False) + (out_proj): Linear(in_features=160, out_features=160, bias=True) + (out_drop): Dropout(p=0.0, inplace=False) + ) + (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True) + (ff): FeedForward( + (proj_in_act): GEGLU( + (proj): Linear(in_features=160, out_features=1280, bias=True) + ) + (drop): Dropout(p=0.0, inplace=False) + (proj_out): Linear(in_features=640, out_features=160, bias=True) + ) + (relative_position_bias): RelativePositionBias() + ) + ) + (proj_out): Linear(in_features=160, out_features=160, bias=True) + (norm): GroupNorm(32, 160, eps=1e-06, affine=True) + ) + (up_blocks): ModuleList( + (0): UpBlock2D( + (resnets): ModuleList( + (0-2): 3 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (1): UpBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (2): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (3): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + (1-2): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + ) + (conv_norm_out): GroupNorm(32, 64, eps=1e-05, affine=True) + (conv_out_act): SiLU() + (conv_out): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (ema): EMA(ema_model=DC_SSDAE, decay=0.999, start_iter=50000) +) +[2025-10-25 04:11:47,970][main][INFO] - aux_losses parameters count: +[2025-10-25 04:11:47,971][main][INFO] - Total: #96.7M (trainable: #145.9K) +[2025-10-25 04:11:47,972][main][INFO] - - repa_loss: #82.7M (trainable: #145.9K) +[2025-10-25 04:11:47,972][main][INFO] - - lpips_loss: #14.0M (trainable: #0) +[2025-10-25 04:11:47,973][main][INFO] - aux_losses: DistributedDataParallel( + (module): SSDDLosses( + (repa_loss): REPALoss( + (features_extractor): Frozen(DinoEncoder/Dinov2Model) + (repa_mlp): Sequential( + (0): Linear(in_features=160, out_features=160, bias=True) + (1): SiLU() + (2): Linear(in_features=160, out_features=768, bias=True) + ) + (repa_loss): CosineSimilarity() + ) + (lpips_loss): Frozen(LPIPS) + ) +) +[2025-10-25 04:11:47,978][main][INFO] - Optimizer for autoencoder: RAdamScheduleFree ( +Parameter Group 0 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.001 + weight_lr_power: 2.0 + weight_sum: 0.0 + +Parameter Group 1 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.0 + weight_lr_power: 2.0 + weight_sum: 0.0 +) +[2025-10-25 04:11:47,983][main][INFO] - No training state found to resume from None +[2025-10-25 04:11:47,984][main][INFO] - ====================== RUNNING TASK train +[2025-10-25 04:11:47,984][main][INFO] - Starting training +[2025-10-25 04:11:47,984][main][INFO] - Batch size of 192 (24 per GPU, 1 acumulation step(s) 8 process(es)) +[2025-10-25 04:11:47,993][main][INFO] - --- + + +[2025-10-25 04:11:47,993][main][INFO] - [T_total=00:00:26 | T_train=00:00:00] Start epoch 0 +[2025-10-25 07:13:55,982][main][INFO] - [T_total=03:02:34 | T_train=03:02:07 | T_epoch=03:02:07] End of epoch 0 (6666 steps) train loss 67151.8 +[2025-10-25 07:13:55,984][main][INFO] - [Epoch 0] All losses: [[diffusion=0.124198 ; kl=6.71513e+10 ; lpips=0.362462 ; repa=0.669115]] +[2025-10-25 07:17:26,094][main][INFO] - [Epoch 1] Test metrics: [[MSE=55.56 | MAE=0.1763 | LPIPS=0.4494 | PSNR=12.55 | SSIM=0.2492 | dreamsim=0.6301 | FID=116.4]] +[2025-10-25 07:17:26,096][main][INFO] - [Epoch 1] Best metrics: [[min_MSE=55.56 | min_MAE=0.1763 | min_LPIPS=0.4494 | max_PSNR=12.55 | max_SSIM=0.2492 | min_dreamsim=0.6301 | min_FID=116.4]] +[2025-10-25 07:17:26,097][main][DEBUG] - Writing images to disk... +[2025-10-25 07:17:27,322][main][DEBUG] - Image(s) saved on disk +[2025-10-25 07:17:27,651][main][INFO] - End of epoch timers: [T_train=03:02:07 | T_epoch=03:02:07 | T_eval=00:03:31 | T_total=03:06:06] +[2025-10-25 07:17:27,652][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-25 07:17:38,934][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-25 07:17:49,112][main][INFO] - --- + + +[2025-10-25 07:17:49,113][main][INFO] - [T_total=03:06:27 | T_train=03:02:07] Start epoch 1 +[2025-10-25 10:19:46,050][main][INFO] - [T_total=06:08:24 | T_train=06:04:04 | T_epoch=03:01:56] End of epoch 1 (13332 steps) train loss 25.5196 +[2025-10-25 10:19:46,051][main][INFO] - [Epoch 1] All losses: [[diffusion=0.0923335 ; kl=2.51396e+07 ; lpips=0.280141 ; repa=0.590596]] +[2025-10-25 10:23:14,102][main][INFO] - [Epoch 2] Test metrics: [[MSE=47.13 | MAE=0.1616 | LPIPS=0.3504 | PSNR=13.27 | SSIM=0.2853 | dreamsim=0.5293 | FID=90.87]] +[2025-10-25 10:23:14,104][main][INFO] - [Epoch 2] Best metrics: [[min_MSE=47.13 | min_MAE=0.1616 | min_LPIPS=0.3504 | max_PSNR=13.27 | max_SSIM=0.2853 | min_dreamsim=0.5293 | min_FID=90.87]] +[2025-10-25 10:23:14,106][main][DEBUG] - Writing images to disk... +[2025-10-25 10:23:15,187][main][DEBUG] - Image(s) saved on disk +[2025-10-25 10:23:15,438][main][INFO] - End of epoch timers: [T_train=06:04:04 | T_epoch=03:01:56 | T_eval=00:07:00 | T_total=06:11:54] +[2025-10-25 10:23:15,439][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-25 10:23:26,115][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-25 10:23:36,944][main][INFO] - --- + + +[2025-10-25 10:23:36,945][main][INFO] - [T_total=06:12:15 | T_train=06:04:04] Start epoch 2 +[2025-10-25 13:25:33,577][main][INFO] - [T_total=09:14:12 | T_train=09:06:01 | T_epoch=03:01:56] End of epoch 2 (19998 steps) train loss 3183.13 +[2025-10-25 13:25:33,578][main][INFO] - [Epoch 2] All losses: [[diffusion=0.0880481 ; kl=3.18278e+09 ; lpips=0.243993 ; repa=0.561169]] +[2025-10-25 13:29:01,907][main][INFO] - [Epoch 3] Test metrics: [[MSE=39.14 | MAE=0.1464 | LPIPS=0.2767 | PSNR=14.07 | SSIM=0.3174 | dreamsim=0.4307 | FID=65.23]] +[2025-10-25 13:29:01,909][main][INFO] - [Epoch 3] Best metrics: [[min_MSE=39.14 | min_MAE=0.1464 | min_LPIPS=0.2767 | max_PSNR=14.07 | max_SSIM=0.3174 | min_dreamsim=0.4307 | min_FID=65.23]] +[2025-10-25 13:29:01,910][main][DEBUG] - Writing images to disk... +[2025-10-25 13:29:02,747][main][DEBUG] - Image(s) saved on disk +[2025-10-25 13:29:02,993][main][INFO] - End of epoch timers: [T_train=09:06:01 | T_epoch=03:01:56 | T_eval=00:10:30 | T_total=09:17:41] +[2025-10-25 13:29:02,995][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-25 13:29:15,297][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-25 13:29:27,004][main][INFO] - --- + + +[2025-10-25 13:29:27,025][main][INFO] - [T_total=09:18:05 | T_train=09:06:01] Start epoch 3 +[2025-10-25 16:31:23,501][main][INFO] - [T_total=12:20:02 | T_train=12:07:58 | T_epoch=03:01:56] End of epoch 3 (26664 steps) train loss 3955.96 +[2025-10-25 16:31:23,503][main][INFO] - [Epoch 3] All losses: [[diffusion=0.0853818 ; kl=3.95562e+09 ; lpips=0.225845 ; repa=0.542498]] +[2025-10-25 16:34:51,652][main][INFO] - [Epoch 4] Test metrics: [[MSE=33.1 | MAE=0.1326 | LPIPS=0.2336 | PSNR=14.8 | SSIM=0.3397 | dreamsim=0.3658 | FID=48.75]] +[2025-10-25 16:34:51,653][main][INFO] - [Epoch 4] Best metrics: [[min_MSE=33.1 | min_MAE=0.1326 | min_LPIPS=0.2336 | max_PSNR=14.8 | max_SSIM=0.3397 | min_dreamsim=0.3658 | min_FID=48.75]] +[2025-10-25 16:34:51,655][main][DEBUG] - Writing images to disk... +[2025-10-25 16:34:52,487][main][DEBUG] - Image(s) saved on disk +[2025-10-25 16:34:52,740][main][INFO] - End of epoch timers: [T_train=12:07:58 | T_epoch=03:01:56 | T_eval=00:13:59 | T_total=12:23:31] +[2025-10-25 16:34:52,742][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-25 16:35:04,325][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-25 16:35:16,162][main][INFO] - --- + + +[2025-10-25 16:35:16,163][main][INFO] - [T_total=12:23:55 | T_train=12:07:58] Start epoch 4 +[2025-10-25 19:37:10,375][main][INFO] - [T_total=15:25:49 | T_train=15:09:52 | T_epoch=03:01:54] End of epoch 4 (33330 steps) train loss 323.739 +[2025-10-25 19:37:10,376][main][INFO] - [Epoch 4] All losses: [[diffusion=0.0845231 ; kl=3.23414e+08 ; lpips=0.216933 ; repa=0.530433]] +[2025-10-25 19:40:38,194][main][INFO] - [Epoch 5] Test metrics: [[MSE=30.03 | MAE=0.1249 | LPIPS=0.2154 | PSNR=15.22 | SSIM=0.351 | dreamsim=0.3335 | FID=40.99]] +[2025-10-25 19:40:38,195][main][INFO] - [Epoch 5] Best metrics: [[min_MSE=30.03 | min_MAE=0.1249 | min_LPIPS=0.2154 | max_PSNR=15.22 | max_SSIM=0.351 | min_dreamsim=0.3335 | min_FID=40.99]] +[2025-10-25 19:40:38,196][main][DEBUG] - Writing images to disk... +[2025-10-25 19:40:39,044][main][DEBUG] - Image(s) saved on disk +[2025-10-25 19:40:39,245][main][INFO] - End of epoch timers: [T_train=15:09:52 | T_epoch=03:01:54 | T_eval=00:17:27 | T_total=15:29:18] +[2025-10-25 19:40:39,246][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-25 19:40:51,215][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-25 19:41:02,678][main][INFO] - --- + + +[2025-10-25 19:41:02,679][main][INFO] - [T_total=15:29:41 | T_train=15:09:52] Start epoch 5 +[2025-10-25 22:43:01,888][main][INFO] - [T_total=18:31:40 | T_train=18:11:51 | T_epoch=03:01:59] End of epoch 5 (39996 steps) train loss 12156.3 +[2025-10-25 22:43:01,890][main][INFO] - [Epoch 5] All losses: [[diffusion=0.0830056 ; kl=1.2156e+10 ; lpips=0.208209 ; repa=0.519751]] +[2025-10-25 22:46:30,029][main][INFO] - [Epoch 6] Test metrics: [[MSE=27.85 | MAE=0.1193 | LPIPS=0.2014 | PSNR=15.55 | SSIM=0.3646 | dreamsim=0.3094 | FID=35.9]] +[2025-10-25 22:46:30,031][main][INFO] - [Epoch 6] Best metrics: [[min_MSE=27.85 | min_MAE=0.1193 | min_LPIPS=0.2014 | max_PSNR=15.55 | max_SSIM=0.3646 | min_dreamsim=0.3094 | min_FID=35.9]] +[2025-10-25 22:46:30,032][main][DEBUG] - Writing images to disk... +[2025-10-25 22:46:30,873][main][DEBUG] - Image(s) saved on disk +[2025-10-25 22:46:31,075][main][INFO] - End of epoch timers: [T_train=18:11:51 | T_epoch=03:01:59 | T_eval=00:20:57 | T_total=18:35:09] +[2025-10-25 22:46:31,076][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-25 22:46:42,024][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-25 22:46:54,114][main][INFO] - --- + + +[2025-10-25 22:46:54,115][main][INFO] - [T_total=18:35:32 | T_train=18:11:51] Start epoch 6 +[2025-10-26 01:48:51,311][main][INFO] - [T_total=21:37:30 | T_train=21:13:48 | T_epoch=03:01:57] End of epoch 6 (46662 steps) train loss 482.556 +[2025-10-26 01:48:51,313][main][INFO] - [Epoch 6] All losses: [[diffusion=0.0830356 ; kl=4.82242e+08 ; lpips=0.205621 ; repa=0.513838]] +[2025-10-26 01:52:19,589][main][INFO] - [Epoch 7] Test metrics: [[MSE=26.51 | MAE=0.1158 | LPIPS=0.1917 | PSNR=15.77 | SSIM=0.3748 | dreamsim=0.2921 | FID=31.73]] +[2025-10-26 01:52:19,591][main][INFO] - [Epoch 7] Best metrics: [[min_MSE=26.51 | min_MAE=0.1158 | min_LPIPS=0.1917 | max_PSNR=15.77 | max_SSIM=0.3748 | min_dreamsim=0.2921 | min_FID=31.73]] +[2025-10-26 01:52:19,592][main][DEBUG] - Writing images to disk... +[2025-10-26 01:52:20,433][main][DEBUG] - Image(s) saved on disk +[2025-10-26 01:52:20,683][main][INFO] - End of epoch timers: [T_train=21:13:48 | T_epoch=03:01:57 | T_eval=00:24:26 | T_total=21:40:59] +[2025-10-26 01:52:20,685][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-26 01:52:32,727][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-26 01:52:44,520][main][INFO] - --- + + +[2025-10-26 01:52:44,521][main][INFO] - [T_total=21:41:23 | T_train=21:13:48] Start epoch 7 +[2025-10-26 04:54:39,982][main][INFO] - [T_total=24:43:18 | T_train=24:15:44 | T_epoch=03:01:55] End of epoch 7 (53328 steps) train loss 3.64232 +[2025-10-26 04:54:39,984][main][INFO] - [Epoch 7] All losses: [[diffusion=0.0819999 ; kl=3.33403e+06 ; lpips=0.199347 ; repa=0.506464]] +[2025-10-26 04:58:08,198][main][INFO] - [Epoch 8] Test metrics: [[MSE=25.63 | MAE=0.1131 | LPIPS=0.1843 | PSNR=15.91 | SSIM=0.3824 | dreamsim=0.2779 | FID=28.82]] +[2025-10-26 04:58:08,200][main][INFO] - [Epoch 8] Best metrics: [[min_MSE=25.63 | min_MAE=0.1131 | min_LPIPS=0.1843 | max_PSNR=15.91 | max_SSIM=0.3824 | min_dreamsim=0.2779 | min_FID=28.82]] +[2025-10-26 04:58:08,201][main][DEBUG] - Writing images to disk... +[2025-10-26 04:58:09,050][main][DEBUG] - Image(s) saved on disk +[2025-10-26 04:58:09,290][main][INFO] - End of epoch timers: [T_train=24:15:44 | T_epoch=03:01:55 | T_eval=00:27:55 | T_total=24:46:48] +[2025-10-26 04:58:09,291][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-26 04:58:20,610][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-26 04:58:31,832][main][INFO] - --- + + +[2025-10-26 04:58:31,833][main][INFO] - [T_total=24:47:10 | T_train=24:15:44] Start epoch 8 +[2025-10-26 08:00:23,920][main][INFO] - [T_total=27:49:02 | T_train=27:17:36 | T_epoch=03:01:52] End of epoch 8 (59994 steps) train loss 6.31716 +[2025-10-26 08:00:23,922][main][INFO] - [Epoch 8] All losses: [[diffusion=0.0804194 ; kl=6.01513e+06 ; lpips=0.19339 ; repa=0.49965]] +[2025-10-26 08:03:51,684][main][INFO] - [Epoch 9] Test metrics: [[MSE=25.07 | MAE=0.1114 | LPIPS=0.1784 | PSNR=16.01 | SSIM=0.3893 | dreamsim=0.2661 | FID=26.23]] +[2025-10-26 08:03:51,686][main][INFO] - [Epoch 9] Best metrics: [[min_MSE=25.07 | min_MAE=0.1114 | min_LPIPS=0.1784 | max_PSNR=16.01 | max_SSIM=0.3893 | min_dreamsim=0.2661 | min_FID=26.23]] +[2025-10-26 08:03:51,687][main][DEBUG] - Writing images to disk... +[2025-10-26 08:03:52,532][main][DEBUG] - Image(s) saved on disk +[2025-10-26 08:03:52,734][main][INFO] - End of epoch timers: [T_train=27:17:36 | T_epoch=03:01:52 | T_eval=00:31:23 | T_total=27:52:31] +[2025-10-26 08:03:52,735][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-26 08:04:04,116][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-26 08:04:15,898][main][INFO] - --- + + +[2025-10-26 08:04:15,899][main][INFO] - [T_total=27:52:54 | T_train=27:17:36] Start epoch 9 +[2025-10-26 11:06:10,742][main][INFO] - [T_total=30:54:49 | T_train=30:19:31 | T_epoch=03:01:54] End of epoch 9 (66660 steps) train loss 334.899 +[2025-10-26 11:06:10,744][main][INFO] - [Epoch 9] All losses: [[diffusion=0.0824992 ; kl=3.34592e+08 ; lpips=0.198936 ; repa=0.500318]] +[2025-10-26 11:09:38,812][main][INFO] - [Epoch 10] Test metrics: [[MSE=24.11 | MAE=0.1086 | LPIPS=0.1724 | PSNR=16.18 | SSIM=0.3956 | dreamsim=0.255 | FID=23.81]] +[2025-10-26 11:09:38,814][main][INFO] - [Epoch 10] Best metrics: [[min_MSE=24.11 | min_MAE=0.1086 | min_LPIPS=0.1724 | max_PSNR=16.18 | max_SSIM=0.3956 | min_dreamsim=0.255 | min_FID=23.81]] +[2025-10-26 11:09:38,815][main][DEBUG] - Writing images to disk... +[2025-10-26 11:09:39,906][main][DEBUG] - Image(s) saved on disk +[2025-10-26 11:09:40,115][main][INFO] - End of epoch timers: [T_train=30:19:31 | T_epoch=03:01:54 | T_eval=00:34:53 | T_total=30:58:18] +[2025-10-26 11:09:40,117][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/last +[2025-10-26 11:09:51,824][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_dc_f32c32_FM/checkpoints/best +[2025-10-26 11:10:02,555][main][INFO] - --- + + +[2025-10-26 11:10:02,556][main][INFO] - [T_total=30:58:41 | T_train=30:19:31] Start epoch 10 diff --git a/train_enc_dc_f32c32_FM/tensorboard_logs/events.out.tfevents.1761365481.98629b852e50.58214.0 b/train_enc_dc_f32c32_FM/tensorboard_logs/events.out.tfevents.1761365481.98629b852e50.58214.0 new file mode 100644 index 0000000000000000000000000000000000000000..2269cc7e29c7cefb36b826b51ac1081b15dbfc9b --- /dev/null +++ b/train_enc_dc_f32c32_FM/tensorboard_logs/events.out.tfevents.1761365481.98629b852e50.58214.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98a52f85f89f04fc56535b20ea49ea98c2df0958f3a88d2b6eb2aea66f63f5d4 +size 35128118 diff --git a/train_enc_vq_f8c4_FM/.hydra/config.yaml b/train_enc_vq_f8c4_FM/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c107f82650fee8a5f65daa417c32480f6138cfb7 --- /dev/null +++ b/train_enc_vq_f8c4_FM/.hydra/config.yaml @@ -0,0 +1,52 @@ +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_vq_f8c4_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f8c4 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: vq + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 diff --git a/train_enc_vq_f8c4_FM/.hydra/hydra.yaml b/train_enc_vq_f8c4_FM/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..746fc5392e990f638b99bac9b8fb244976ea04d9 --- /dev/null +++ b/train_enc_vq_f8c4_FM/.hydra/hydra.yaml @@ -0,0 +1,172 @@ +hydra: + run: + dir: ${run_dir} + sweep: + dir: ${run_dir} + subdir: multirun_${hydra:job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + colorlog: + (): colorlog.ColoredFormatter + format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] + - %(message)s' + log_colors: + DEBUG: purple + INFO: green + WARNING: yellow + ERROR: red + CRITICAL: red + handlers: + console: + class: logging.StreamHandler + formatter: colorlog + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra:runtime.output_dir}/${hydra:job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - run_name=train_enc_vq_f8c4_FM + - dataset.im_size=128 + - dataset.aug_scale=2 + - training.epochs=20 + - dc_ssdae.encoder_train=true + job: + name: main + chdir: null + override_dirname: dataset.aug_scale=2,dataset.im_size=128,dc_ssdae.encoder_train=true,run_name=train_enc_vq_f8c4_FM,training.epochs=20 + id: ??? + num: ??? + config_name: vq_f8c4_FM + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /workspace/DC_SSDAE + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /workspace/DC_SSDAE/config + schema: file + provider: main + - path: hydra_plugins.hydra_colorlog.conf + schema: pkg + provider: hydra-colorlog + - path: '' + schema: structured + provider: schema + output_dir: /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM + choices: + hydra/env: default + hydra/callbacks: null + hydra/job_logging: colorlog + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/train_enc_vq_f8c4_FM/.hydra/overrides.yaml b/train_enc_vq_f8c4_FM/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a2d30f962f489629f27f36087f0acba8018708a3 --- /dev/null +++ b/train_enc_vq_f8c4_FM/.hydra/overrides.yaml @@ -0,0 +1,5 @@ +- run_name=train_enc_vq_f8c4_FM +- dataset.im_size=128 +- dataset.aug_scale=2 +- training.epochs=20 +- dc_ssdae.encoder_train=true diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/custom_checkpoint_0.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/custom_checkpoint_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..59eba57d7ca55294dce5cc2dd5d8b865600cc0c1 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/custom_checkpoint_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc2db4c4c3bf3a021b8b6f1c7f013c302e92e7d0a21be4c017ab4e4ff7a6abc1 +size 2357 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/model.safetensors b/train_enc_vq_f8c4_FM/checkpoints/best/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65b51040a7e64cac43d42b13825ed34e2b88b9f5 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5421e731138cc7a66aff3318fcb79afb96439b3fd952e9f797ad26a771d30c6c +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/model_1.safetensors b/train_enc_vq_f8c4_FM/checkpoints/best/model_1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae9ab31b4cb2876f29e5cd18710068c375d5dd31 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/model_1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77afb73f7b763a10b674f7e3c0867a2fedaab3496f197ec2b8fef2743a8d961d +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/model_2.safetensors b/train_enc_vq_f8c4_FM/checkpoints/best/model_2.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b96e7ba0db03b337e0fe39bbec1e91dc58d79be8 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/model_2.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39f3a92103ad48b198373b1c0ee1a8f0434ad544651bf360d5a32bf9b65a716 +size 598032 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/model_ae.safetensors b/train_enc_vq_f8c4_FM/checkpoints/best/model_ae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65b51040a7e64cac43d42b13825ed34e2b88b9f5 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/model_ae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5421e731138cc7a66aff3318fcb79afb96439b3fd952e9f797ad26a771d30c6c +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/model_ae_ema.safetensors b/train_enc_vq_f8c4_FM/checkpoints/best/model_ae_ema.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae9ab31b4cb2876f29e5cd18710068c375d5dd31 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/model_ae_ema.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77afb73f7b763a10b674f7e3c0867a2fedaab3496f197ec2b8fef2743a8d961d +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/model_aux_losses.safetensors b/train_enc_vq_f8c4_FM/checkpoints/best/model_aux_losses.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b96e7ba0db03b337e0fe39bbec1e91dc58d79be8 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/model_aux_losses.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39f3a92103ad48b198373b1c0ee1a8f0434ad544651bf360d5a32bf9b65a716 +size 598032 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/optimizer.bin b/train_enc_vq_f8c4_FM/checkpoints/best/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..6fd690f25c78c0b6e8b14458f42b34c14d148f8b --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1689f0bbf0a09bd5c6cf3c93cf01e8f99b96804e0a5d2adf57566051e88ae199 +size 387527499 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_0.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ebc5fd41011910a081d6076fae6284cd53e895d6 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f3395dc5634dbb587fdc7286987299cfd1ad7661207c6cd6b395375239ff44 +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_1.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..68969a49769f85ad09a25362469efb3696f9646e --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dd8d2e5218a4186dc7df03d38f7dfb1ebb3bba19de31c34ba70b17c1393718e +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_2.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0d968d78152c2052c0189fae6501f8f94dffa41a --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bfc7ff98fbd761db1508c7417dacc28965d208ee52e9982d6eaa291be2a3439 +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_3.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9405185765f5062022522ea3e56928c34860510f --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a6de37e54c36a767be7a2b81bf6190551027a8b79336e81df01edfa29a80d74 +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_4.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..27e76c7b1ccf594e27f98e6fd50818f9d293fea8 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b911a96660a17318b1c5a522d8c2379136bf18d27cec236d5c92df7eec024b +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_5.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..97a3ec7784a2e5f578886db01cc9c7f746e2778e --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59339e0955847f1c5fd4c014a19f1fb3aec66d775a24d9166cc10657ec5fd89f +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_6.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..694555f59926ab1d6bb3bb4672aff9f8c61f0eb1 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76572c507253cb2b48ca4002a595fd26ec0872e2aebd8278ed7d67e44e27e13e +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/best/random_states_7.pkl b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fd90dab21c70bdaf918040674f4c5d86c293f0a2 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/best/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623fca2133507ca062e1c8047d5de5cde0d85869d9221924420e3a64c0c7087c +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/custom_checkpoint_0.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/custom_checkpoint_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..4042b1095332ee88b4b996c493105b6bf9beee14 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/custom_checkpoint_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c71126986dbf9bfac6f9c24d380177a275c2cf323ad67f0f99948925f5da9f4f +size 2357 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/model.safetensors b/train_enc_vq_f8c4_FM/checkpoints/last/model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65b51040a7e64cac43d42b13825ed34e2b88b9f5 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5421e731138cc7a66aff3318fcb79afb96439b3fd952e9f797ad26a771d30c6c +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/model_1.safetensors b/train_enc_vq_f8c4_FM/checkpoints/last/model_1.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae9ab31b4cb2876f29e5cd18710068c375d5dd31 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/model_1.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77afb73f7b763a10b674f7e3c0867a2fedaab3496f197ec2b8fef2743a8d961d +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/model_2.safetensors b/train_enc_vq_f8c4_FM/checkpoints/last/model_2.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b96e7ba0db03b337e0fe39bbec1e91dc58d79be8 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/model_2.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39f3a92103ad48b198373b1c0ee1a8f0434ad544651bf360d5a32bf9b65a716 +size 598032 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/model_ae.safetensors b/train_enc_vq_f8c4_FM/checkpoints/last/model_ae.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..65b51040a7e64cac43d42b13825ed34e2b88b9f5 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/model_ae.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5421e731138cc7a66aff3318fcb79afb96439b3fd952e9f797ad26a771d30c6c +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/model_ae_ema.safetensors b/train_enc_vq_f8c4_FM/checkpoints/last/model_ae_ema.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae9ab31b4cb2876f29e5cd18710068c375d5dd31 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/model_ae_ema.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77afb73f7b763a10b674f7e3c0867a2fedaab3496f197ec2b8fef2743a8d961d +size 193078740 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/model_aux_losses.safetensors b/train_enc_vq_f8c4_FM/checkpoints/last/model_aux_losses.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b96e7ba0db03b337e0fe39bbec1e91dc58d79be8 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/model_aux_losses.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e39f3a92103ad48b198373b1c0ee1a8f0434ad544651bf360d5a32bf9b65a716 +size 598032 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/optimizer.bin b/train_enc_vq_f8c4_FM/checkpoints/last/optimizer.bin new file mode 100644 index 0000000000000000000000000000000000000000..6fd690f25c78c0b6e8b14458f42b34c14d148f8b --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/optimizer.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1689f0bbf0a09bd5c6cf3c93cf01e8f99b96804e0a5d2adf57566051e88ae199 +size 387527499 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_0.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..ebc5fd41011910a081d6076fae6284cd53e895d6 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_0.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11f3395dc5634dbb587fdc7286987299cfd1ad7661207c6cd6b395375239ff44 +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_1.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_1.pkl new file mode 100644 index 0000000000000000000000000000000000000000..68969a49769f85ad09a25362469efb3696f9646e --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_1.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dd8d2e5218a4186dc7df03d38f7dfb1ebb3bba19de31c34ba70b17c1393718e +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_2.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0d968d78152c2052c0189fae6501f8f94dffa41a --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bfc7ff98fbd761db1508c7417dacc28965d208ee52e9982d6eaa291be2a3439 +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_3.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_3.pkl new file mode 100644 index 0000000000000000000000000000000000000000..9405185765f5062022522ea3e56928c34860510f --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_3.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a6de37e54c36a767be7a2b81bf6190551027a8b79336e81df01edfa29a80d74 +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_4.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_4.pkl new file mode 100644 index 0000000000000000000000000000000000000000..27e76c7b1ccf594e27f98e6fd50818f9d293fea8 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_4.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2b911a96660a17318b1c5a522d8c2379136bf18d27cec236d5c92df7eec024b +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_5.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_5.pkl new file mode 100644 index 0000000000000000000000000000000000000000..97a3ec7784a2e5f578886db01cc9c7f746e2778e --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_5.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59339e0955847f1c5fd4c014a19f1fb3aec66d775a24d9166cc10657ec5fd89f +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_6.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_6.pkl new file mode 100644 index 0000000000000000000000000000000000000000..694555f59926ab1d6bb3bb4672aff9f8c61f0eb1 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_6.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76572c507253cb2b48ca4002a595fd26ec0872e2aebd8278ed7d67e44e27e13e +size 16449 diff --git a/train_enc_vq_f8c4_FM/checkpoints/last/random_states_7.pkl b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_7.pkl new file mode 100644 index 0000000000000000000000000000000000000000..fd90dab21c70bdaf918040674f4c5d86c293f0a2 --- /dev/null +++ b/train_enc_vq_f8c4_FM/checkpoints/last/random_states_7.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:623fca2133507ca062e1c8047d5de5cde0d85869d9221924420e3a64c0c7087c +size 16449 diff --git a/train_enc_vq_f8c4_FM/config.yaml b/train_enc_vq_f8c4_FM/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c107f82650fee8a5f65daa417c32480f6138cfb7 --- /dev/null +++ b/train_enc_vq_f8c4_FM/config.yaml @@ -0,0 +1,52 @@ +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_vq_f8c4_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f8c4 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: vq + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 diff --git a/train_enc_vq_f8c4_FM/main.log b/train_enc_vq_f8c4_FM/main.log new file mode 100644 index 0000000000000000000000000000000000000000..654b457b9dd33218eee3d8f523e114ade8e45bb9 --- /dev/null +++ b/train_enc_vq_f8c4_FM/main.log @@ -0,0 +1,770 @@ +[2025-10-24 11:27:55,703][main][INFO] - Will write tensorboard logs inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/tensorboard_logs +[2025-10-24 11:27:55,722][main][INFO] - Runtime at /workspace/DC_SSDAE +[2025-10-24 11:27:55,723][main][INFO] - Running inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM +[2025-10-24 11:27:55,724][main][INFO] - Running args: ['main.py', 'run_name=train_enc_vq_f8c4_FM', 'dataset.im_size=128', 'dataset.aug_scale=2', 'training.epochs=20', 'dc_ssdae.encoder_train=true'] +[2025-10-24 11:27:55,725][main][INFO] - Command: 'main.py' 'run_name=train_enc_vq_f8c4_FM' 'dataset.im_size=128' 'dataset.aug_scale=2' 'training.epochs=20' 'dc_ssdae.encoder_train=true' +[2025-10-24 11:27:55,726][main][INFO] - Accelerator with 8 processes, running on cuda:0 +[2025-10-24 11:27:55,729][main][INFO] - Hydra configuration: +seed: 0 +task: train +runtime_path: ${hydra:runtime.cwd} +ckpt_dir: ${runtime_path}/runs +run_name: train_enc_vq_f8c4_FM +cache_dir: ${ckpt_dir}/cache +run_dir: ${ckpt_dir}/jobs/${run_name} +checkpoint_path: ${run_dir}/checkpoints +dataset: + imagenet_root: imagenet_data + im_size: 128 + batch_size: 192 + aug_scale: 2 + limit: null +distill_teacher: false +dc_ssdae: + compile: false + checkpoint: null + encoder: f8c4 + encoder_checkpoint: null + encoder_train: true + decoder: S + trainer_type: FM + encoder_type: vq + sampler: + steps: 10 + ema: + decay: 0.999 + start_iter: 50000 +aux_losses: + compile: ${dc_ssdae.compile} + repa: + i_extract: 4 + n_layers: 2 + lpips: true +training: + sdpa_kernel: 2 + mixed_precision: bf16 + grad_accumulate: 1 + grad_clip: 0.1 + epochs: 20 + eval_freq: 1 + save_on_best: FID + log_freq: 100 + lr: 0.0003 + weight_decay: 0.001 +losses: + diffusion: 1 + repa: 0.25 + lpips: 0.5 + kl: 1.0e-06 +show_samples: 8 + + + +[2025-10-24 11:28:09,494][main][INFO] - Loaded ImageNet dataset: {'train': Dataset ImageNet + Number of datapoints: 1279867 + Root location: ../../../imagenet_data + Split: train + StandardTransform +Transform: Compose( + RandomResize(min_size=128, max_size=256, interpolation=InterpolationMode.LANCZOS, antialias=True) + RandomCrop(size=(128, 128), pad_if_needed=False, fill=0, padding_mode=constant) + RandomHorizontalFlip(p=0.5) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + ), 'test': Dataset ImageNet + Number of datapoints: 49950 + Root location: ../../../imagenet_data + Split: validation + StandardTransform +Transform: Compose( + Resize(size=[128], interpolation=InterpolationMode.BILINEAR, antialias=True) + CenterCrop(size=(128, 128)) + ToImage() + ToDtype(scale=True) + Normalize(mean=[0.5], std=[0.5], inplace=False) + )} +[2025-10-24 11:28:18,537][main][INFO] - ae parameters count: +[2025-10-24 11:28:18,540][main][INFO] - Total: #46.0M (trainable: #46.0M) +[2025-10-24 11:28:18,541][main][INFO] - - encoder: #32.6M (trainable: #32.6M) +[2025-10-24 11:28:18,542][main][INFO] - - conv_in: #3.5K (trainable: #3.5K) +[2025-10-24 11:28:18,543][main][INFO] - - down: #22.5M (trainable: #22.5M) +[2025-10-24 11:28:18,543][main][INFO] - - mid: #10.0M (trainable: #10.0M) +[2025-10-24 11:28:18,544][main][INFO] - - norm_out: #1.0K (trainable: #1.0K) +[2025-10-24 11:28:18,545][main][INFO] - - act_out: #0 (trainable: #0) +[2025-10-24 11:28:18,545][main][INFO] - - conv_out: #36.0K (trainable: #36.0K) +[2025-10-24 11:28:18,546][main][INFO] - - out_proj: #72 (trainable: #72) +[2025-10-24 11:28:18,547][main][INFO] - - decoder: #13.4M (trainable: #13.4M) +[2025-10-24 11:28:18,548][main][INFO] - - conv_in_img: #896 (trainable: #896) +[2025-10-24 11:28:18,548][main][INFO] - - conv_in_z: #1.2K (trainable: #1.2K) +[2025-10-24 11:28:18,549][main][INFO] - - conv_in: #36.1K (trainable: #36.1K) +[2025-10-24 11:28:18,550][main][INFO] - - batch_norm_z: #8 (trainable: #8) +[2025-10-24 11:28:18,550][main][INFO] - - time_proj: #0 (trainable: #0) +[2025-10-24 11:28:18,551][main][INFO] - - time_embedding: #80.5K (trainable: #80.5K) +[2025-10-24 11:28:18,551][main][INFO] - - ada_ctx_proj: #38.4K (trainable: #38.4K) +[2025-10-24 11:28:18,552][main][INFO] - - down_blocks: #3.0M (trainable: #3.0M) +[2025-10-24 11:28:18,553][main][INFO] - - mid_block: #3.4M (trainable: #3.4M) +[2025-10-24 11:28:18,554][main][INFO] - - up_blocks: #6.9M (trainable: #6.9M) +[2025-10-24 11:28:18,554][main][INFO] - - conv_norm_out: #128 (trainable: #128) +[2025-10-24 11:28:18,555][main][INFO] - - conv_out_act: #0 (trainable: #0) +[2025-10-24 11:28:18,555][main][INFO] - - conv_out: #1.7K (trainable: #1.7K) +[2025-10-24 11:28:18,557][main][INFO] - ae: EMAWrapper( + (model): DistributedDataParallel( + (module): DC_SSDAE( + (encoder): VQEncoder( + (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (down): ModuleList( + (0): Module( + (block): ModuleList( + (0-1): 2 x VQGResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): VQGDownsample( + (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (1): Module( + (block): ModuleList( + (0): VQGResnetBlock( + (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): VQGResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): VQGDownsample( + (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (2): Module( + (block): ModuleList( + (0): VQGResnetBlock( + (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): VQGResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + (downsample): VQGDownsample( + (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(2, 2)) + ) + ) + (3): Module( + (block): ModuleList( + (0-1): 2 x VQGResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (attn): ModuleList() + ) + ) + (mid): Module( + (block_1): VQGResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (attn_1): VQGAttnBlock( + (norm): GroupNorm(32, 512, eps=1e-06, affine=True) + (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (block_2): VQGResnetBlock( + (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) + (act1): SwishActivation() + (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) + (act2): SwishActivation() + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True) + (act_out): SwishActivation() + (conv_out): Conv2d(512, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (out_proj): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1)) + ) + (decoder): UViTDecoder( + (conv_in_img): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in_z): Conv2d(4, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (conv_in): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (batch_norm_z): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) + (time_proj): Timesteps() + (time_embedding): TimestepEmbedding( + (linear_1): Linear(in_features=64, out_features=256, bias=True) + (act): SiLU() + (linear_2): Linear(in_features=256, out_features=256, bias=True) + ) + (ada_ctx_proj): Sequential( + (0): Conv2d(4, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (1): SiLU() + (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + (down_blocks): ModuleList( + (0): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (1): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 128, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(64, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(64, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (2): DownBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 192, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(96, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(96, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + (downsamplers): ModuleList( + (0): Downsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)) + ) + ) + ) + (3): DownBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + ) + ) + ) + ) + (mid_block): UViTMiddleTransformer( + (proj_in): Linear(in_features=160, out_features=160, bias=True) + (transformer_blocks): ModuleList( + (0-7): 8 x TransformerBlock( + (norm1): AdaLayerNorm( + (silu): SiLU() + (linear): Linear(in_features=64, out_features=320, bias=True) + (norm): LayerNorm((160,), eps=1e-05, elementwise_affine=False) + ) + (attn1): Attention( + (to_q): Linear(in_features=160, out_features=160, bias=False) + (to_k): Linear(in_features=160, out_features=160, bias=False) + (to_v): Linear(in_features=160, out_features=160, bias=False) + (out_proj): Linear(in_features=160, out_features=160, bias=True) + (out_drop): Dropout(p=0.0, inplace=False) + ) + (norm2): LayerNorm((160,), eps=1e-05, elementwise_affine=True) + (ff): FeedForward( + (proj_in_act): GEGLU( + (proj): Linear(in_features=160, out_features=1280, bias=True) + ) + (drop): Dropout(p=0.0, inplace=False) + (proj_out): Linear(in_features=640, out_features=160, bias=True) + ) + (relative_position_bias): RelativePositionBias() + ) + ) + (proj_out): Linear(in_features=160, out_features=160, bias=True) + (norm): GroupNorm(32, 160, eps=1e-06, affine=True) + ) + (up_blocks): ModuleList( + (0): UpBlock2D( + (resnets): ModuleList( + (0-2): 3 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (1): UpBlock2D( + (resnets): ModuleList( + (0-1): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 640, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(320, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(320, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=320, bias=True) + (norm2): GroupNorm(32, 160, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 160, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (2): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 512, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(256, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(256, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (1): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 384, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(192, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + (2): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=192, bias=True) + (norm2): GroupNorm(32, 96, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 96, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + (upsamplers): ModuleList( + (0): Upsample2D( + (conv): Conv2d(96, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (3): UpBlock2D( + (resnets): ModuleList( + (0): ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 320, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(160, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(160, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + (1-2): 2 x ResnetBlock2D( + (norm1): AdaGroupNorm2D( + (ctx_proj): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1)) + ) + (conv1): Conv2d(128, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (time_emb_proj): Linear(in_features=256, out_features=128, bias=True) + (norm2): GroupNorm(32, 64, eps=1e-05, affine=True) + (dropout): Dropout(p=0.0, inplace=False) + (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + (nonlinearity): SiLU() + (conv_shortcut): Conv2d(128, 64, kernel_size=(1, 1), stride=(1, 1)) + ) + ) + ) + ) + (conv_norm_out): GroupNorm(32, 64, eps=1e-05, affine=True) + (conv_out_act): SiLU() + (conv_out): Conv2d(64, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)) + ) + ) + ) + (ema): EMA(ema_model=DC_SSDAE, decay=0.999, start_iter=50000) +) +[2025-10-24 11:28:18,558][main][INFO] - aux_losses parameters count: +[2025-10-24 11:28:18,559][main][INFO] - Total: #96.7M (trainable: #145.9K) +[2025-10-24 11:28:18,560][main][INFO] - - repa_loss: #82.7M (trainable: #145.9K) +[2025-10-24 11:28:18,561][main][INFO] - - lpips_loss: #14.0M (trainable: #0) +[2025-10-24 11:28:18,561][main][INFO] - aux_losses: DistributedDataParallel( + (module): SSDDLosses( + (repa_loss): REPALoss( + (features_extractor): Frozen(DinoEncoder/Dinov2Model) + (repa_mlp): Sequential( + (0): Linear(in_features=160, out_features=160, bias=True) + (1): SiLU() + (2): Linear(in_features=160, out_features=768, bias=True) + ) + (repa_loss): CosineSimilarity() + ) + (lpips_loss): Frozen(LPIPS) + ) +) +[2025-10-24 11:28:18,565][main][INFO] - Optimizer for autoencoder: RAdamScheduleFree ( +Parameter Group 0 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.001 + weight_lr_power: 2.0 + weight_sum: 0.0 + +Parameter Group 1 + betas: (0.9, 0.999) + eps: 1e-08 + foreach: True + k: 0 + lr: 0.0003 + lr_max: -1.0 + r: 0.0 + scheduled_lr: 0.0 + silent_sgd_phase: True + train_mode: False + weight_decay: 0.0 + weight_lr_power: 2.0 + weight_sum: 0.0 +) +[2025-10-24 11:28:18,570][main][INFO] - No training state found to resume from None +[2025-10-24 11:28:18,571][main][INFO] - ====================== RUNNING TASK train +[2025-10-24 11:28:18,572][main][INFO] - Starting training +[2025-10-24 11:28:18,572][main][INFO] - Batch size of 192 (24 per GPU, 1 acumulation step(s) 8 process(es)) +[2025-10-24 11:28:18,582][main][INFO] - --- + + +[2025-10-24 11:28:18,583][main][INFO] - [T_total=00:00:22 | T_train=00:00:00] Start epoch 0 +[2025-10-24 12:31:01,697][main][INFO] - [T_total=01:03:06 | T_train=01:02:43 | T_epoch=01:02:43] End of epoch 0 (6666 steps) train loss 0.379739 +[2025-10-24 12:31:01,700][main][INFO] - [Epoch 0] All losses: [[diffusion=0.0877689 ; kl=3611.6 ; lpips=0.251927 ; repa=0.64958]] +[2025-10-24 12:34:30,741][main][INFO] - [Epoch 1] Test metrics: [[MSE=14.16 | MAE=0.0884 | LPIPS=0.1332 | PSNR=18.49 | SSIM=0.6156 | dreamsim=0.2237 | FID=21.41]] +[2025-10-24 12:34:30,743][main][INFO] - [Epoch 1] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1332 | max_PSNR=18.49 | max_SSIM=0.6156 | min_dreamsim=0.2237 | min_FID=21.41]] +[2025-10-24 12:34:30,744][main][DEBUG] - Writing images to disk... +[2025-10-24 12:34:31,976][main][DEBUG] - Image(s) saved on disk +[2025-10-24 12:34:32,219][main][INFO] - End of epoch timers: [T_train=01:02:43 | T_epoch=01:02:43 | T_eval=00:03:30 | T_total=01:06:36] +[2025-10-24 12:34:32,220][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 12:34:34,794][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 12:34:36,825][main][INFO] - --- + + +[2025-10-24 12:34:36,826][main][INFO] - [T_total=01:06:41 | T_train=01:02:43] Start epoch 1 +[2025-10-24 13:37:11,223][main][INFO] - [T_total=02:09:15 | T_train=02:05:17 | T_epoch=01:02:34] End of epoch 1 (13332 steps) train loss 0.295457 +[2025-10-24 13:37:11,224][main][INFO] - [Epoch 1] All losses: [[diffusion=0.0670763 ; kl=3707.57 ; lpips=0.1699 ; repa=0.55889]] +[2025-10-24 13:40:38,432][main][INFO] - [Epoch 2] Test metrics: [[MSE=18.03 | MAE=0.1014 | LPIPS=0.1322 | PSNR=17.44 | SSIM=0.6126 | dreamsim=0.2068 | FID=15.49]] +[2025-10-24 13:40:38,434][main][INFO] - [Epoch 2] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1322 | max_PSNR=18.49 | max_SSIM=0.6156 | min_dreamsim=0.2068 | min_FID=15.49]] +[2025-10-24 13:40:38,435][main][DEBUG] - Writing images to disk... +[2025-10-24 13:40:39,512][main][DEBUG] - Image(s) saved on disk +[2025-10-24 13:40:39,760][main][INFO] - End of epoch timers: [T_train=02:05:17 | T_epoch=01:02:34 | T_eval=00:06:58 | T_total=02:12:44] +[2025-10-24 13:40:39,762][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 13:40:42,329][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 13:40:44,960][main][INFO] - --- + + +[2025-10-24 13:40:44,961][main][INFO] - [T_total=02:12:49 | T_train=02:05:17] Start epoch 2 +[2025-10-24 14:43:20,521][main][INFO] - [T_total=03:15:24 | T_train=03:07:53 | T_epoch=01:02:35] End of epoch 2 (19998 steps) train loss 0.278652 +[2025-10-24 14:43:20,523][main][INFO] - [Epoch 2] All losses: [[diffusion=0.0649925 ; kl=3695.06 ; lpips=0.154526 ; repa=0.530805]] +[2025-10-24 14:46:47,655][main][INFO] - [Epoch 3] Test metrics: [[MSE=20.47 | MAE=0.1086 | LPIPS=0.13 | PSNR=16.89 | SSIM=0.6123 | dreamsim=0.1954 | FID=12.6]] +[2025-10-24 14:46:47,656][main][INFO] - [Epoch 3] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.13 | max_PSNR=18.49 | max_SSIM=0.6156 | min_dreamsim=0.1954 | min_FID=12.6]] +[2025-10-24 14:46:47,657][main][DEBUG] - Writing images to disk... +[2025-10-24 14:46:48,722][main][DEBUG] - Image(s) saved on disk +[2025-10-24 14:46:48,970][main][INFO] - End of epoch timers: [T_train=03:07:53 | T_epoch=01:02:35 | T_eval=00:10:27 | T_total=03:18:53] +[2025-10-24 14:46:48,971][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 14:46:51,733][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 14:46:54,361][main][INFO] - --- + + +[2025-10-24 14:46:54,362][main][INFO] - [T_total=03:18:58 | T_train=03:07:53] Start epoch 3 +[2025-10-24 15:49:29,065][main][INFO] - [T_total=04:21:33 | T_train=04:10:27 | T_epoch=01:02:34] End of epoch 3 (26664 steps) train loss 0.268908 +[2025-10-24 15:49:29,066][main][INFO] - [Epoch 3] All losses: [[diffusion=0.0635387 ; kl=3692.76 ; lpips=0.146519 ; repa=0.513671]] +[2025-10-24 15:52:56,207][main][INFO] - [Epoch 4] Test metrics: [[MSE=21.69 | MAE=0.112 | LPIPS=0.127 | PSNR=16.64 | SSIM=0.6152 | dreamsim=0.1867 | FID=10.75]] +[2025-10-24 15:52:56,209][main][INFO] - [Epoch 4] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.127 | max_PSNR=18.49 | max_SSIM=0.6156 | min_dreamsim=0.1867 | min_FID=10.75]] +[2025-10-24 15:52:56,210][main][DEBUG] - Writing images to disk... +[2025-10-24 15:52:57,298][main][DEBUG] - Image(s) saved on disk +[2025-10-24 15:52:57,498][main][INFO] - End of epoch timers: [T_train=04:10:27 | T_epoch=01:02:34 | T_eval=00:13:55 | T_total=04:25:01] +[2025-10-24 15:52:57,500][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 15:52:59,857][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 15:53:02,578][main][INFO] - --- + + +[2025-10-24 15:53:02,579][main][INFO] - [T_total=04:25:06 | T_train=04:10:27] Start epoch 4 +[2025-10-24 16:55:38,098][main][INFO] - [T_total=05:27:42 | T_train=05:13:03 | T_epoch=01:02:35] End of epoch 4 (33330 steps) train loss 0.262561 +[2025-10-24 16:55:38,102][main][INFO] - [Epoch 4] All losses: [[diffusion=0.06292 ; kl=3688.83 ; lpips=0.141097 ; repa=0.501614]] +[2025-10-24 16:59:05,267][main][INFO] - [Epoch 5] Test metrics: [[MSE=21.7 | MAE=0.1119 | LPIPS=0.1238 | PSNR=16.64 | SSIM=0.6186 | dreamsim=0.1799 | FID=9.549]] +[2025-10-24 16:59:05,270][main][INFO] - [Epoch 5] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1238 | max_PSNR=18.49 | max_SSIM=0.6186 | min_dreamsim=0.1799 | min_FID=9.549]] +[2025-10-24 16:59:05,271][main][DEBUG] - Writing images to disk... +[2025-10-24 16:59:06,351][main][DEBUG] - Image(s) saved on disk +[2025-10-24 16:59:06,591][main][INFO] - End of epoch timers: [T_train=05:13:03 | T_epoch=01:02:35 | T_eval=00:17:23 | T_total=05:31:10] +[2025-10-24 16:59:06,592][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 16:59:09,275][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 16:59:11,878][main][INFO] - --- + + +[2025-10-24 16:59:11,879][main][INFO] - [T_total=05:31:16 | T_train=05:13:03] Start epoch 5 +[2025-10-24 18:01:46,540][main][INFO] - [T_total=06:33:50 | T_train=06:15:37 | T_epoch=01:02:34] End of epoch 5 (39996 steps) train loss 0.257655 +[2025-10-24 18:01:46,542][main][INFO] - [Epoch 5] All losses: [[diffusion=0.0621701 ; kl=3687.45 ; lpips=0.137338 ; repa=0.492512]] +[2025-10-24 18:05:13,288][main][INFO] - [Epoch 6] Test metrics: [[MSE=21.93 | MAE=0.1125 | LPIPS=0.1213 | PSNR=16.59 | SSIM=0.6218 | dreamsim=0.1746 | FID=8.68]] +[2025-10-24 18:05:13,290][main][INFO] - [Epoch 6] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1213 | max_PSNR=18.49 | max_SSIM=0.6218 | min_dreamsim=0.1746 | min_FID=8.68]] +[2025-10-24 18:05:13,291][main][DEBUG] - Writing images to disk... +[2025-10-24 18:05:14,398][main][DEBUG] - Image(s) saved on disk +[2025-10-24 18:05:14,601][main][INFO] - End of epoch timers: [T_train=06:15:37 | T_epoch=01:02:34 | T_eval=00:20:51 | T_total=06:37:18] +[2025-10-24 18:05:14,604][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 18:05:17,445][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 18:05:19,868][main][INFO] - --- + + +[2025-10-24 18:05:19,869][main][INFO] - [T_total=06:37:24 | T_train=06:15:37] Start epoch 6 +[2025-10-24 19:07:56,898][main][INFO] - [T_total=07:40:01 | T_train=07:18:14 | T_epoch=01:02:37] End of epoch 6 (46662 steps) train loss 0.253725 +[2025-10-24 19:07:56,900][main][INFO] - [Epoch 6] All losses: [[diffusion=0.0615326 ; kl=3688.36 ; lpips=0.134359 ; repa=0.485297]] +[2025-10-24 19:11:24,094][main][INFO] - [Epoch 7] Test metrics: [[MSE=22.28 | MAE=0.1135 | LPIPS=0.1196 | PSNR=16.52 | SSIM=0.624 | dreamsim=0.1707 | FID=8.082]] +[2025-10-24 19:11:24,096][main][INFO] - [Epoch 7] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1196 | max_PSNR=18.49 | max_SSIM=0.624 | min_dreamsim=0.1707 | min_FID=8.082]] +[2025-10-24 19:11:24,097][main][DEBUG] - Writing images to disk... +[2025-10-24 19:11:25,201][main][DEBUG] - Image(s) saved on disk +[2025-10-24 19:11:25,400][main][INFO] - End of epoch timers: [T_train=07:18:14 | T_epoch=01:02:37 | T_eval=00:24:19 | T_total=07:43:29] +[2025-10-24 19:11:25,403][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 19:11:28,161][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 19:11:30,853][main][INFO] - --- + + +[2025-10-24 19:11:30,853][main][INFO] - [T_total=07:43:35 | T_train=07:18:14] Start epoch 7 +[2025-10-24 20:14:06,645][main][INFO] - [T_total=08:46:10 | T_train=08:20:50 | T_epoch=01:02:35] End of epoch 7 (53328 steps) train loss 0.250756 +[2025-10-24 20:14:06,647][main][INFO] - [Epoch 7] All losses: [[diffusion=0.06134 ; kl=3691.13 ; lpips=0.131829 ; repa=0.479243]] +[2025-10-24 20:17:33,486][main][INFO] - [Epoch 8] Test metrics: [[MSE=22.07 | MAE=0.1128 | LPIPS=0.1169 | PSNR=16.56 | SSIM=0.6267 | dreamsim=0.1663 | FID=7.509]] +[2025-10-24 20:17:33,488][main][INFO] - [Epoch 8] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1169 | max_PSNR=18.49 | max_SSIM=0.6267 | min_dreamsim=0.1663 | min_FID=7.509]] +[2025-10-24 20:17:33,489][main][DEBUG] - Writing images to disk... +[2025-10-24 20:17:34,577][main][DEBUG] - Image(s) saved on disk +[2025-10-24 20:17:34,803][main][INFO] - End of epoch timers: [T_train=08:20:50 | T_epoch=01:02:35 | T_eval=00:27:47 | T_total=08:49:39] +[2025-10-24 20:17:34,804][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 20:17:37,556][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 20:17:40,188][main][INFO] - --- + + +[2025-10-24 20:17:40,189][main][INFO] - [T_total=08:49:44 | T_train=08:20:50] Start epoch 8 +[2025-10-24 21:20:17,007][main][INFO] - [T_total=09:52:21 | T_train=09:23:27 | T_epoch=01:02:36] End of epoch 8 (59994 steps) train loss 0.248044 +[2025-10-24 21:20:17,008][main][INFO] - [Epoch 8] All losses: [[diffusion=0.0607502 ; kl=3693.54 ; lpips=0.130101 ; repa=0.474199]] +[2025-10-24 21:23:44,408][main][INFO] - [Epoch 9] Test metrics: [[MSE=21.7 | MAE=0.1117 | LPIPS=0.1145 | PSNR=16.64 | SSIM=0.6294 | dreamsim=0.1627 | FID=7.034]] +[2025-10-24 21:23:44,410][main][INFO] - [Epoch 9] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1145 | max_PSNR=18.49 | max_SSIM=0.6294 | min_dreamsim=0.1627 | min_FID=7.034]] +[2025-10-24 21:23:44,411][main][DEBUG] - Writing images to disk... +[2025-10-24 21:23:45,508][main][DEBUG] - Image(s) saved on disk +[2025-10-24 21:23:45,708][main][INFO] - End of epoch timers: [T_train=09:23:27 | T_epoch=01:02:36 | T_eval=00:31:16 | T_total=09:55:50] +[2025-10-24 21:23:45,709][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 21:23:48,374][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 21:23:51,020][main][INFO] - --- + + +[2025-10-24 21:23:51,020][main][INFO] - [T_total=09:55:55 | T_train=09:23:27] Start epoch 9 +[2025-10-24 22:26:26,704][main][INFO] - [T_total=10:58:31 | T_train=10:26:03 | T_epoch=01:02:35] End of epoch 9 (66660 steps) train loss 0.245806 +[2025-10-24 22:26:26,706][main][INFO] - [Epoch 9] All losses: [[diffusion=0.0604599 ; kl=3695.63 ; lpips=0.128417 ; repa=0.469767]] +[2025-10-24 22:29:54,018][main][INFO] - [Epoch 10] Test metrics: [[MSE=21.54 | MAE=0.1112 | LPIPS=0.1129 | PSNR=16.67 | SSIM=0.6308 | dreamsim=0.1598 | FID=6.658]] +[2025-10-24 22:29:54,020][main][INFO] - [Epoch 10] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1129 | max_PSNR=18.49 | max_SSIM=0.6308 | min_dreamsim=0.1598 | min_FID=6.658]] +[2025-10-24 22:29:54,021][main][DEBUG] - Writing images to disk... +[2025-10-24 22:29:55,134][main][DEBUG] - Image(s) saved on disk +[2025-10-24 22:29:55,337][main][INFO] - End of epoch timers: [T_train=10:26:03 | T_epoch=01:02:35 | T_eval=00:34:44 | T_total=11:01:59] +[2025-10-24 22:29:55,338][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 22:29:58,129][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 22:30:00,883][main][INFO] - --- + + +[2025-10-24 22:30:00,884][main][INFO] - [T_total=11:02:05 | T_train=10:26:03] Start epoch 10 +[2025-10-24 23:32:38,551][main][INFO] - [T_total=12:04:42 | T_train=11:28:40 | T_epoch=01:02:37] End of epoch 10 (73326 steps) train loss 0.243893 +[2025-10-24 23:32:38,553][main][INFO] - [Epoch 10] All losses: [[diffusion=0.0602009 ; kl=3698.58 ; lpips=0.126997 ; repa=0.465981]] +[2025-10-24 23:36:06,224][main][INFO] - [Epoch 11] Test metrics: [[MSE=21.29 | MAE=0.1104 | LPIPS=0.1112 | PSNR=16.72 | SSIM=0.6335 | dreamsim=0.1568 | FID=6.331]] +[2025-10-24 23:36:06,230][main][INFO] - [Epoch 11] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1112 | max_PSNR=18.49 | max_SSIM=0.6335 | min_dreamsim=0.1568 | min_FID=6.331]] +[2025-10-24 23:36:06,231][main][DEBUG] - Writing images to disk... +[2025-10-24 23:36:07,086][main][DEBUG] - Image(s) saved on disk +[2025-10-24 23:36:07,296][main][INFO] - End of epoch timers: [T_train=11:28:40 | T_epoch=01:02:37 | T_eval=00:38:13 | T_total=12:08:11] +[2025-10-24 23:36:07,298][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-24 23:36:10,288][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-24 23:36:12,899][main][INFO] - --- + + +[2025-10-24 23:36:12,900][main][INFO] - [T_total=12:08:17 | T_train=11:28:40] Start epoch 11 +[2025-10-25 00:38:51,954][main][INFO] - [T_total=13:10:56 | T_train=12:31:19 | T_epoch=01:02:39] End of epoch 11 (79992 steps) train loss 0.242062 +[2025-10-25 00:38:51,955][main][INFO] - [Epoch 11] All losses: [[diffusion=0.0598045 ; kl=3702.03 ; lpips=0.125852 ; repa=0.46252]] +[2025-10-25 00:42:19,563][main][INFO] - [Epoch 12] Test metrics: [[MSE=21.05 | MAE=0.1097 | LPIPS=0.1098 | PSNR=16.77 | SSIM=0.6344 | dreamsim=0.1546 | FID=6.035]] +[2025-10-25 00:42:19,565][main][INFO] - [Epoch 12] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1098 | max_PSNR=18.49 | max_SSIM=0.6344 | min_dreamsim=0.1546 | min_FID=6.035]] +[2025-10-25 00:42:19,566][main][DEBUG] - Writing images to disk... +[2025-10-25 00:42:20,661][main][DEBUG] - Image(s) saved on disk +[2025-10-25 00:42:20,894][main][INFO] - End of epoch timers: [T_train=12:31:19 | T_epoch=01:02:39 | T_eval=00:41:42 | T_total=13:14:25] +[2025-10-25 00:42:20,895][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-25 00:42:23,582][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-25 00:42:26,171][main][INFO] - --- + + +[2025-10-25 00:42:26,172][main][INFO] - [T_total=13:14:30 | T_train=12:31:19] Start epoch 12 +[2025-10-25 01:45:03,014][main][INFO] - [T_total=14:17:07 | T_train=13:33:56 | T_epoch=01:02:36] End of epoch 12 (86658 steps) train loss 0.240598 +[2025-10-25 01:45:03,015][main][INFO] - [Epoch 12] All losses: [[diffusion=0.0596262 ; kl=3704.82 ; lpips=0.124782 ; repa=0.459504]] +[2025-10-25 01:48:30,676][main][INFO] - [Epoch 13] Test metrics: [[MSE=21.07 | MAE=0.1098 | LPIPS=0.1087 | PSNR=16.76 | SSIM=0.6359 | dreamsim=0.1527 | FID=5.793]] +[2025-10-25 01:48:30,678][main][INFO] - [Epoch 13] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1087 | max_PSNR=18.49 | max_SSIM=0.6359 | min_dreamsim=0.1527 | min_FID=5.793]] +[2025-10-25 01:48:30,679][main][DEBUG] - Writing images to disk... +[2025-10-25 01:48:31,757][main][DEBUG] - Image(s) saved on disk +[2025-10-25 01:48:31,960][main][INFO] - End of epoch timers: [T_train=13:33:56 | T_epoch=01:02:36 | T_eval=00:45:11 | T_total=14:20:36] +[2025-10-25 01:48:31,961][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-25 01:48:34,485][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-25 01:48:37,247][main][INFO] - --- + + +[2025-10-25 01:48:37,248][main][INFO] - [T_total=14:20:41 | T_train=13:33:56] Start epoch 13 +[2025-10-25 02:51:12,948][main][INFO] - [T_total=15:23:17 | T_train=14:36:32 | T_epoch=01:02:35] End of epoch 13 (93324 steps) train loss 0.239412 +[2025-10-25 02:51:12,949][main][INFO] - [Epoch 13] All losses: [[diffusion=0.0596692 ; kl=3706.94 ; lpips=0.123694 ; repa=0.456758]] +[2025-10-25 02:54:40,598][main][INFO] - [Epoch 14] Test metrics: [[MSE=20.86 | MAE=0.1092 | LPIPS=0.1076 | PSNR=16.81 | SSIM=0.6381 | dreamsim=0.1507 | FID=5.573]] +[2025-10-25 02:54:40,600][main][INFO] - [Epoch 14] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1076 | max_PSNR=18.49 | max_SSIM=0.6381 | min_dreamsim=0.1507 | min_FID=5.573]] +[2025-10-25 02:54:40,605][main][DEBUG] - Writing images to disk... +[2025-10-25 02:54:41,487][main][DEBUG] - Image(s) saved on disk +[2025-10-25 02:54:41,728][main][INFO] - End of epoch timers: [T_train=14:36:32 | T_epoch=01:02:35 | T_eval=00:48:39 | T_total=15:26:46] +[2025-10-25 02:54:41,731][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-25 02:54:45,053][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-25 02:54:47,715][main][INFO] - --- + + +[2025-10-25 02:54:47,717][main][INFO] - [T_total=15:26:52 | T_train=14:36:32] Start epoch 14 +[2025-10-25 03:57:24,404][main][INFO] - [T_total=16:29:28 | T_train=15:39:09 | T_epoch=01:02:36] End of epoch 14 (99990 steps) train loss 0.238048 +[2025-10-25 03:57:24,406][main][INFO] - [Epoch 14] All losses: [[diffusion=0.0592936 ; kl=3709.87 ; lpips=0.122931 ; repa=0.454315]] +[2025-10-25 04:00:51,619][main][INFO] - [Epoch 15] Test metrics: [[MSE=20.7 | MAE=0.1087 | LPIPS=0.1065 | PSNR=16.84 | SSIM=0.6397 | dreamsim=0.149 | FID=5.367]] +[2025-10-25 04:00:51,621][main][INFO] - [Epoch 15] Best metrics: [[min_MSE=14.16 | min_MAE=0.0884 | min_LPIPS=0.1065 | max_PSNR=18.49 | max_SSIM=0.6397 | min_dreamsim=0.149 | min_FID=5.367]] +[2025-10-25 04:00:51,622][main][DEBUG] - Writing images to disk... +[2025-10-25 04:00:52,707][main][DEBUG] - Image(s) saved on disk +[2025-10-25 04:00:52,907][main][INFO] - End of epoch timers: [T_train=15:39:09 | T_epoch=01:02:36 | T_eval=00:52:07 | T_total=16:32:57] +[2025-10-25 04:00:52,908][main][INFO] - Storing model checkpoint inside /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/last +[2025-10-25 04:00:55,517][main][INFO] - Best FID so far, storing a copy of the model checkpoint to /workspace/DC_SSDAE/runs/jobs/train_enc_vq_f8c4_FM/checkpoints/best +[2025-10-25 04:00:57,799][main][INFO] - --- + + +[2025-10-25 04:00:57,800][main][INFO] - [T_total=16:33:02 | T_train=15:39:09] Start epoch 15 diff --git a/train_enc_vq_f8c4_FM/tensorboard_logs/events.out.tfevents.1761305275.98629b852e50.48913.0 b/train_enc_vq_f8c4_FM/tensorboard_logs/events.out.tfevents.1761305275.98629b852e50.48913.0 new file mode 100644 index 0000000000000000000000000000000000000000..f3e516995af2b86787626ad86234a68a102a69ad --- /dev/null +++ b/train_enc_vq_f8c4_FM/tensorboard_logs/events.out.tfevents.1761305275.98629b852e50.48913.0 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e17e0d6bc0c664f3290d60f1eca7711d097579efb9a898eebb02d2f80e8f0a0 +size 52073988