First model version

Browse files

Files changed (15) hide show

config.json +188 -0
feature_extractor/preprocessor_config.json +28 -0
model_index.json +34 -0
scheduler/scheduler_config.json +15 -0
text_encoder/config.json +25 -0
text_encoder/model.safetensors +3 -0
tokenizer/merges.txt +0 -0
tokenizer/special_tokens_map.json +24 -0
tokenizer/tokenizer_config.json +38 -0
tokenizer/vocab.json +0 -0
unet/config.json +93 -0
unet/diffusion_pytorch_model.safetensors +3 -0
unet/pytorch_lora_weights.safetensors +3 -0
vae/config.json +32 -0
vae/diffusion_pytorch_model.safetensors +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,188 @@

+{
+    "dataset_args": {
+        "remove_empty_masks": false,
+        "load_point_clouds": false,
+        "load_depths": false,
+        "load_depth_masks": false,
+        "load_masks": false,
+        "box_crop": false,
+        "image_width": null,
+        "image_height": null,
+        "pick_sequence": [],
+        "exclude_sequence": [],
+        "n_frames_per_sequence": -1
+    },
+    "batch": {
+        "n_parallel_images": 3,
+        "image_width": 256,
+        "image_height": 256,
+        "other_selection": "mix",
+        "other_selection_frame_indices": [],
+        "sequence_offset": 1,
+        "crop": "random",
+        "mask_foreground": false,
+        "prompt": "Editorial Style Photo, ${category}, 4k --ar 16:9",
+        "use_blip_prompt": true,
+        "load_recentered": true,
+        "replace_pose_with_spherical_start_phi": -400.0,
+        "replace_pose_with_spherical_end_phi": 360.0,
+        "replace_pose_with_spherical_phi_endpoint": false,
+        "replace_pose_with_spherical_radius": 4.0,
+        "replace_pose_with_spherical_theta": 45.0
+    },
+    "co3d_root": "/mnt/data/shoji_noguchi/venv/viewdiff/ViewDiff/co3d/dataset",
+    "category": "teddybear",
+    "subset": null,
+    "split": null,
+    "max_sequences": 50,
+    "seed": 42,
+    "training": {
+        "validation_epochs": 1,
+        "train_batch_size": 1,
+        "num_train_epochs": 1000,
+        "max_train_steps": null,
+        "dataloader_num_workers": 0,
+        "local_rank": -1,
+        "mixed_precision": "no",
+        "noise_prediction_type": "epsilon",
+        "remove_cfa_skip_connections_at_iter": -1,
+        "changed_cfa_last_layer": "no_residual_connection",
+        "dreambooth_prior_preservation_loss_weight": 0.1,
+        "dreambooth_prior_preservation_every_nth": 1,
+        "prob_images_not_noisy": 0.25,
+        "max_num_images_not_noisy": 2
+    },
+    "optimizer": {
+        "learning_rate": 5e-05,
+        "vol_rend_learning_rate": 0.001,
+        "vol_rend_adam_weight_decay": 0.0,
+        "scale_lr": false,
+        "lr_scheduler": "constant",
+        "lr_warmup_steps": 500,
+        "use_8bit_adam": false,
+        "allow_tf32": false,
+        "adam_beta1": 0.9,
+        "adam_beta2": 0.999,
+        "adam_weight_decay": 0.01,
+        "adam_epsilon": 1e-08,
+        "max_grad_norm": 0.005,
+        "gradient_accumulation_steps": 1,
+        "only_train_new_layers": false
+    },
+    "model": {
+        "n_input_images": 3,
+        "pose_cond_mode": "sa-ca",
+        "pose_cond_coord_space": "absolute",
+        "pose_cond_lora_rank": 64,
+        "pose_cond_dim": 10,
+        "conditioning_dropout_prob": 0.1,
+        "use_ema": false,
+        "enable_xformers_memory_efficient_attention": false,
+        "gradient_checkpointing": false
+    },
+    "cross_frame_attention": {
+        "mode": "pretrained",
+        "n_cfa_down_blocks": 0,
+        "n_cfa_up_blocks": 0,
+        "no_cfa_in_mid_block": false,
+        "to_k_other_frames": 2,
+        "with_self_attention": true,
+        "random_others": true,
+        "last_layer_mode": "no_residual_connection",
+        "unproj_reproj_mode": "only_unproj_reproj",
+        "num_3d_layers": 1,
+        "dim_3d_latent": 16,
+        "dim_3d_grid": 64,
+        "vol_rend_proj_in_mode": "multiple",
+        "vol_rend_proj_out_mode": "multiple",
+        "vol_rend_aggregator_mode": "ibrnet",
+        "vol_rend_model_background": true,
+        "vol_rend_background_grid_percentage": 0.5,
+        "vol_rend_disparity_at_inf": 0.5,
+        "n_novel_images": 1,
+        "use_temb_cond": true
+    },
+    "io": {
+        "save": {
+            "image_grids": false,
+            "pred_files": true,
+            "pred_video": true,
+            "pred_gif": false,
+            "denoise_files": false,
+            "denoise_video": false,
+            "cams": true,
+            "prompts": true,
+            "rendered_depth": false,
+            "cond_files": false,
+            "image_metrics": true
+        },
+        "pretrained_model_name_or_path": "stabilityai/stable-diffusion-2-1-base",
+        "revision": null,
+        "output_dir": "outputs/train/teddybear/50_sequences/subset_all/input_3/train/train_teddybear",
+        "experiment_name": "train_teddybear",
+        "logging_dir": "logs",
+        "log_images_every_nth": 500,
+        "report_to": "custom_tensorboard",
+        "checkpointing_steps": 500,
+        "checkpoints_total_limit": 2,
+        "resume_from_checkpoint": null,
+        "automatic_checkpoint_resume": false
+    },
+    "selected_sequences": {
+        "train": [
+            "391_46974_93640",
+            "373_41560_82975",
+            "46_2596_7562",
+            "393_47968_95188",
+            "534_77570_150656",
+            "444_63772_125902",
+            "38_1675_5008",
+            "457_64379_126856",
+            "42_2104_6430",
+            "387_46466_92678",
+            "392_47802_94952",
+            "353_37579_70740",
+            "380_44886_89716",
+            "598_91959_182995",
+            "392_47443_94381",
+            "38_1686_5058",
+            "540_78977_153006",
+            "46_2595_7548",
+            "392_47834_94937",
+            "612_97540_195796",
+            "387_46699_92990",
+            "394_48750_96460",
+            "341_35542_65393",
+            "391_46884_93389",
+            "246_26304_51384",
+            "394_48525_95999",
+            "482_68882_133595",
+            "392_47808_94891",
+            "353_37349_70281",
+            "387_46523_92748",
+            "616_99318_199096",
+            "394_48514_95986",
+            "387_46530_92758",
+            "575_84513_166963",
+            "472_66680_130870",
+            "537_78247_152118",
+            "378_44012_87822",
+            "392_47776_94858",
+            "377_43705_87007",
+            "392_47498_94471",
+            "612_97904_197160",
+            "604_93996_187948",
+            "392_47757_94835",
+            "620_101667_204118",
+            "504_72676_141358",
+            "620_101591_203905",
+            "475_67237_131972",
+            "444_63708_125709",
+            "377_43631_86945",
+            "579_85828_169718"
+        ],
+        "val": [
+            "581_86292_171292"
+        ]
+    }
+}

feature_extractor/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,28 @@

+{
+  "crop_size": {
+    "height": 224,
+    "width": 224
+  },
+  "do_center_crop": true,
+  "do_convert_rgb": true,
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_processor_type": "CLIPImageProcessor",
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "shortest_edge": 224
+  }
+}

model_index.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "_class_name": "CustomStableDiffusionPipeline",
+  "_diffusers_version": "0.21.2",
+  "_name_or_path": "stabilityai/stable-diffusion-2-1-base",
+  "feature_extractor": [
+    "transformers",
+    "CLIPImageProcessor"
+  ],
+  "requires_safety_checker": false,
+  "safety_checker": [
+    null,
+    null
+  ],
+  "scheduler": [
+    "diffusers",
+    "PNDMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "viewdiff.model.custom_unet_2d_condition",
+    "UNet2DConditionCrossFrameInExistingAttnModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ]
+}

scheduler/scheduler_config.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "_class_name": "PNDMScheduler",
+  "_diffusers_version": "0.21.2",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "prediction_type": "epsilon",
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "timestep_spacing": "leading",
+  "trained_betas": null
+}

text_encoder/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "_name_or_path": "stabilityai/stable-diffusion-2-1-base",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "gelu",
+  "hidden_size": 1024,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 23,
+  "pad_token_id": 1,
+  "projection_dim": 512,
+  "torch_dtype": "float32",
+  "transformers_version": "4.36.0",
+  "vocab_size": 49408
+}

text_encoder/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:67e013543d4fac905c882e2993d86a2d454ee69dc9e8f37c0c23d33a48959d15
+size 1361596304

tokenizer/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "!",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,38 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "!",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49406": {
+      "content": "<|startoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "49407": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<|startoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "do_lower_case": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "model_max_length": 77,
+  "pad_token": "!",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": "<|endoftext|>"
+}

tokenizer/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

unet/config.json ADDED Viewed

	@@ -0,0 +1,93 @@

+{
+  "_class_name": "UNet2DConditionCrossFrameInExistingAttnModel",
+  "_diffusers_version": "0.21.2",
+  "_name_or_path": "outputs/train/teddybear/50_sequences/subset_all/input_3/train/train_teddybear/checkpoint-1061000/unet",
+  "act_fn": "silu",
+  "addition_embed_type": null,
+  "addition_embed_type_num_heads": 64,
+  "addition_time_embed_dim": null,
+  "attention_head_dim": [
+    5,
+    10,
+    20,
+    20
+  ],
+  "attention_type": "default",
+  "block_out_channels": [
+    320,
+    640,
+    1280,
+    1280
+  ],
+  "center_input_sample": false,
+  "class_embed_type": null,
+  "class_embeddings_concat": false,
+  "conv_in_kernel": 3,
+  "conv_out_kernel": 3,
+  "cross_attention_dim": 1024,
+  "cross_attention_norm": null,
+  "dim_3d_grid": 64,
+  "dim_3d_latent": 16,
+  "down_block_types": [
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "CrossAttnDownBlock2D",
+    "DownBlock2D"
+  ],
+  "downsample_padding": 1,
+  "dropout": 0.0,
+  "dual_cross_attention": false,
+  "encoder_hid_dim": null,
+  "encoder_hid_dim_type": null,
+  "flip_sin_to_cos": true,
+  "freq_shift": 0,
+  "in_channels": 4,
+  "last_layer_mode": "no_residual_connection",
+  "layers_per_block": 2,
+  "mid_block_only_cross_attention": null,
+  "mid_block_scale_factor": 1,
+  "mid_block_type": "UNetMidBlock2DCrossFrameInExistingAttn",
+  "n_input_images": 3,
+  "n_novel_images": 1,
+  "network_alpha": null,
+  "norm_eps": 1e-05,
+  "norm_num_groups": 32,
+  "num_3d_layers": 1,
+  "num_attention_heads": null,
+  "num_class_embeds": null,
+  "only_cross_attention": false,
+  "out_channels": 4,
+  "pose_cond_dim": 10,
+  "projection_class_embeddings_input_dim": null,
+  "random_others": true,
+  "rank": 64,
+  "resnet_out_scale_factor": 1.0,
+  "resnet_skip_time_act": false,
+  "resnet_time_scale_shift": "default",
+  "sample_size": 64,
+  "temb_out_size": 8,
+  "time_cond_proj_dim": null,
+  "time_embedding_act_fn": null,
+  "time_embedding_dim": null,
+  "time_embedding_type": "positional",
+  "timestep_post_act": null,
+  "to_k_other_frames": 2,
+  "transformer_layers_per_block": 1,
+  "unproj_reproj_mode": "only_unproj_reproj",
+  "up_block_types": [
+    "UpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D",
+    "CrossAttnUpBlock2D"
+  ],
+  "upcast_attention": false,
+  "use_linear_projection": true,
+  "use_lora_in_cfa": false,
+  "use_temb_in_lora": true,
+  "vol_rend_aggregator_mode": "ibrnet",
+  "vol_rend_background_grid_percentage": 0.5,
+  "vol_rend_disparity_at_inf": 0.5,
+  "vol_rend_model_background": true,
+  "vol_rend_proj_in_mode": "multiple",
+  "vol_rend_proj_out_mode": "multiple"
+}

unet/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:105a721d2f67f4e4f053dd51358cca7dc4aaec91c633ee835b971fb0110d8296
+size 3646552956

unet/pytorch_lora_weights.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7d198e42acfc4e1b18e816d8f180e21926425ab3adcbec339bf890668738af22
+size 106309656

vae/config.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "_class_name": "AutoencoderKL",
+  "_diffusers_version": "0.21.2",
+  "_name_or_path": "stabilityai/stable-diffusion-2-1-base",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    512,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "force_upcast": true,
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "out_channels": 3,
+  "sample_size": 768,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ]
+}

vae/diffusion_pytorch_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2aa1f43011b553a4cba7f37456465cdbd48aab7b54b9348b890e8058ea7683ec
+size 334643268