amankishore committed on
Commit
0c8e8a3
1 Parent(s): 57595b7

Fix xformers, support v2-1

Files changed (3)
  1. app.py +8 -2
  2. requirements.txt +1 -2
  3. train_dreambooth.py +40 -31
app.py CHANGED
@@ -34,6 +34,8 @@ if(is_gpu_associated):
     model_v1 = snapshot_download(repo_id="multimodalart/sd-fine-tunable")
     model_v2 = snapshot_download(repo_id="stabilityai/stable-diffusion-2")
     model_v2_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-base")
+    model_v2_1 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1")
+    model_v2_1_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1-base")
     safety_checker = snapshot_download(repo_id="multimodalart/sd-sc")
     model_to_load = model_v1

@@ -47,8 +49,12 @@ def swap_base_model(selected_model):
        model_to_load = model_v1
    elif(selected_model == "v2-768"):
        model_to_load = model_v2
-   else:
+   elif(selected_model == "v2-512"):
        model_to_load = model_v2_512
+   elif(selected_model == "v2-1-768"):
+       model_to_load = model_v2_1
+   else:
+       model_to_load = model_v2_1_512

def count_files(*inputs):
    file_counter = 0
@@ -532,7 +538,7 @@ with gr.Blocks(css=css) as demo:

    with gr.Accordion("Custom Settings", open=False):
        with gr.Row() as what_are_you_training:
-           base_model_to_use = gr.Dropdown(label="Which base model would you like to use?", choices=["v1-5", "v2-512", "v2-768"], value="v1-5", interactive=True)
+           base_model_to_use = gr.Dropdown(label="Which base model would you like to use?", choices=["v1-5", "v2-512", "v2-768", "v2-1-512", "v2-1-768"], value="v1-5", interactive=True)

        swap_auto_calculated = gr.Checkbox(label="Use custom settings")
        gr.Markdown("If not checked, the % of frozen encoder will be tuned automatically to whether you are training an `object`, `person` or `style`. The text-encoder is frozen after 10% of the steps for a style, 30% of the steps for an object and 75% trained for persons. The number of steps varies between 1400 and 2400 depending on how many images uploaded. If you see too many artifacts in your output, it means it may have overfit and you need less steps. If your results aren't really what you wanted, it may be underfitting and you need more steps.")
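For reference, after this commit the base-model selection in app.py should behave roughly like the sketch below. This is a reconstruction from the two hunks above, not the verbatim file: the `global model_to_load` declaration, the surrounding `is_gpu_associated` guard, and the exact indentation are assumptions.

```python
# Sketch of app.py's base-model selection after this commit (reconstructed, not verbatim).
from huggingface_hub import snapshot_download

# In the real app these snapshots are only downloaded when a GPU is attached
# (the `if(is_gpu_associated):` guard shown in the first hunk).
model_v1 = snapshot_download(repo_id="multimodalart/sd-fine-tunable")
model_v2 = snapshot_download(repo_id="stabilityai/stable-diffusion-2")
model_v2_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-base")
model_v2_1 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1")
model_v2_1_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1-base")

model_to_load = model_v1

def swap_base_model(selected_model):
    # `selected_model` comes from the Gradio dropdown; its string values must
    # match the dropdown `choices` exactly.
    global model_to_load
    if selected_model == "v1-5":
        model_to_load = model_v1
    elif selected_model == "v2-768":
        model_to_load = model_v2
    elif selected_model == "v2-512":
        model_to_load = model_v2_512
    elif selected_model == "v2-1-768":
        model_to_load = model_v2_1
    else:  # "v2-1-512"
        model_to_load = model_v2_1_512
```

Note that the dropdown `choices` in the third hunk and the strings tested in `swap_base_model` have to stay in sync; "v2-1-512" has no explicit branch and is handled by the final `else`.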
requirements.txt CHANGED
@@ -14,5 +14,4 @@ triton==2.0.0.dev20220701
 bitsandbytes
 python-slugify
 requests
-tensorboard
-https://github.com/apolinario/xformers/releases/download/0.0.2/xformers-0.0.14.dev0-cp38-cp38-linux_x86_64.whl
+tensorboard
train_dreambooth.py CHANGED
@@ -19,6 +19,7 @@ from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
+from diffusers.utils.import_utils import is_xformers_available
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
@@ -197,15 +198,15 @@ def parse_args():
        default=1,
        help=("Save the model every n global_steps"),
    )
-
-
+
+
    parser.add_argument(
        "--save_starting_step",
        type=int,
        default=1,
        help=("The step from which it starts saving intermediary checkpoints"),
    )
-
+
    parser.add_argument(
        "--stop_text_encoder_training",
        type=int,
@@ -218,39 +219,39 @@
        "--image_captions_filename",
        action="store_true",
        help="Get captions from filename",
-   )
-
-
+   )
+
+
    parser.add_argument(
        "--dump_only_text_encoder",
        action="store_true",
-       default=False,
+       default=False,
        help="Dump only text encoder",
    )

    parser.add_argument(
        "--train_only_unet",
        action="store_true",
-       default=False,
+       default=False,
        help="Train only the unet",
    )
-
+
    parser.add_argument(
        "--cache_latents",
        action="store_true",
-       default=False,
+       default=False,
        help="Train only the unet",
    )
-
+
    parser.add_argument(
        "--Session_dir",
        type=str,
-       default="",
+       default="",
        help="Current session directory",
-   )
+   )
+
+

-
-

    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

@@ -304,7 +305,7 @@ class DreamBoothDataset(Dataset):

        if args.image_captions_filename:
            self.image_captions_filename = True
-
+
        if class_data_root is not None:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
@@ -334,9 +335,9 @@
        instance_image = Image.open(path)
        if not instance_image.mode == "RGB":
            instance_image = instance_image.convert("RGB")
-
+
        instance_prompt = self.instance_prompt
-
+
        if self.image_captions_filename:
            filename = Path(path).stem
            pt=''.join([i for i in filename if not i.isdigit()])
@@ -488,7 +489,7 @@ def run_training(args_imported):
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
-               with torch.autocast("cuda"):
+               with torch.autocast("cuda"):
                    images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
@@ -533,6 +534,14 @@ def run_training(args_imported):
    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+   if is_xformers_available():
+       try:
+           print("Enabling memory efficient attention with xformers...")
+           unet.enable_xformers_memory_efficient_attention()
+       except Exception as e:
+           logger.warning(
+               f"Could not enable memory efficient attention. Make sure xformers is installed correctly and a GPU is available: {e}"
+           )

    vae.requires_grad_(False)
    if not args.train_text_encoder:
@@ -735,7 +744,7 @@ def run_training(args_imported):

                # Predict the noise residual
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
+
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
                    target = noise
@@ -743,7 +752,7 @@
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
+
                if args.with_prior_preservation:
                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
@@ -780,7 +789,7 @@ def run_training(args_imported):
                fll=round((global_step*100)/args.max_train_steps)
                fll=round(fll/4)
                pr=bar(fll)
-
+
                logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
                progress_bar.set_postfix(**logs)
                progress_bar.set_description_str("Progress:"+pr)
@@ -791,7 +800,7 @@

            if args.train_text_encoder and global_step == args.stop_text_encoder_training and global_step >= 30:
                if accelerator.is_main_process:
-                   print(" " +" Freezing the text_encoder ..."+" ")
+                   print(" " +" Freezing the text_encoder ..."+" ")
                    frz_dir=args.output_dir + "/text_encoder_frozen"
                    if os.path.exists(frz_dir):
                        subprocess.call('rm -r '+ frz_dir, shell=True)
@@ -802,13 +811,13 @@ def run_training(args_imported):
                        text_encoder=accelerator.unwrap_model(text_encoder),
                    )
                    pipeline.text_encoder.save_pretrained(frz_dir)
-
+
            if args.save_n_steps >= 200:
                if global_step < args.max_train_steps and global_step+1==i:
                    ckpt_name = "_step_" + str(global_step+1)
                    save_dir = Path(args.output_dir+ckpt_name)
                    save_dir=str(save_dir)
-                   save_dir=save_dir.replace(" ", "_")
+                   save_dir=save_dir.replace(" ", "_")
                    if not os.path.exists(save_dir):
                        os.mkdir(save_dir)
                    inst=save_dir[16:]
@@ -822,15 +831,15 @@ def run_training(args_imported):
                        text_encoder=accelerator.unwrap_model(text_encoder),
                    )
                    pipeline.save_pretrained(save_dir)
-                   frz_dir=args.output_dir + "/text_encoder_frozen"
+                   frz_dir=args.output_dir + "/text_encoder_frozen"
                    if args.train_text_encoder and os.path.exists(frz_dir):
                        subprocess.call('rm -r '+save_dir+'/text_encoder/*.*', shell=True)
-                       subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True)
+                       subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True)
                    chkpth=args.Session_dir+"/"+inst+".ckpt"
                    subprocess.call('python /content/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py --model_path ' + save_dir + ' --checkpoint_path ' + chkpth + ' --half', shell=True)
                    subprocess.call('rm -r '+ save_dir, shell=True)
                    i=i+args.save_n_steps
-
+
        accelerator.wait_for_everyone()

    # Create the pipeline using using the trained modules and save it.
@@ -844,7 +853,7 @@ def run_training(args_imported):
                unet=accelerator.unwrap_model(unet),
                text_encoder=accelerator.unwrap_model(text_encoder),
            )
-           pipeline.text_encoder.save_pretrained(txt_dir)
+           pipeline.text_encoder.save_pretrained(txt_dir)

        elif args.train_only_unet:
            pipeline = StableDiffusionPipeline.from_pretrained(
@@ -855,7 +864,7 @@ def run_training(args_imported):
            pipeline.save_pretrained(args.output_dir)
            txt_dir=args.output_dir + "/text_encoder_trained"
            subprocess.call('rm -r '+txt_dir, shell=True)
-
+
        else:
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
@@ -866,7 +875,7 @@ def run_training(args_imported):
            pipeline.save_pretrained(args.output_dir)
            if args.train_text_encoder and os.path.exists(frz_dir):
                subprocess.call('mv -f '+frz_dir +'/*.* '+ args.output_dir+'/text_encoder', shell=True)
-               subprocess.call('rm -r '+ frz_dir, shell=True)
+               subprocess.call('rm -r '+ frz_dir, shell=True)

    if args.push_to_hub:
        repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
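The xformers handling introduced above is the substance of the "Fix xformers" part of this commit: instead of hard-pinning a prebuilt xformers wheel (removed from requirements.txt), the script asks diffusers whether xformers is importable and falls back to standard attention if enabling it fails. Below is a minimal, self-contained sketch of that pattern, assuming a diffusers version that provides `is_xformers_available()` and `enable_xformers_memory_efficient_attention()`; the `load_unet` helper is illustrative and not part of the script.

```python
# Minimal sketch of the runtime xformers guard used in train_dreambooth.py above.
import logging

from diffusers import UNet2DConditionModel
from diffusers.utils.import_utils import is_xformers_available

logger = logging.getLogger(__name__)

def load_unet(pretrained_model_name_or_path: str) -> UNet2DConditionModel:
    """Load a UNet and opportunistically enable memory-efficient attention."""
    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path, subfolder="unet"
    )
    if is_xformers_available():
        try:
            # Reduces attention memory use during training and inference.
            unet.enable_xformers_memory_efficient_attention()
        except Exception as exc:
            # xformers may be installed but unusable (e.g. no compatible GPU/build);
            # warn and continue with standard attention rather than aborting.
            logger.warning("Could not enable xformers memory-efficient attention: %s", exc)
    return unet
```

Because the call is wrapped in `try/except`, a missing or broken xformers build only costs the memory savings; training itself still runs, which is consistent with dropping the pinned wheel from requirements.txt.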