amankishore committed on
Commit
0c8e8a3
1 Parent(s): 57595b7

Fix xformers, support v2-1

Files changed (3)
  1. app.py +8 -2
  2. requirements.txt +1 -2
  3. train_dreambooth.py +40 -31
app.py CHANGED
@@ -34,6 +34,8 @@ if(is_gpu_associated):
     model_v1 = snapshot_download(repo_id="multimodalart/sd-fine-tunable")
     model_v2 = snapshot_download(repo_id="stabilityai/stable-diffusion-2")
     model_v2_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-base")
+    model_v2_1 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1")
+    model_v2_1_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1-base")
     safety_checker = snapshot_download(repo_id="multimodalart/sd-sc")
     model_to_load = model_v1

@@ -47,8 +49,12 @@ def swap_base_model(selected_model):
        model_to_load = model_v1
    elif(selected_model == "v2-768"):
        model_to_load = model_v2
-   else:
+   elif(selected_model == "v2-512"):
        model_to_load = model_v2_512
+   elif(selected_model == "v2-1-768"):
+       model_to_load = model_v2_1
+   else:
+       model_to_load = model_v2_1_512

def count_files(*inputs):
    file_counter = 0
@@ -532,7 +538,7 @@ with gr.Blocks(css=css) as demo:

    with gr.Accordion("Custom Settings", open=False):
        with gr.Row() as what_are_you_training:
-           base_model_to_use = gr.Dropdown(label="Which base model would you like to use?", choices=["v1-5", "v2-512", "v2-768"], value="v1-5", interactive=True)
+           base_model_to_use = gr.Dropdown(label="Which base model would you like to use?", choices=["v1-5", "v2-512", "v2-768", "v2-1-512", "v2-1-768"], value="v1-5", interactive=True)

        swap_auto_calculated = gr.Checkbox(label="Use custom settings")
        gr.Markdown("If not checked, the % of frozen encoder will be tuned automatically to whether you are training an `object`, `person` or `style`. The text-encoder is frozen after 10% of the steps for a style, 30% of the steps for an object and 75% trained for persons. The number of steps varies between 1400 and 2400 depending on how many images uploaded. If you see too many artifacts in your output, it means it may have overfit and you need less steps. If your results aren't really what you wanted, it may be underfitting and you need more steps.")
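For reference, after this commit the base-model selection in app.py should behave roughly like the sketch below. This is a reconstruction from the two hunks above, not the verbatim file: the `global model_to_load` declaration, the surrounding `is_gpu_associated` guard, and the exact indentation are assumptions.

```python
# Sketch of app.py's base-model selection after this commit (reconstructed, not verbatim).
from huggingface_hub import snapshot_download

# In the real app these snapshots are only downloaded when a GPU is attached
# (the `if(is_gpu_associated):` guard shown in the first hunk).
model_v1 = snapshot_download(repo_id="multimodalart/sd-fine-tunable")
model_v2 = snapshot_download(repo_id="stabilityai/stable-diffusion-2")
model_v2_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-base")
model_v2_1 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1")
model_v2_1_512 = snapshot_download(repo_id="stabilityai/stable-diffusion-2-1-base")

model_to_load = model_v1

def swap_base_model(selected_model):
    # `selected_model` comes from the Gradio dropdown; its string values must
    # match the dropdown `choices` exactly.
    global model_to_load
    if selected_model == "v1-5":
        model_to_load = model_v1
    elif selected_model == "v2-768":
        model_to_load = model_v2
    elif selected_model == "v2-512":
        model_to_load = model_v2_512
    elif selected_model == "v2-1-768":
        model_to_load = model_v2_1
    else:  # "v2-1-512"
        model_to_load = model_v2_1_512
```

Note that the dropdown `choices` in the third hunk and the strings tested in `swap_base_model` have to stay in sync; "v2-1-512" has no explicit branch and is handled by the final `else`.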
requirements.txt CHANGED
@@ -14,5 +14,4 @@ triton==2.0.0.dev20220701
 bitsandbytes
 python-slugify
 requests
-tensorboard
-https://github.com/apolinario/xformers/releases/download/0.0.2/xformers-0.0.14.dev0-cp38-cp38-linux_x86_64.whl
+tensorboard
train_dreambooth.py CHANGED
@@ -19,6 +19,7 @@ from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from diffusers import AutoencoderKL, DDPMScheduler, StableDiffusionPipeline, UNet2DConditionModel
 from diffusers.optimization import get_scheduler
+from diffusers.utils.import_utils import is_xformers_available
 from huggingface_hub import HfFolder, Repository, whoami
 from PIL import Image
 from torchvision import transforms
@@ -197,15 +198,15 @@ def parse_args():
        default=1,
        help=("Save the model every n global_steps"),
    )
-
-
+
+
    parser.add_argument(
        "--save_starting_step",
        type=int,
        default=1,
        help=("The step from which it starts saving intermediary checkpoints"),
    )
-
+
    parser.add_argument(
        "--stop_text_encoder_training",
        type=int,
@@ -218,39 +219,39 @@
        "--image_captions_filename",
        action="store_true",
        help="Get captions from filename",
-   )
-
-
+   )
+
+
    parser.add_argument(
        "--dump_only_text_encoder",
        action="store_true",
-       default=False,
+       default=False,
        help="Dump only text encoder",
    )

    parser.add_argument(
        "--train_only_unet",
        action="store_true",
-       default=False,
+       default=False,
        help="Train only the unet",
    )
-
+
    parser.add_argument(
        "--cache_latents",
        action="store_true",
-       default=False,
+       default=False,
        help="Train only the unet",
    )
-
+
    parser.add_argument(
        "--Session_dir",
        type=str,
-       default="",
+       default="",
        help="Current session directory",
-   )
+   )
+
+

-
-

    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")

@@ -304,7 +305,7 @@ class DreamBoothDataset(Dataset):

        if args.image_captions_filename:
            self.image_captions_filename = True
-
+
        if class_data_root is not None:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
@@ -334,9 +335,9 @@
        instance_image = Image.open(path)
        if not instance_image.mode == "RGB":
            instance_image = instance_image.convert("RGB")
-
+
        instance_prompt = self.instance_prompt
-
+
        if self.image_captions_filename:
            filename = Path(path).stem
            pt=''.join([i for i in filename if not i.isdigit()])
@@ -488,7 +489,7 @@ def run_training(args_imported):
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
-               with torch.autocast("cuda"):
+               with torch.autocast("cuda"):
                    images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
@@ -533,6 +534,14 @@ def run_training(args_imported):
    text_encoder = CLIPTextModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="text_encoder")
    vae = AutoencoderKL.from_pretrained(args.pretrained_model_name_or_path, subfolder="vae")
    unet = UNet2DConditionModel.from_pretrained(args.pretrained_model_name_or_path, subfolder="unet")
+   if is_xformers_available():
+       try:
+           print("Enabling memory efficient attention with xformers...")
+           unet.enable_xformers_memory_efficient_attention()
+       except Exception as e:
+           logger.warning(
+               f"Could not enable memory efficient attention. Make sure xformers is installed correctly and a GPU is available: {e}"
+           )

    vae.requires_grad_(False)
    if not args.train_text_encoder:
@@ -735,7 +744,7 @@ def run_training(args_imported):

                # Predict the noise residual
                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample
-
+
                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
                    target = noise
@@ -743,7 +752,7 @@
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")
-
+
                if args.with_prior_preservation:
                    # Chunk the noise and model_pred into two parts and compute the loss on each part separately.
                    model_pred, model_pred_prior = torch.chunk(model_pred, 2, dim=0)
@@ -780,7 +789,7 @@ def run_training(args_imported):
                fll=round((global_step*100)/args.max_train_steps)
                fll=round(fll/4)
                pr=bar(fll)
-
+
                logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
                progress_bar.set_postfix(**logs)
                progress_bar.set_description_str("Progress:"+pr)
@@ -791,7 +800,7 @@

            if args.train_text_encoder and global_step == args.stop_text_encoder_training and global_step >= 30:
                if accelerator.is_main_process:
-                   print(" " +" Freezing the text_encoder ..."+" ")
+                   print(" " +" Freezing the text_encoder ..."+" ")
                    frz_dir=args.output_dir + "/text_encoder_frozen"
                    if os.path.exists(frz_dir):
                        subprocess.call('rm -r '+ frz_dir, shell=True)
@@ -802,13 +811,13 @@ def run_training(args_imported):
                        text_encoder=accelerator.unwrap_model(text_encoder),
                    )
                    pipeline.text_encoder.save_pretrained(frz_dir)
-
+
            if args.save_n_steps >= 200:
                if global_step < args.max_train_steps and global_step+1==i:
                    ckpt_name = "_step_" + str(global_step+1)
                    save_dir = Path(args.output_dir+ckpt_name)
                    save_dir=str(save_dir)
-                   save_dir=save_dir.replace(" ", "_")
+                   save_dir=save_dir.replace(" ", "_")
                    if not os.path.exists(save_dir):
                        os.mkdir(save_dir)
                    inst=save_dir[16:]
@@ -822,15 +831,15 @@ def run_training(args_imported):
                        text_encoder=accelerator.unwrap_model(text_encoder),
                    )
                    pipeline.save_pretrained(save_dir)
-                   frz_dir=args.output_dir + "/text_encoder_frozen"
+                   frz_dir=args.output_dir + "/text_encoder_frozen"
                    if args.train_text_encoder and os.path.exists(frz_dir):
                        subprocess.call('rm -r '+save_dir+'/text_encoder/*.*', shell=True)
-                       subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True)
+                       subprocess.call('cp -f '+frz_dir +'/*.* '+ save_dir+'/text_encoder', shell=True)
                    chkpth=args.Session_dir+"/"+inst+".ckpt"
                    subprocess.call('python /content/diffusers/scripts/convert_diffusers_to_original_stable_diffusion.py --model_path ' + save_dir + ' --checkpoint_path ' + chkpth + ' --half', shell=True)
                    subprocess.call('rm -r '+ save_dir, shell=True)
                    i=i+args.save_n_steps
-
+
        accelerator.wait_for_everyone()

    # Create the pipeline using using the trained modules and save it.
@@ -844,7 +853,7 @@ def run_training(args_imported):
                unet=accelerator.unwrap_model(unet),
                text_encoder=accelerator.unwrap_model(text_encoder),
            )
-           pipeline.text_encoder.save_pretrained(txt_dir)
+           pipeline.text_encoder.save_pretrained(txt_dir)

        elif args.train_only_unet:
            pipeline = StableDiffusionPipeline.from_pretrained(
@@ -855,7 +864,7 @@ def run_training(args_imported):
            pipeline.save_pretrained(args.output_dir)
            txt_dir=args.output_dir + "/text_encoder_trained"
            subprocess.call('rm -r '+txt_dir, shell=True)
-
+
        else:
            pipeline = StableDiffusionPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
@@ -866,7 +875,7 @@ def run_training(args_imported):
            pipeline.save_pretrained(args.output_dir)
            if args.train_text_encoder and os.path.exists(frz_dir):
                subprocess.call('mv -f '+frz_dir +'/*.* '+ args.output_dir+'/text_encoder', shell=True)
-               subprocess.call('rm -r '+ frz_dir, shell=True)
+               subprocess.call('rm -r '+ frz_dir, shell=True)

    if args.push_to_hub:
        repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)
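The xformers handling introduced above is the substance of the "Fix xformers" part of this commit: instead of hard-pinning a prebuilt xformers wheel (removed from requirements.txt), the script asks diffusers whether xformers is importable and falls back to standard attention if enabling it fails. Below is a minimal, self-contained sketch of that pattern, assuming a diffusers version that provides `is_xformers_available()` and `enable_xformers_memory_efficient_attention()`; the `load_unet` helper is illustrative and not part of the script.

```python
# Minimal sketch of the runtime xformers guard used in train_dreambooth.py above.
import logging

from diffusers import UNet2DConditionModel
from diffusers.utils.import_utils import is_xformers_available

logger = logging.getLogger(__name__)

def load_unet(pretrained_model_name_or_path: str) -> UNet2DConditionModel:
    """Load a UNet and opportunistically enable memory-efficient attention."""
    unet = UNet2DConditionModel.from_pretrained(
        pretrained_model_name_or_path, subfolder="unet"
    )
    if is_xformers_available():
        try:
            # Reduces attention memory use during training and inference.
            unet.enable_xformers_memory_efficient_attention()
        except Exception as exc:
            # xformers may be installed but unusable (e.g. no compatible GPU/build);
            # warn and continue with standard attention rather than aborting.
            logger.warning("Could not enable xformers memory-efficient attention: %s", exc)
    return unet
```

Because the call is wrapped in `try/except`, a missing or broken xformers build only costs the memory savings; training itself still runs, which is consistent with dropping the pinned wheel from requirements.txt.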