LanHarmony committed on
Commit 2ce295b
1 Parent(s): b0cbfee

introduce control net from diffusers

Files changed (1):
visual_foundation_models.py  +760 -331
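In short: the hand-rolled ControlNet/cldm DDIM sampling loop below is swapped for diffusers' StableDiffusionControlNetPipeline. A minimal sketch of the pattern every *_new class in this diff follows (checkpoint names are the ones used in the diff; the prompt and file paths are placeholders):

import torch
from PIL import Image
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler

# One ControlNet checkpoint per condition type (hed / scribble / openpose / seg).
controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed")
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None
)
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
pipe.to("cuda")

control = Image.open("hed_boundary.png")  # conditioning image (placeholder path)
result = pipe("a cozy living room, best quality, extremely detailed",  # prompt + the diff's a_prompt suffix
              control,
              num_inference_steps=20,   # replaces the old ddim_steps
              guidance_scale=9.0,       # replaces the old scale
              negative_prompt="longbody, lowres, bad anatomy").images[0]
result.save("output.png")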
visual_foundation_models.py CHANGED
@@ -6,8 +6,10 @@ from diffusers import StableDiffusionInpaintPipeline
 from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
 from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
 from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector
 from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
 from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
 from ldm.util import instantiate_from_config
 from ControlNet.cldm.model import create_model, load_state_dict
 from ControlNet.cldm.ddim_hacked import DDIMSampler
@@ -26,6 +28,46 @@ from pytorch_lightning import seed_everything
 import cv2
 import random

 def HWC3(x):
     assert x.dtype == np.uint8
     if x.ndim == 2:
@@ -355,83 +397,82 @@ class line2image_new:
         return updated_image_path


-class image2line:
-    def __init__(self):
-        print("Direct detect straight line...")
-        self.detector = MLSDdetector()
-        self.value_thresh = 0.1
-        self.dis_thresh = 0.1
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2hough Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        hough = self.detector(resize_image(image, self.resolution), self.value_thresh, self.dis_thresh)
-        updated_image_path = get_new_image_name(inputs, func_name="line-of")
-        hough = 255 - cv2.dilate(hough, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
-        image = Image.fromarray(hough)
-        image.save(updated_image_path)
-        return updated_image_path
-
-
-class line2image:
-    def __init__(self, device):
-        print("Initialize the line2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_mlsd.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
-        self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
-        self.seed = -1
-        self.a_prompt = 'best quality, extremely detailed'
-        self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
-
-    def inference(self, inputs):
-        print("===>Starting line2image Inference")
-        image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
-        image = Image.open(image_path)
-        image = np.array(image)
-        image = 255 - image
-        prompt = instruct_text
-        img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
-        self.seed = random.randint(0, 65535)
-        seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0,255).astype(np.uint8)
-        updated_image_path = get_new_image_name(image_path, func_name="line2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
-        return updated_image_path
-

-class image2hed:
     def __init__(self):
         print("Direct detect soft HED boundary...")
-        self.detector = HEDdetector()
         self.resolution = 512

     def inference(self, inputs):
@@ -439,29 +480,30 @@ class image2hed:
         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
-        hed = self.detector(resize_image(image, self.resolution))
         updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
-        image = Image.fromarray(hed)
-        image.save(updated_image_path)
         return updated_image_path

-
-class hed2image:
     def __init__(self, device):
         print("Initialize the hed2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_hed.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
         self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

@@ -470,35 +512,91 @@ class hed2image:
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        prompt = instruct_text
         img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         updated_image_path = get_new_image_name(image_path, func_name="hed2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
         return updated_image_path

-class image2scribble:
     def __init__(self):
         print("Direct detect scribble.")
-        self.detector = HEDdetector()
         self.resolution = 512

     def inference(self, inputs):
@@ -506,76 +604,136 @@ class image2scribble:
         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
-        detected_map = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
         image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
-        detected_map = nms(detected_map, 127, 3.0)
-        detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
-        detected_map[detected_map > 4] = 255
-        detected_map[detected_map < 255] = 0
-        detected_map = 255 - detected_map
         updated_image_path = get_new_image_name(inputs, func_name="scribble")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
         return updated_image_path

-class scribble2image:
     def __init__(self, device):
-        print("Initialize the scribble2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_scribble.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
         self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

     def inference(self, inputs):
         print("===>Starting scribble2image Inference")
-        print(f'sketch device {self.device}')
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        prompt = instruct_text
         image = 255 - image
         img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
         return updated_image_path

-class image2pose:
     def __init__(self):
-        print("Direct human pose.")
-        self.detector = OpenposeDetector()
         self.resolution = 512

     def inference(self, inputs):
@@ -583,32 +741,30 @@ class image2pose:
         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
-        detected_map, _ = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
         image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
         updated_image_path = get_new_image_name(inputs, func_name="human-pose")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
         return updated_image_path

-class pose2image:
     def __init__(self, device):
-        print("Initialize the pose2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_openpose.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
         self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

@@ -617,68 +773,141 @@ class pose2image:
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        prompt = instruct_text
         img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         updated_image_path = get_new_image_name(image_path, func_name="pose2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
         return updated_image_path

-class image2seg:
-    def __init__(self):
-        print("Direct segmentations.")
-        self.detector = UniformerDetector()
-        self.resolution = 512
-
-    def inference(self, inputs):
-        print("===>Starting image2seg Inference")
-        image = Image.open(inputs)
-        image = np.array(image)
-        image = HWC3(image)
-        detected_map = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
-        image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
         updated_image_path = get_new_image_name(inputs, func_name="segmentation")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
         return updated_image_path

-class seg2image:
     def __init__(self, device):
-        print("Initialize the seg2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_seg.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
         self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

@@ -687,68 +916,130 @@ class seg2image:
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        prompt = instruct_text
         img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         updated_image_path = get_new_image_name(image_path, func_name="segment2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
         return updated_image_path

-class image2depth:
     def __init__(self):
-        print("Direct depth estimation.")
-        self.detector = MidasDetector()
         self.resolution = 512

     def inference(self, inputs):
-        print("===>Starting image2depth Inference")
         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
-        detected_map, _ = self.detector(resize_image(image, self.resolution))
-        detected_map = HWC3(detected_map)
         image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
         updated_image_path = get_new_image_name(inputs, func_name="depth")
-        image = Image.fromarray(detected_map)
-        image.save(updated_image_path)
         return updated_image_path

-class depth2image:
     def __init__(self, device):
-        print("Initialize depth2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_depth.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
         self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

@@ -757,69 +1048,146 @@ class depth2image:
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        prompt = instruct_text
         img = resize_image(HWC3(image), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         updated_image_path = get_new_image_name(image_path, func_name="depth2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
         return updated_image_path

-class image2normal:
     def __init__(self):
-        print("Direct normal estimation.")
-        self.detector = MidasDetector()
         self.resolution = 512
-        self.bg_threshold = 0.4

     def inference(self, inputs):
-        print("===>Starting image2 normal Inference")
         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
-        _, detected_map = self.detector(resize_image(image, self.resolution), bg_th=self.bg_threshold)
-        detected_map = HWC3(detected_map)
         image = resize_image(image, self.resolution)
-        H, W, C = image.shape
-        detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
         updated_image_path = get_new_image_name(inputs, func_name="normal-map")
-        image = Image.fromarray(detected_map)
         image.save(updated_image_path)
         return updated_image_path

-class normal2image:
     def __init__(self, device):
-        print("Initialize normal2image model...")
-        model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
-        model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_normal.pth', location='cpu'))
-        self.model = model.to(device)
-        self.device = device
-        self.ddim_sampler = DDIMSampler(self.model)
-        self.ddim_steps = 20
         self.image_resolution = 512
-        self.num_samples = 1
-        self.save_memory = False
-        self.strength = 1.0
-        self.guess_mode = False
-        self.scale = 9.0
         self.seed = -1
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

@@ -828,32 +1196,93 @@ class normal2image:
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
-        prompt = instruct_text
-        img = image[:, :, ::-1].copy()
-        img = resize_image(HWC3(img), self.image_resolution)
-        H, W, C = img.shape
-        img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
-        control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
-        control = torch.stack([control for _ in range(self.num_samples)], dim=0)
-        control = einops.rearrange(control, 'b h w c -> b c h w').clone()
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
-        un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
-        shape = (4, H // 8, W // 8)
-        self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
-        samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
-        if self.save_memory:
-            self.model.low_vram_shift(is_diffusing=False)
-        x_samples = self.model.decode_first_stage(samples)
-        x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
         updated_image_path = get_new_image_name(image_path, func_name="normal2image")
-        real_image = Image.fromarray(x_samples[0])  # default the index0 image
-        real_image.save(updated_image_path)
         return updated_image_path

 class BLIPVQA:
     def __init__(self, device):
         print("Initializing BLIP VQA to %s" % device)
 
 from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
 from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
 from controlnet_aux import OpenposeDetector, MLSDdetector, HEDdetector
+
 from transformers import AutoModelForCausalLM, AutoTokenizer, CLIPSegProcessor, CLIPSegForImageSegmentation
 from transformers import pipeline, BlipProcessor, BlipForConditionalGeneration, BlipForQuestionAnswering
+from transformers import AutoImageProcessor, UperNetForSemanticSegmentation
 from ldm.util import instantiate_from_config
 from ControlNet.cldm.model import create_model, load_state_dict
 from ControlNet.cldm.ddim_hacked import DDIMSampler
 
 import cv2
 import random

+def ade_palette():
+    return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50],
+            [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255],
+            [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7],
+            [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82],
+            [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3],
+            [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255],
+            [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220],
+            [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224],
+            [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255],
+            [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7],
+            [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153],
+            [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255],
+            [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0],
+            [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255],
+            [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255],
+            [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255],
+            [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0],
+            [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0],
+            [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255],
+            [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255],
+            [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20],
+            [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255],
+            [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255],
+            [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255],
+            [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0],
+            [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0],
+            [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255],
+            [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112],
+            [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160],
+            [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163],
+            [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0],
+            [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0],
+            [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255],
+            [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204],
+            [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255],
+            [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255],
+            [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
+            [102, 255, 0], [92, 0, 255]]
+
 def HWC3(x):
     assert x.dtype == np.uint8
     if x.ndim == 2:
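ade_palette above is the standard 150-class ADE20K color table that image2seg_new (further down) uses to paint each predicted class. A side note, not part of the commit: the per-label loop there can be collapsed into a single fancy-indexing lookup, assuming seg is an H x W array of class ids:

import numpy as np

palette = np.array(ade_palette(), dtype=np.uint8)  # shape (150, 3)
color_seg = palette[np.asarray(seg)]               # shape (H, W, 3): one lookup instead of a loop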
 
         return updated_image_path


+# class image2line:
+# def __init__(self):
+# print("Direct detect straight line...")
+# self.detector = MLSDdetector()
+# self.value_thresh = 0.1
+# self.dis_thresh = 0.1
+# self.resolution = 512
+#
+# def inference(self, inputs):
+# print("===>Starting image2hough Inference")
+# image = Image.open(inputs)
+# image = np.array(image)
+# image = HWC3(image)
+# hough = self.detector(resize_image(image, self.resolution), self.value_thresh, self.dis_thresh)
+# updated_image_path = get_new_image_name(inputs, func_name="line-of")
+# hough = 255 - cv2.dilate(hough, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
+# image = Image.fromarray(hough)
+# image.save(updated_image_path)
+# return updated_image_path
+#
+#
+# class line2image:
+# def __init__(self, device):
+# print("Initialize the line2image model...")
+# model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+# model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_mlsd.pth', location='cpu'))
+# self.model = model.to(device)
+# self.device = device
+# self.ddim_sampler = DDIMSampler(self.model)
+# self.ddim_steps = 20
+# self.image_resolution = 512
+# self.num_samples = 1
+# self.save_memory = False
+# self.strength = 1.0
+# self.guess_mode = False
+# self.scale = 9.0
+# self.seed = -1
+# self.a_prompt = 'best quality, extremely detailed'
+# self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+#
+# def inference(self, inputs):
+# print("===>Starting line2image Inference")
+# image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+# image = Image.open(image_path)
+# image = np.array(image)
+# image = 255 - image
+# prompt = instruct_text
+# img = resize_image(HWC3(image), self.image_resolution)
+# H, W, C = img.shape
+# img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+# control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+# control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+# control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+# self.seed = random.randint(0, 65535)
+# seed_everything(self.seed)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+# un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+# shape = (4, H // 8, W // 8)
+# self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13) # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
+# samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# x_samples = self.model.decode_first_stage(samples)
+# x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).\
+# cpu().numpy().clip(0,255).astype(np.uint8)
+# updated_image_path = get_new_image_name(image_path, func_name="line2image")
+# real_image = Image.fromarray(x_samples[0]) # default the index0 image
+# real_image.save(updated_image_path)
+# return updated_image_path

+class image2hed_new:
     def __init__(self):
         print("Direct detect soft HED boundary...")
+        self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
         self.resolution = 512

     def inference(self, inputs):

         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
+        image = Image.fromarray(resize_image(image, self.resolution))
+        hed = self.detector(image)
+
         updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
+        hed.save(updated_image_path)
         return updated_image_path

+class hed2image_new:
     def __init__(self, device):
         print("Initialize the hed2image model...")
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-hed"
+        )
+
+        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
+        )
+
+        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
+        self.pipe.to(device)
         self.image_resolution = 512
+        self.num_inference_steps = 20
         self.seed = -1
+        self.unconditional_guidance_scale = 9.0
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
         img = resize_image(HWC3(image), self.image_resolution)
+        img = Image.fromarray(img)
+
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+
+        prompt = instruct_text
+        prompt = prompt + ', ' + self.a_prompt
+        image = \
+            self.pipe(prompt, img, num_inference_steps=self.num_inference_steps, eta=0.0, negative_prompt=self.n_prompt,
+                      guidance_scale=self.unconditional_guidance_scale).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="hed2image")
+        image.save(updated_image_path)
         return updated_image_path
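Chained together, the two classes above reproduce the old hed2image flow end to end. A usage sketch (not part of the commit; the file name and instruction are placeholders, and inference() expects the "path, instruction" string format used throughout this file):

detector = image2hed_new()
generator = hed2image_new(device="cuda")

hed_path = detector.inference("image/room.png")                    # saves the HED boundary map
out_path = generator.inference(hed_path + ", a tidy modern room")  # generation conditioned on it
print(out_path)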

+# class image2hed:
+# def __init__(self):
+# print("Direct detect soft HED boundary...")
+# self.detector = HEDdetector()
+# self.resolution = 512
+#
+# def inference(self, inputs):
+# print("===>Starting image2hed Inference")
+# image = Image.open(inputs)
+# image = np.array(image)
+# image = HWC3(image)
+# hed = self.detector(resize_image(image, self.resolution))
+# updated_image_path = get_new_image_name(inputs, func_name="hed-boundary")
+# image = Image.fromarray(hed)
+# image.save(updated_image_path)
+# return updated_image_path
+#
+#
+# class hed2image:
+# def __init__(self, device):
+# print("Initialize the hed2image model...")
+# model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+# model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_hed.pth', location='cpu'))
+# self.model = model.to(device)
+# self.device = device
+# self.ddim_sampler = DDIMSampler(self.model)
+# self.ddim_steps = 20
+# self.image_resolution = 512
+# self.num_samples = 1
+# self.save_memory = False
+# self.strength = 1.0
+# self.guess_mode = False
+# self.scale = 9.0
+# self.seed = -1
+# self.a_prompt = 'best quality, extremely detailed'
+# self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+#
+# def inference(self, inputs):
+# print("===>Starting hed2image Inference")
+# image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+# image = Image.open(image_path)
+# image = np.array(image)
+# prompt = instruct_text
+# img = resize_image(HWC3(image), self.image_resolution)
+# H, W, C = img.shape
+# img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+# control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+# control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+# control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+# self.seed = random.randint(0, 65535)
+# seed_everything(self.seed)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+# un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+# shape = (4, H // 8, W // 8)
+# self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
+# samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# x_samples = self.model.decode_first_stage(samples)
+# x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+# updated_image_path = get_new_image_name(image_path, func_name="hed2image")
+# real_image = Image.fromarray(x_samples[0]) # default the index0 image
+# real_image.save(updated_image_path)
+# return updated_image_path
+class image2scribble_new:
     def __init__(self):
         print("Direct detect scribble.")
+        self.detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
         self.resolution = 512

     def inference(self, inputs):

         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
         image = resize_image(image, self.resolution)
+        image = Image.fromarray(image)
+        scribble = self.detector(image, scribble=True)
+        scribble = np.array(scribble)
+        scribble = 255 - scribble
+        scribble = Image.fromarray(scribble)
         updated_image_path = get_new_image_name(inputs, func_name="scribble")
+        scribble.save(updated_image_path)
         return updated_image_path
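Where the removed image2scribble derived a scribble from a HED map via nms, GaussianBlur and thresholding, the new class gets an equivalent map from controlnet_aux's scribble=True flag and then inverts it, so the saved file keeps the dark-strokes-on-light convention of the old output. A standalone sketch of that round trip (not part of the commit; placeholder file name):

import numpy as np
from PIL import Image
from controlnet_aux import HEDdetector

detector = HEDdetector.from_pretrained('lllyasviel/ControlNet')
img = Image.open("photo.png")                          # placeholder input
scribble = detector(img, scribble=True)                # PIL image, light strokes on dark
inverted = Image.fromarray(255 - np.array(scribble))   # dark strokes on light, as saved above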

+class scribble2image_new:
     def __init__(self, device):
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-scribble"
+        )
+
+        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
+        )
+
+        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
+        self.pipe.to(device)
         self.image_resolution = 512
+        self.num_inference_steps = 20
         self.seed = -1
+        self.unconditional_guidance_scale = 9.0
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

     def inference(self, inputs):
         print("===>Starting scribble2image Inference")
         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
         image = 255 - image
         img = resize_image(HWC3(image), self.image_resolution)
+        img = Image.fromarray(img)
+
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+
+        prompt = instruct_text
+        prompt = prompt + ', ' + self.a_prompt
+        image = \
+            self.pipe(prompt, img, num_inference_steps=self.num_inference_steps, eta=0.0, negative_prompt=self.n_prompt,
+                      guidance_scale=self.unconditional_guidance_scale).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
+        image.save(updated_image_path)
         return updated_image_path
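Note that every *_new inference draws a fresh random seed per call via seed_everything, so repeated calls give different images. For deterministic output, diffusers pipelines also accept an explicit torch.Generator instead of relying on the global seed; a sketch (not part of the commit):

import torch

g = torch.Generator(device="cuda").manual_seed(42)  # fixed seed
image = pipe(prompt, img,
             num_inference_steps=20,
             guidance_scale=9.0,
             negative_prompt=n_prompt,
             generator=g).images[0]                 # same seed, same inputs -> same image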

+# class image2scribble:
+# def __init__(self):
+# print("Direct detect scribble.")
+# self.detector = HEDdetector()
+# self.resolution = 512
+#
+# def inference(self, inputs):
+# print("===>Starting image2scribble Inference")
+# image = Image.open(inputs)
+# image = np.array(image)
+# image = HWC3(image)
+# detected_map = self.detector(resize_image(image, self.resolution))
+# detected_map = HWC3(detected_map)
+# image = resize_image(image, self.resolution)
+# H, W, C = image.shape
+# detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+# detected_map = nms(detected_map, 127, 3.0)
+# detected_map = cv2.GaussianBlur(detected_map, (0, 0), 3.0)
+# detected_map[detected_map > 4] = 255
+# detected_map[detected_map < 255] = 0
+# detected_map = 255 - detected_map
+# updated_image_path = get_new_image_name(inputs, func_name="scribble")
+# image = Image.fromarray(detected_map)
+# image.save(updated_image_path)
+# return updated_image_path
+#
+# class scribble2image:
+# def __init__(self, device):
+# print("Initialize the scribble2image model...")
+# model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+# model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_scribble.pth', location='cpu'))
+# self.model = model.to(device)
+# self.device = device
+# self.ddim_sampler = DDIMSampler(self.model)
+# self.ddim_steps = 20
+# self.image_resolution = 512
+# self.num_samples = 1
+# self.save_memory = False
+# self.strength = 1.0
+# self.guess_mode = False
+# self.scale = 9.0
+# self.seed = -1
+# self.a_prompt = 'best quality, extremely detailed'
+# self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+#
+# def inference(self, inputs):
+# print("===>Starting scribble2image Inference")
+# print(f'sketch device {self.device}')
+# image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+# image = Image.open(image_path)
+# image = np.array(image)
+# prompt = instruct_text
+# image = 255 - image
+# img = resize_image(HWC3(image), self.image_resolution)
+# H, W, C = img.shape
+# img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+# control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+# control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+# control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+# self.seed = random.randint(0, 65535)
+# seed_everything(self.seed)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+# un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+# shape = (4, H // 8, W // 8)
+# self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
+# samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# x_samples = self.model.decode_first_stage(samples)
+# x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+# updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
+# real_image = Image.fromarray(x_samples[0]) # default the index0 image
+# real_image.save(updated_image_path)
+# return updated_image_path
+
+class image2pose_new:
     def __init__(self):
+        self.detector = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
         self.resolution = 512

     def inference(self, inputs):

         image = Image.open(inputs)
         image = np.array(image)
         image = HWC3(image)
         image = resize_image(image, self.resolution)
+        image = Image.fromarray(image)
+        pose = self.detector(image)
+
         updated_image_path = get_new_image_name(inputs, func_name="human-pose")
+        pose.save(updated_image_path)
         return updated_image_path

+class pose2image_new:
     def __init__(self, device):
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-openpose"
+        )
+
+        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
+        )
+
+        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
+        self.pipe.to(device)
         self.image_resolution = 512
+        self.num_inference_steps = 20
         self.seed = -1
+        self.unconditional_guidance_scale = 9.0
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
         img = resize_image(HWC3(image), self.image_resolution)
+        img = Image.fromarray(img)
+
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+
+        prompt = instruct_text
+        prompt = prompt + ', ' + self.a_prompt
+        image = \
+            self.pipe(prompt, img, num_inference_steps=self.num_inference_steps, eta=0.0, negative_prompt=self.n_prompt,
+                      guidance_scale=self.unconditional_guidance_scale).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="pose2image")
+        image.save(updated_image_path)
         return updated_image_path
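One thing the migration drops is the old save_memory / low_vram_shift path; nothing equivalent is wired up in the new classes. diffusers exposes comparable switches on the pipeline object, depending on the installed version; a sketch (enable_model_cpu_offload needs accelerate and replaces the pipe.to(device) call):

pipe.enable_attention_slicing()   # compute attention in slices: slower, lower peak VRAM
pipe.enable_model_cpu_offload()   # keep submodules on CPU, move each to GPU only when used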


+# class image2pose:
+# def __init__(self):
+# print("Direct human pose.")
+# self.detector = OpenposeDetector()
+# self.resolution = 512
+#
+# def inference(self, inputs):
+# print("===>Starting image2pose Inference")
+# image = Image.open(inputs)
+# image = np.array(image)
+# image = HWC3(image)
+# detected_map, _ = self.detector(resize_image(image, self.resolution))
+# detected_map = HWC3(detected_map)
+# image = resize_image(image, self.resolution)
+# H, W, C = image.shape
+# detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+# updated_image_path = get_new_image_name(inputs, func_name="human-pose")
+# image = Image.fromarray(detected_map)
+# image.save(updated_image_path)
+# return updated_image_path
+#
+# class pose2image:
+# def __init__(self, device):
+# print("Initialize the pose2image model...")
+# model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+# model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_openpose.pth', location='cpu'))
+# self.model = model.to(device)
+# self.device = device
+# self.ddim_sampler = DDIMSampler(self.model)
+# self.ddim_steps = 20
+# self.image_resolution = 512
+# self.num_samples = 1
+# self.save_memory = False
+# self.strength = 1.0
+# self.guess_mode = False
+# self.scale = 9.0
+# self.seed = -1
+# self.a_prompt = 'best quality, extremely detailed'
+# self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+#
+# def inference(self, inputs):
+# print("===>Starting pose2image Inference")
+# image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+# image = Image.open(image_path)
+# image = np.array(image)
+# prompt = instruct_text
+# img = resize_image(HWC3(image), self.image_resolution)
+# H, W, C = img.shape
+# img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+# control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+# control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+# control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+# self.seed = random.randint(0, 65535)
+# seed_everything(self.seed)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# cond = {"c_concat": [control], "c_crossattn": [ self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+# un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+# shape = (4, H // 8, W // 8)
+# self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
+# samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+# if self.save_memory:
+# self.model.low_vram_shift(is_diffusing=False)
+# x_samples = self.model.decode_first_stage(samples)
+# x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+# updated_image_path = get_new_image_name(image_path, func_name="pose2image")
+# real_image = Image.fromarray(x_samples[0]) # default the index0 image
+# real_image.save(updated_image_path)
+# return updated_image_path
+class image2seg_new:
+    def __init__(self):
+        print("Initialize image2segmentation Inference")
+        self.image_processor = AutoImageProcessor.from_pretrained("openmmlab/upernet-convnext-small")
+        self.image_segmentor = UperNetForSemanticSegmentation.from_pretrained("openmmlab/upernet-convnext-small")
+        self.resolution = 512
+
+    def inference(self, inputs):
+        image = Image.open(inputs)
+        image = np.array(image)
+        image = HWC3(image)
+        image = resize_image(image, self.resolution)
+        image = Image.fromarray(image)
+        pixel_values = self.image_processor(image, return_tensors="pt").pixel_values
+
+        with torch.no_grad():
+            outputs = self.image_segmentor(pixel_values)
+
+        seg = self.image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
+
+        color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)  # height, width, 3
+
+        palette = np.array(ade_palette())
+
+        for label, color in enumerate(palette):
+            color_seg[seg == label, :] = color
+
+        color_seg = color_seg.astype(np.uint8)
+
+        segmentation = Image.fromarray(color_seg)
         updated_image_path = get_new_image_name(inputs, func_name="segmentation")
+        segmentation.save(updated_image_path)
         return updated_image_path
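An easy-to-miss detail in image2seg_new: PIL's Image.size is (width, height) while post_process_semantic_segmentation expects (height, width) target sizes, hence image.size[::-1]; the returned seg is an H x W map of ADE20K class ids. Restated outside the class with the shape bookkeeping spelled out (processor and segmentor mirror the attributes above; image is a PIL image):

pixel_values = processor(image, return_tensors="pt").pixel_values  # (1, 3, H', W') model input
with torch.no_grad():
    outputs = segmentor(pixel_values)
seg = processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]    # (H, W) -- PIL size is (W, H)
)[0]                                            # H x W tensor of class ids
assert tuple(seg.shape) == (image.size[1], image.size[0])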

+class seg2image_new:
     def __init__(self, device):
+        self.controlnet = ControlNetModel.from_pretrained(
+            "fusing/stable-diffusion-v1-5-controlnet-seg"
+        )
+
+        self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+            "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
+        )
+
+        self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
+        self.pipe.to(device)
         self.image_resolution = 512
+        self.num_inference_steps = 20
         self.seed = -1
+        self.unconditional_guidance_scale = 9.0
         self.a_prompt = 'best quality, extremely detailed'
         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
         image = Image.open(image_path)
         image = np.array(image)
         img = resize_image(HWC3(image), self.image_resolution)
+        img = Image.fromarray(img)
+
         self.seed = random.randint(0, 65535)
         seed_everything(self.seed)
+
+        prompt = instruct_text
+        prompt = prompt + ', ' + self.a_prompt
+        image = \
+            self.pipe(prompt, img, num_inference_steps=self.num_inference_steps, eta=0.0, negative_prompt=self.n_prompt,
+                      guidance_scale=self.unconditional_guidance_scale).images[0]
         updated_image_path = get_new_image_name(image_path, func_name="segment2image")
+        image.save(updated_image_path)
         return updated_image_path
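All four pipelines in this diff load in full precision. On smaller GPUs the same checkpoints can be loaded in half precision instead; a sketch (not part of the commit, assuming a CUDA device):

import torch
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel

controlnet = ControlNetModel.from_pretrained(
    "fusing/stable-diffusion-v1-5-controlnet-seg", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", controlnet=controlnet,
    safety_checker=None, torch_dtype=torch.float16
).to("cuda")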
933
 
934
+
+
+ # class image2seg:
+ #     def __init__(self):
+ #         print("===>Starting image2seg Inference")
+ #         print("Direct segmentations.")
+ #         self.detector = UniformerDetector()
+ #         self.resolution = 512
+ #
+ #     def inference(self, inputs):
+ #         print("===>Starting image2seg Inference")
+ #         image = Image.open(inputs)
+ #         image = np.array(image)
+ #         image = HWC3(image)
+ #         detected_map = self.detector(resize_image(image, self.resolution))
+ #         detected_map = HWC3(detected_map)
+ #         image = resize_image(image, self.resolution)
+ #         H, W, C = image.shape
+ #         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+ #         updated_image_path = get_new_image_name(inputs, func_name="segmentation")
+ #         image = Image.fromarray(detected_map)
+ #         image.save(updated_image_path)
+ #         return updated_image_path
+ #
+ # class seg2image:
+ #     def __init__(self, device):
+ #         print("Initialize the seg2image model...")
+ #         model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+ #         model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_seg.pth', location='cpu'))
+ #         self.model = model.to(device)
+ #         self.device = device
+ #         self.ddim_sampler = DDIMSampler(self.model)
+ #         self.ddim_steps = 20
+ #         self.image_resolution = 512
+ #         self.num_samples = 1
+ #         self.save_memory = False
+ #         self.strength = 1.0
+ #         self.guess_mode = False
+ #         self.scale = 9.0
+ #         self.seed = -1
+ #         self.a_prompt = 'best quality, extremely detailed'
+ #         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+ #
+ #     def inference(self, inputs):
+ #         print("===>Starting seg2image Inference")
+ #         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+ #         image = Image.open(image_path)
+ #         image = np.array(image)
+ #         prompt = instruct_text
+ #         img = resize_image(HWC3(image), self.image_resolution)
+ #         H, W, C = img.shape
+ #         img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+ #         control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+ #         control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+ #         control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+ #         self.seed = random.randint(0, 65535)
+ #         seed_everything(self.seed)
+ #         if self.save_memory:
+ #             self.model.low_vram_shift(is_diffusing=False)
+ #         cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+ #         un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+ #         shape = (4, H // 8, W // 8)
+ #         self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
+ #         samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+ #         if self.save_memory:
+ #             self.model.low_vram_shift(is_diffusing=False)
+ #         x_samples = self.model.decode_first_stage(samples)
+ #         x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+ #         updated_image_path = get_new_image_name(image_path, func_name="segment2image")
+ #         real_image = Image.fromarray(x_samples[0])  # default the index0 image
+ #         real_image.save(updated_image_path)
+ #         return updated_image_path
+ class image2depth_new:
      def __init__(self):
+         print("Initialize the depth estimation pipeline...")
+         self.depth_estimator = pipeline('depth-estimation')
          self.resolution = 512

      def inference(self, inputs):
          image = Image.open(inputs)
          image = np.array(image)
          image = HWC3(image)
          image = resize_image(image, self.resolution)
+         image = Image.fromarray(image)
+         depth = self.depth_estimator(image)['depth']
+         depth = np.array(depth)
+         depth = depth[:, :, None]
+         # replicate the single depth channel into a 3-channel image
+         depth = np.concatenate([depth, depth, depth], axis=2)
+         depth = Image.fromarray(depth)
          updated_image_path = get_new_image_name(inputs, func_name="depth")
+         depth.save(updated_image_path)
          return updated_image_path

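+ # Note: the bare pipeline('depth-estimation') call relies on whatever default
+ # checkpoint transformers picks for the task; pinning it explicitly is one
+ # option (the checkpoint name below is an assumption, not part of this change):
+ #
+ #     self.depth_estimator = pipeline('depth-estimation', model='Intel/dpt-large')
+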
+ class depth2image_new:
      def __init__(self, device):
+         self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-depth")
+
+         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+             "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
+         )
+
+         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
+         self.pipe.to(device)
          self.image_resolution = 512
+         self.num_inference_steps = 20
          self.seed = -1
+         self.unconditional_guidance_scale = 9.0
          self.a_prompt = 'best quality, extremely detailed'
          self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

      def inference(self, inputs):
          image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
          image = Image.open(image_path)
          image = np.array(image)
          img = resize_image(HWC3(image), self.image_resolution)
+         img = Image.fromarray(img)
          self.seed = random.randint(0, 65535)
          seed_everything(self.seed)
+         prompt = instruct_text + ', ' + self.a_prompt
+         image = self.pipe(prompt, img, num_inference_steps=self.num_inference_steps, eta=0.0,
+                           negative_prompt=self.n_prompt,
+                           guidance_scale=self.unconditional_guidance_scale).images[0]
          updated_image_path = get_new_image_name(image_path, func_name="depth2image")
+         image.save(updated_image_path)
          return updated_image_path

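+ # A chained sketch on a hypothetical file, mirroring the "<path>,<text>"
+ # input convention above:
+ #
+ #     depth_path = image2depth_new().inference("image/example.png")
+ #     result_path = depth2image_new("cuda:0").inference(depth_path + ",a cozy cabin interior")
+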
+ # class image2depth:
+ #     def __init__(self):
+ #         print("Direct depth estimation.")
+ #         self.detector = MidasDetector()
+ #         self.resolution = 512
+ #
+ #     def inference(self, inputs):
+ #         print("===>Starting image2depth Inference")
+ #         image = Image.open(inputs)
+ #         image = np.array(image)
+ #         image = HWC3(image)
+ #         detected_map, _ = self.detector(resize_image(image, self.resolution))
+ #         detected_map = HWC3(detected_map)
+ #         image = resize_image(image, self.resolution)
+ #         H, W, C = image.shape
+ #         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+ #         updated_image_path = get_new_image_name(inputs, func_name="depth")
+ #         image = Image.fromarray(detected_map)
+ #         image.save(updated_image_path)
+ #         return updated_image_path
+ #
+ # class depth2image:
+ #     def __init__(self, device):
+ #         print("Initialize depth2image model...")
+ #         model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+ #         model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_depth.pth', location='cpu'))
+ #         self.model = model.to(device)
+ #         self.device = device
+ #         self.ddim_sampler = DDIMSampler(self.model)
+ #         self.ddim_steps = 20
+ #         self.image_resolution = 512
+ #         self.num_samples = 1
+ #         self.save_memory = False
+ #         self.strength = 1.0
+ #         self.guess_mode = False
+ #         self.scale = 9.0
+ #         self.seed = -1
+ #         self.a_prompt = 'best quality, extremely detailed'
+ #         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+ #
+ #     def inference(self, inputs):
+ #         print("===>Starting depth2image Inference")
+ #         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+ #         image = Image.open(image_path)
+ #         image = np.array(image)
+ #         prompt = instruct_text
+ #         img = resize_image(HWC3(image), self.image_resolution)
+ #         H, W, C = img.shape
+ #         img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+ #         control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+ #         control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+ #         control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+ #         self.seed = random.randint(0, 65535)
+ #         seed_everything(self.seed)
+ #         if self.save_memory:
+ #             self.model.low_vram_shift(is_diffusing=False)
+ #         cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+ #         un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+ #         shape = (4, H // 8, W // 8)
+ #         self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)  # Magic number. IDK why. Perhaps because 0.825**12<0.01 but 0.826**12>0.01
+ #         samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+ #         if self.save_memory:
+ #             self.model.low_vram_shift(is_diffusing=False)
+ #         x_samples = self.model.decode_first_stage(samples)
+ #         x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+ #         updated_image_path = get_new_image_name(image_path, func_name="depth2image")
+ #         real_image = Image.fromarray(x_samples[0])  # default the index0 image
+ #         real_image.save(updated_image_path)
+ #         return updated_image_path
+
+ class image2normal_new:
      def __init__(self):
+         print("Initialize the normal estimation pipeline...")
+         self.depth_estimator = pipeline("depth-estimation", model="Intel/dpt-hybrid-midas")
          self.resolution = 512
+         self.bg_threshold = 0.4

      def inference(self, inputs):
          image = Image.open(inputs)
          image = np.array(image)
          image = HWC3(image)
          image = resize_image(image, self.resolution)
+         image = Image.fromarray(image)
+         image = self.depth_estimator(image)['predicted_depth'][0]
+         image = image.numpy()
+         # normalize depth to [0, 1] so the background threshold is scale-free
+         image_depth = image.copy()
+         image_depth -= np.min(image_depth)
+         image_depth /= np.max(image_depth)
+         # Sobel gradients along x and y; background pixels are zeroed out
+         x = cv2.Sobel(image, cv2.CV_32F, 1, 0, ksize=3)
+         x[image_depth < self.bg_threshold] = 0
+         y = cv2.Sobel(image, cv2.CV_32F, 0, 1, ksize=3)
+         y[image_depth < self.bg_threshold] = 0
+         z = np.ones_like(x) * np.pi * 2.0
+         image = np.stack([x, y, z], axis=2)
+         image /= np.sum(image ** 2.0, axis=2, keepdims=True) ** 0.5
+         image = (image * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
+         image = Image.fromarray(image)
          updated_image_path = get_new_image_name(inputs, func_name="normal-map")
          image.save(updated_image_path)
          return updated_image_path

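+ # How the block above builds a normal map: the Sobel responses along x and y
+ # serve as the tangent components of a pseudo surface normal, pixels whose
+ # normalized depth falls below bg_threshold are treated as background, a
+ # constant z keeps the normals front-facing, and each per-pixel vector is
+ # L2-normalized before v * 127.5 + 127.5 maps it into the 0-255 RGB range.
+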
+ class normal2image_new:
      def __init__(self, device):
+         self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-normal")
+
+         self.pipe = StableDiffusionControlNetPipeline.from_pretrained(
+             "runwayml/stable-diffusion-v1-5", controlnet=self.controlnet, safety_checker=None
+         )
+
+         self.pipe.scheduler = UniPCMultistepScheduler.from_config(self.pipe.scheduler.config)
+         self.pipe.to(device)
          self.image_resolution = 512
+         self.num_inference_steps = 20
          self.seed = -1
+         self.unconditional_guidance_scale = 9.0
          self.a_prompt = 'best quality, extremely detailed'
          self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'

      def inference(self, inputs):
          image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
          image = Image.open(image_path)
          image = np.array(image)
+         img = resize_image(HWC3(image), self.image_resolution)
+         img = Image.fromarray(img)
          self.seed = random.randint(0, 65535)
          seed_everything(self.seed)
+         prompt = instruct_text + ', ' + self.a_prompt
+         image = self.pipe(prompt, img, num_inference_steps=self.num_inference_steps, eta=0.0,
+                           negative_prompt=self.n_prompt,
+                           guidance_scale=self.unconditional_guidance_scale).images[0]
          updated_image_path = get_new_image_name(image_path, func_name="normal2image")
+         image.save(updated_image_path)
          return updated_image_path

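+ # The three *2image_new classes differ only in the ControlNet checkpoint they
+ # load, so a shared factory is one possible follow-up (sketch only; the helper
+ # name and signature are hypothetical):
+ #
+ #     def make_controlnet_pipe(checkpoint, device):
+ #         controlnet = ControlNetModel.from_pretrained(checkpoint)
+ #         pipe = StableDiffusionControlNetPipeline.from_pretrained(
+ #             "runwayml/stable-diffusion-v1-5", controlnet=controlnet, safety_checker=None)
+ #         pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+ #         return pipe.to(device)
+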
+ # class image2normal:
+ #     def __init__(self):
+ #         print("Direct normal estimation.")
+ #         self.detector = MidasDetector()
+ #         self.resolution = 512
+ #         self.bg_threshold = 0.4
+ #
+ #     def inference(self, inputs):
+ #         print("===>Starting image2normal Inference")
+ #         image = Image.open(inputs)
+ #         image = np.array(image)
+ #         image = HWC3(image)
+ #         _, detected_map = self.detector(resize_image(image, self.resolution), bg_th=self.bg_threshold)
+ #         detected_map = HWC3(detected_map)
+ #         image = resize_image(image, self.resolution)
+ #         H, W, C = image.shape
+ #         detected_map = cv2.resize(detected_map, (W, H), interpolation=cv2.INTER_LINEAR)
+ #         updated_image_path = get_new_image_name(inputs, func_name="normal-map")
+ #         image = Image.fromarray(detected_map)
+ #         image.save(updated_image_path)
+ #         return updated_image_path
+ #
+ # class normal2image:
+ #     def __init__(self, device):
+ #         print("Initialize normal2image model...")
+ #         model = create_model('ControlNet/models/cldm_v15.yaml', device=device).to(device)
+ #         model.load_state_dict(load_state_dict('ControlNet/models/control_sd15_normal.pth', location='cpu'))
+ #         self.model = model.to(device)
+ #         self.device = device
+ #         self.ddim_sampler = DDIMSampler(self.model)
+ #         self.ddim_steps = 20
+ #         self.image_resolution = 512
+ #         self.num_samples = 1
+ #         self.save_memory = False
+ #         self.strength = 1.0
+ #         self.guess_mode = False
+ #         self.scale = 9.0
+ #         self.seed = -1
+ #         self.a_prompt = 'best quality, extremely detailed'
+ #         self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality'
+ #
+ #     def inference(self, inputs):
+ #         print("===>Starting normal2image Inference")
+ #         image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
+ #         image = Image.open(image_path)
+ #         image = np.array(image)
+ #         prompt = instruct_text
+ #         img = image[:, :, ::-1].copy()
+ #         img = resize_image(HWC3(img), self.image_resolution)
+ #         H, W, C = img.shape
+ #         img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
+ #         control = torch.from_numpy(img.copy()).float().to(device=self.device) / 255.0
+ #         control = torch.stack([control for _ in range(self.num_samples)], dim=0)
+ #         control = einops.rearrange(control, 'b h w c -> b c h w').clone()
+ #         self.seed = random.randint(0, 65535)
+ #         seed_everything(self.seed)
+ #         if self.save_memory:
+ #             self.model.low_vram_shift(is_diffusing=False)
+ #         cond = {"c_concat": [control], "c_crossattn": [self.model.get_learned_conditioning([prompt + ', ' + self.a_prompt] * self.num_samples)]}
+ #         un_cond = {"c_concat": None if self.guess_mode else [control], "c_crossattn": [self.model.get_learned_conditioning([self.n_prompt] * self.num_samples)]}
+ #         shape = (4, H // 8, W // 8)
+ #         self.model.control_scales = [self.strength * (0.825 ** float(12 - i)) for i in range(13)] if self.guess_mode else ([self.strength] * 13)
+ #         samples, intermediates = self.ddim_sampler.sample(self.ddim_steps, self.num_samples, shape, cond, verbose=False, eta=0., unconditional_guidance_scale=self.scale, unconditional_conditioning=un_cond)
+ #         if self.save_memory:
+ #             self.model.low_vram_shift(is_diffusing=False)
+ #         x_samples = self.model.decode_first_stage(samples)
+ #         x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + 127.5).cpu().numpy().clip(0, 255).astype(np.uint8)
+ #         updated_image_path = get_new_image_name(image_path, func_name="normal2image")
+ #         real_image = Image.fromarray(x_samples[0])  # default the index0 image
+ #         real_image.save(updated_image_path)
+ #         return updated_image_path
+
  class BLIPVQA:
      def __init__(self, device):
          print("Initializing BLIP VQA to %s" % device)