Long generation time with the Google Colab code compared to diffusers

#39
by Filipprox - opened

Code from the Google Colab for text-to-image generation:

```python
import sys
sys.path.append('/StableCascade')  # assuming the repo is checked out here, matching the config paths below

import yaml
import torch
import torchvision.transforms as T
from tqdm import tqdm

from inference.utils import calculate_latent_sizes  # StableCascade repo helper
from train import WurstCoreC, WurstCoreB


class ImageGenerator:
    def __init__(self):
        self.device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
        self.config_file_c = '/StableCascade/configs/inference/stage_c_3b.yaml'
        self.config_file_b = '/StableCascade/configs/inference/stage_b_3b.yaml'
        self.setup_models()

    def setup_models(self):
        # SETUP STAGE C
        with open(self.config_file_c, "r", encoding="utf-8") as file:
            loaded_config = yaml.safe_load(file)
        self.core = WurstCoreC(config_dict=loaded_config, device=self.device, training=False)
        self.extras = self.core.setup_extras_pre()
        models = self.core.setup_models(self.extras)
        models.generator.eval().requires_grad_(False)
        print("STAGE C READY")

        # SETUP STAGE B (reusing the Stage C tokenizer and text model)
        with open(self.config_file_b, "r", encoding="utf-8") as file:
            config_file_b = yaml.safe_load(file)
        self.core_b = WurstCoreB(config_dict=config_file_b, device=self.device, training=False)
        self.extras_b = self.core_b.setup_extras_pre()
        models_b = self.core_b.setup_models(self.extras_b, skip_clip=True)
        models_b = WurstCoreB.Models(
            **{**models_b.to_dict(), 'tokenizer': models.tokenizer, 'text_model': models.text_model}
        )
        models_b.generator.bfloat16().eval().requires_grad_(False)
        print("STAGE B READY")

        # Compile models. Note: with mode="reduce-overhead" and fullgraph=True, the
        # first sampling call triggers a one-off compilation that can take minutes.
        models = WurstCoreC.Models(
            **{**models.to_dict(), 'generator': torch.compile(models.generator, mode="reduce-overhead", fullgraph=True)}
        )
        models_b = WurstCoreB.Models(
            **{**models_b.to_dict(), 'generator': torch.compile(models_b.generator, mode="reduce-overhead", fullgraph=True)}
        )
        self.models = models
        self.models_b = models_b

    def generate_and_save(self, caption, batch_size=1, height=1024, width=1024):
        # PREPARE CONDITIONS
        stage_c_latent_shape, stage_b_latent_shape = calculate_latent_sizes(height, width, batch_size=batch_size)

        # Stage C parameters
        self.extras.sampling_configs['cfg'] = 4
        self.extras.sampling_configs['shift'] = 2
        self.extras.sampling_configs['timesteps'] = 20
        self.extras.sampling_configs['t_start'] = 1.0

        # Stage B parameters
        self.extras_b.sampling_configs['cfg'] = 1.1
        self.extras_b.sampling_configs['shift'] = 1
        self.extras_b.sampling_configs['timesteps'] = 10
        self.extras_b.sampling_configs['t_start'] = 1.0

        batch = {'captions': [caption] * batch_size}
        conditions = self.core.get_conditions(batch, self.models, self.extras, is_eval=True, is_unconditional=False, eval_image_embeds=False)
        unconditions = self.core.get_conditions(batch, self.models, self.extras, is_eval=True, is_unconditional=True, eval_image_embeds=False)
        conditions_b = self.core_b.get_conditions(batch, self.models_b, self.extras_b, is_eval=True, is_unconditional=False)
        unconditions_b = self.core_b.get_conditions(batch, self.models_b, self.extras_b, is_eval=True, is_unconditional=True)

        with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.bfloat16):
            # Stage C sampling: exhaust the generator; sampled_c ends up as the final latent
            sampling_c = self.extras.gdf.sample(
                self.models.generator, conditions, stage_c_latent_shape,
                unconditions, device=self.device, **self.extras.sampling_configs,
            )
            for (sampled_c, _, _) in tqdm(sampling_c, total=self.extras.sampling_configs['timesteps']):
                pass

            # Stage B sampling, conditioned on the Stage C latent
            conditions_b['effnet'] = sampled_c
            unconditions_b['effnet'] = torch.zeros_like(sampled_c)
            sampling_b = self.extras_b.gdf.sample(
                self.models_b.generator, conditions_b, stage_b_latent_shape,
                unconditions_b, device=self.device, **self.extras_b.sampling_configs
            )
            for (sampled_b, _, _) in tqdm(sampling_b, total=self.extras_b.sampling_configs['timesteps']):
                pass

            # Decode to pixels with Stage A and convert the first image to PIL
            sampled = self.models_b.stage_a.decode(sampled_b).float()
            sampled_image = T.ToPILImage()(sampled[0].cpu().detach().clamp_(0, 1))

        return sampled_image
```

Generating an image with this Colab-derived code takes about 12 minutes, while generating with `diffusers` takes about 30 seconds.
What could be the reason for such a long generation time?
I use:
big - model_version: 3.6B, dtype: bfloat16, stage_c_bf16.safetensors
big - model_version: 3B, dtype: bfloat16, stage_b_bf16.safetensors
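
One thing worth checking is the `torch.compile` step in `setup_models`: with `mode="reduce-overhead"` and `fullgraph=True`, the first sampling call pays a long one-off compilation cost, and recompilation is triggered again whenever the input shapes change. A minimal sketch (reusing the `ImageGenerator` class above; the prompt is arbitrary) to test whether compilation, not sampling, dominates the 12 minutes:

```python
import time

gen = ImageGenerator()  # compilation is deferred until the first forward pass

t0 = time.perf_counter()
gen.generate_and_save("a photo of a cat")  # first call: includes torch.compile warmup
print(f"first call:  {time.perf_counter() - t0:.1f}s")

t0 = time.perf_counter()
gen.generate_and_save("a photo of a cat")  # second call: steady-state sampling speed
print(f"second call: {time.perf_counter() - t0:.1f}s")
```

If the second call is fast, the 12 minutes is mostly one-off compilation; removing the `torch.compile` wrapping should then bring a single run much closer to the `diffusers` timing.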

Is this right?

```python
self.device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
```

I'm not sure about the device numbering on Colab; try:

```python
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```
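
To check which GPUs are actually visible (plain PyTorch, nothing specific to StableCascade), something like this works; a Colab instance normally exposes a single GPU, so only `cuda:0` (or just `cuda`) would be valid there:

```python
import torch

print(torch.cuda.device_count())  # number of visible GPUs
for i in range(torch.cuda.device_count()):
    print(i, torch.cuda.get_device_name(i))
```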

Thanks for answering!
Yes, it is right: "cuda:2" is the name of my device, and I can see the GPU filling up when I run the code. (I'm running the code on my local machine, not on Colab.)
Does anyone have a similar problem, or do you all get roughly the same execution time?
