import importlib
import os
import random

import cv2
import numpy as np
import torch
import torch.nn.functional as F
from transformers import PretrainedConfig


def seed_everything(seed):
    """Seed Python, NumPy, and PyTorch RNGs for reproducibility."""
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def is_torch2_available():
    """Detect PyTorch >= 2.0 via the presence of scaled_dot_product_attention."""
    return hasattr(F, "scaled_dot_product_attention")


def instantiate_from_config(config):
    """Instantiate an object from a config with a `target` dotted path and optional `params`."""
    if "target" not in config:
        if config == "__is_first_stage__" or config == "__is_unconditional__":
            return None
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", {}))


def get_obj_from_str(string, reload=False):
    """Resolve a dotted path like `package.module.ClassName` to the named attribute."""
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


def drop_seq_token(seq, drop_rate=0.5):
    """Randomly drop tokens along dim 1 of `seq`, keeping a (1 - drop_rate) fraction."""
    idx = torch.randperm(seq.size(1))
    num_keep_tokens = int(len(idx) * (1 - drop_rate))
    idx = idx[:num_keep_tokens]
    seq = seq[:, idx]
    return seq


def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder"
):
    """Return the text-encoder class matching the architecture recorded in the model config."""
    text_encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path, subfolder=subfolder, revision=revision
    )
    model_class = text_encoder_config.architectures[0]

    if model_class == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel
    elif model_class == "CLIPTextModelWithProjection":  # noqa: RET505
        from transformers import CLIPTextModelWithProjection

        return CLIPTextModelWithProjection
    else:
        raise ValueError(f"{model_class} is not supported.")


def resize_numpy_image_long(image, resize_long_edge=768):
    """Downscale an HxWxC image so its long edge is at most `resize_long_edge`; never upscales."""
    h, w = image.shape[:2]
    if max(h, w) <= resize_long_edge:
        return image
    k = resize_long_edge / max(h, w)
    h = int(h * k)
    w = int(w * k)
    image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
    return image
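

# Minimal usage sketch of the helpers above. The tensor shapes and dotted path
# below are illustrative assumptions, not values used elsewhere in this module.
if __name__ == "__main__":
    seed_everything(42)

    # Randomly drop half the tokens of a (batch, seq_len, dim) tensor.
    tokens = torch.randn(2, 16, 64)
    kept = drop_seq_token(tokens, drop_rate=0.5)
    print(kept.shape)  # torch.Size([2, 8, 64])

    # Resolve a class from a dotted path and instantiate it.
    Linear = get_obj_from_str("torch.nn.Linear")
    layer = Linear(64, 32)
    print(layer)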