dg845 committed on
Commit
10ccafc
1 Parent(s): f6adb30

Upload 9 files

Browse files

Add unidiffuser original code since I can't figure out how to package it correctly

dpm_solver_pp.py ADDED
@@ -0,0 +1,952 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import math
4
+ import numpy as np
5
+ import torch.distributed as dist
6
+
7
+
8
+ def interpolate_fn(x: torch.Tensor, xp: torch.Tensor, yp: torch.Tensor) -> torch.Tensor:
9
+ """Performs piecewise linear interpolation for x, using xp and yp keypoints (knots).
10
+ Performs separate interpolation for each channel.
11
+ Args:
12
+ x: [N, C] points to be calibrated (interpolated). Batch with C channels.
13
+ xp: [C, K] x coordinates of the PWL knots. C is the number of channels, K is the number of knots.
14
+ yp: [C, K] y coordinates of the PWL knots. C is the number of channels, K is the number of knots.
15
+ Returns:
16
+ Interpolated points of the shape [N, C].
17
+ The piecewise linear function extends for the whole x axis (the outermost keypoints define the outermost
18
+ infinite lines).
19
+ For example:
20
+ >>> interpolate_fn(torch.tensor([[0.5]]), torch.tensor([[0.0, 1.0]]), torch.tensor([[0.0, 2.0]]))
21
+ tensor([[1.0000]])
22
+ >>> interpolate_fn(torch.tensor([[-10]]), torch.tensor([[0.0, 1.0]]), torch.tensor([[0.0, 2.0]]))
23
+ tensor([[-20.0000]])
24
+ """
25
+ x_breakpoints = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((x.shape[0], 1, 1))], dim=2)
26
+ num_x_points = xp.shape[1]
27
+ sorted_x_breakpoints, x_indices = torch.sort(x_breakpoints, dim=2)
28
+ x_idx = torch.argmin(x_indices, dim=2)
29
+ cand_start_idx = x_idx - 1
30
+ start_idx = torch.where(
31
+ torch.eq(x_idx, 0),
32
+ torch.tensor(1, device=x.device),
33
+ torch.where(
34
+ torch.eq(x_idx, num_x_points), torch.tensor(num_x_points - 2, device=x.device), cand_start_idx,
35
+ ),
36
+ )
37
+ end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
38
+ start_x = torch.gather(sorted_x_breakpoints, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
39
+ end_x = torch.gather(sorted_x_breakpoints, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
40
+ start_idx2 = torch.where(
41
+ torch.eq(x_idx, 0),
42
+ torch.tensor(0, device=x.device),
43
+ torch.where(
44
+ torch.eq(x_idx, num_x_points), torch.tensor(num_x_points - 2, device=x.device), cand_start_idx,
45
+ ),
46
+ )
47
+ y_positions_expanded = yp.unsqueeze(0).expand(x.shape[0], -1, -1)
48
+ start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
49
+ end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
50
+ cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
51
+ return cand
52
+
53
+
54
+ class NoiseScheduleVP:
55
+ def __init__(self, schedule='discrete', beta_0=1e-4, beta_1=2e-2, total_N=1000, betas=None, alphas_cumprod=None):
56
+ """Create a wrapper class for the forward SDE (VP type).
57
+
58
+ The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
59
+ We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
60
+ Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
61
+
62
+ log_alpha_t = self.marginal_log_mean_coeff(t)
63
+ sigma_t = self.marginal_std(t)
64
+ lambda_t = self.marginal_lambda(t)
65
+
66
+ Moreover, as lambda(t) is an invertible function, we also support its inverse function:
67
+
68
+ t = self.inverse_lambda(lambda_t)
69
+
70
+ ===============================================================
71
+
72
+ We support three types of VPSDEs: discrete (given betas or alphas_cumprod), linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
73
+ schedule are the default settings in DDPM and improved-DDPM:
74
+
75
+ beta_0: A `float` number. The smallest beta for the linear schedule.
76
+ beta_1: A `float` number. The largest beta for the linear schedule.
77
+ cosine_s: A `float` number. The hyperparameter in the cosine schedule.
78
+ cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
79
+ T: A `float` number. The ending time of the forward process.
80
+
81
+ Note that the original DDPM (linear schedule) used the discrete-time label (0 to 999). We convert the discrete-time
82
+ label to continuous time (following Song et al., 2021), so the beta here is 1000x larger than in DDPM.
83
+
84
+ ===============================================================
85
+
86
+ Args:
87
+ schedule: A `str`. The noise schedule of the forward SDE ('discrete', 'linear' or 'cosine').
88
+
89
+ Returns:
90
+ A wrapper object of the forward SDE (VP type).
91
+ """
92
+ if schedule not in ['linear', 'discrete', 'cosine']:
93
+ raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete', 'linear' or 'cosine'".format(schedule))
94
+ self.total_N = total_N
95
+ self.beta_0 = beta_0 * 1000.
96
+ self.beta_1 = beta_1 * 1000.
97
+
98
+ if schedule == 'discrete':
99
+ if betas is not None:
100
+ log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
101
+ else:
102
+ assert alphas_cumprod is not None
103
+ log_alphas = 0.5 * torch.log(alphas_cumprod)
104
+ self.total_N = len(log_alphas)
105
+ self.t_discrete = torch.linspace(1. / self.total_N, 1., self.total_N).reshape((1, -1))
106
+ self.log_alpha_discrete = log_alphas.reshape((1, -1))
107
+
108
+ self.cosine_s = 0.008
109
+ self.cosine_beta_max = 999.
110
+ self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
111
+ self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
112
+ self.schedule = schedule
113
+ if schedule == 'cosine':
114
+ # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
115
+ # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
116
+ self.T = 0.9946
117
+ else:
118
+ self.T = 1.
119
+
120
+ def marginal_log_mean_coeff(self, t):
121
+ """
122
+ Compute log(alpha_t) of a given continuous-time label t in [0, T].
123
+ """
124
+ if self.schedule == 'linear':
125
+ return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
126
+ elif self.schedule == 'discrete':
127
+ return interpolate_fn(t.reshape((-1, 1)), self.t_discrete.clone().to(t.device), self.log_alpha_discrete.clone().to(t.device)).reshape((-1,))
128
+ elif self.schedule == 'cosine':
129
+ log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
130
+ log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
131
+ return log_alpha_t
132
+ else:
133
+ raise ValueError("Unsupported noise schedule {}".format(self.schedule))
134
+
135
+ def marginal_alpha(self, t):
136
+ return torch.exp(self.marginal_log_mean_coeff(t))
137
+
138
+ def marginal_std(self, t):
139
+ """
140
+ Compute sigma_t of a given continuous-time label t in [0, T].
141
+ """
142
+ return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
143
+
144
+ def marginal_lambda(self, t):
145
+ """
146
+ Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
147
+ """
148
+ log_mean_coeff = self.marginal_log_mean_coeff(t)
149
+ log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
150
+ return log_mean_coeff - log_std
151
+
152
+ def inverse_lambda(self, lamb):
153
+ """
154
+ Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
155
+ """
156
+ if self.schedule == 'linear':
157
+ tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
158
+ Delta = self.beta_0**2 + tmp
159
+ return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
160
+ elif self.schedule == 'discrete':
161
+ log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
162
+ t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_discrete.clone().to(lamb.device), [1]), torch.flip(self.t_discrete.clone().to(lamb.device), [1]))
163
+ return t.reshape((-1,))
164
+ else:
165
+ log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
166
+ t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
167
+ t = t_fn(log_alpha)
168
+ return t
169
+
170
+
171
+ def model_wrapper(model, noise_schedule=None, is_cond_classifier=False, classifier_fn=None, classifier_scale=1., time_input_type='1', total_N=1000, model_kwargs={}, is_deis=False):
172
+ """Create a wrapper function for the noise prediction model.
173
+
174
+ DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
175
+ first wrap the model function into a function that accepts continuous time as the input.
176
+
177
+ The input `model` has the following format:
178
+
179
+ ``
180
+ model(x, t_input, **model_kwargs) -> noise
181
+ ``
182
+
183
+ where `x` and `noise` have the same shape, and `t_input` is the time label of the model.
184
+ (may be discrete-time labels (i.e. 0 to 999) or continuous-time labels (i.e. epsilon to T).)
185
+
186
+ We wrap the model function to the following format:
187
+
188
+ ``
189
+ def model_fn(x, t_continuous) -> noise:
190
+ t_input = get_model_input_time(t_continuous)
191
+ return model(x, t_input, **model_kwargs)
192
+ ``
193
+
194
+ where `t_continuous` is the continuous-time label (i.e. epsilon to T), and we use `model_fn` for DPM-Solver.
195
+
196
+ For DPMs with classifier guidance, we also combine the model output with the classifier gradient as used in [1].
197
+
198
+ [1] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis," in Advances in Neural
199
+ Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
200
+
201
+ ===============================================================
202
+
203
+ Args:
204
+ model: A noise prediction model with the following format:
205
+ ``
206
+ def model(x, t_input, **model_kwargs):
207
+ return noise
208
+ ``
209
+ noise_schedule: A noise schedule object, such as NoiseScheduleVP. Only used for the classifier guidance.
210
+ is_cond_classifier: A `bool`. Whether to use the classifier guidance.
211
+ classifier_fn: A classifier function. Only used for the classifier guidance. The format is:
212
+ ``
213
+ def classifier_fn(x, t_input):
214
+ return logits
215
+ ``
216
+ classifier_scale: A `float`. The scale for the classifier guidance.
217
+ time_input_type: A `str`. The type for the time input of the model. We support three types:
218
+ - '0': The continuous-time type. In this case, the model is trained on the continuous time,
219
+ so `t_input` = `t_continuous`.
220
+ - '1': The Type-1 discrete type described in the Appendix of DPM-Solver paper.
221
+ **For discrete-time DPMs, we recommend to use this type for DPM-Solver**.
222
+ - '2': The Type-2 discrete type described in the Appendix of DPM-Solver paper.
223
+ total_N: An `int`. The total number of discrete time steps of the DPM (default is 1000), used when `time_input_type`
224
+ is '1' or '2'.
225
+ model_kwargs: A `dict`. A dict for the other inputs of the model function.
226
+ Returns:
227
+ A function that accepts the continuous time as the input, with the following format:
228
+ ``
229
+ def model_fn(x, t_continuous):
230
+ t_input = get_model_input_time(t_continuous)
231
+ return model(x, t_input, **model_kwargs)
232
+ ``
233
+ """
234
+ def get_model_input_time(t_continuous):
235
+ """
236
+ Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
237
+ """
238
+ if time_input_type == '0':
239
+ # time_input_type == '0' means that the model is a continuous-time model.
+ # For continuous-time DPMs, the model input time equals the continuous time.
241
+ return t_continuous
242
+ elif time_input_type == '1':
243
+ # Type-1 discrete label, as detailed in the Appendix of DPM-Solver.
244
+ return 1000. * torch.max(t_continuous - 1. / total_N, torch.zeros_like(t_continuous).to(t_continuous))
245
+ elif time_input_type == '2':
246
+ # Type-2 discrete label, as detailed in the Appendix of DPM-Solver.
247
+ max_N = (total_N - 1) / total_N * 1000.
248
+ return max_N * t_continuous
249
+ else:
250
+ raise ValueError("Unsupported time input type {}, must be '0' or '1' or '2'".format(time_input_type))
251
+
252
+ def cond_fn(x, t_discrete, y):
253
+ """
254
+ Compute the gradient of the classifier, multiplied by the scale of the classifier guidance.
255
+ """
256
+ assert y is not None
257
+ with torch.enable_grad():
258
+ x_in = x.detach().requires_grad_(True)
259
+ logits = classifier_fn(x_in, t_discrete)
260
+ log_probs = F.log_softmax(logits, dim=-1)
261
+ selected = log_probs[range(len(logits)), y.view(-1)]
262
+ return classifier_scale * torch.autograd.grad(selected.sum(), x_in)[0]
263
+
264
+ def model_fn(x, t_continuous):
265
+ """
266
+ The noise prediction model function that is used for DPM-Solver.
267
+ """
268
+ if t_continuous.reshape((-1,)).shape[0] == 1:
269
+ t_continuous = torch.ones((x.shape[0],)).to(x.device) * t_continuous
270
+ if is_cond_classifier:
271
+ y = model_kwargs.get("y", None)
272
+ if y is None:
273
+ raise ValueError("For classifier guidance, the label y has to be in the input.")
274
+ t_discrete = get_model_input_time(t_continuous)
275
+ noise_uncond = model(x, t_discrete, **model_kwargs)
276
+ cond_grad = cond_fn(x, t_discrete, y)
277
+ if is_deis:
278
+ sigma_t = noise_schedule.marginal_std(t_continuous / 1000.)
279
+ else:
280
+ sigma_t = noise_schedule.marginal_std(t_continuous)
281
+ dims = len(cond_grad.shape) - 1
282
+ return noise_uncond - sigma_t[(...,) + (None,)*dims] * cond_grad
283
+ else:
284
+ t_discrete = get_model_input_time(t_continuous)
285
+ return model(x, t_discrete, **model_kwargs)
286
+
287
+ return model_fn
288
+
289
+
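A minimal usage sketch for the wrapper above, assuming a hypothetical discrete-time noise-prediction network `my_unet` trained with 1000 DDPM steps (the network and the beta schedule below are placeholders, not part of this commit):

import torch

betas = torch.linspace(1e-4, 2e-2, 1000)                        # assumed linear DDPM betas
noise_schedule = NoiseScheduleVP(schedule='discrete', betas=betas)

def my_unet(x, t_input):                                        # placeholder epsilon model
    return torch.zeros_like(x)

# Wrap the discrete-time model into a continuous-time noise prediction function.
model_fn = model_wrapper(my_unet, noise_schedule=noise_schedule,
                         time_input_type='1', total_N=1000)

x = torch.randn(4, 3, 32, 32)
t_continuous = torch.full((4,), 0.5)
eps_pred = model_fn(x, t_continuous)                            # same shape as x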
290
+ class DPM_Solver:
291
+ def __init__(self, model_fn, noise_schedule, predict_x0=False, thresholding=False, max_val=1.):
292
+ """Construct a DPM-Solver.
293
+
294
+ Args:
295
+ model_fn: A noise prediction model function which accepts the continuous-time input
296
+ (t in [epsilon, T]):
297
+ ``
298
+ def model_fn(x, t_continuous):
299
+ return noise
300
+ ``
301
+ noise_schedule: A noise schedule object, such as NoiseScheduleVP.
302
+ """
303
+ self.model = model_fn
304
+ self.noise_schedule = noise_schedule
305
+ self.predict_x0 = predict_x0
306
+ self.thresholding = thresholding
307
+ self.max_val = max_val
308
+
309
+ def model_fn(self, x, t):
310
+ if self.predict_x0:
311
+ alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t)
312
+ noise = self.model(x, t)
313
+ dims = len(x.shape) - 1
314
+ x0 = (x - sigma_t[(...,) + (None,)*dims] * noise) / alpha_t[(...,) + (None,)*dims]
315
+ if self.thresholding:
316
+ p = 0.995
317
+ s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
318
+ s = torch.maximum(s, torch.ones_like(s).to(s.device))[(...,) + (None,)*dims]
319
+ x0 = torch.clamp(x0, -s, s) / (s / self.max_val)
320
+ return x0
321
+ else:
322
+ return self.model(x, t)
323
+
324
+ def get_time_steps(self, skip_type, t_T, t_0, N, device):
325
+ """Compute the intermediate time steps for sampling.
326
+
327
+ Args:
328
+ skip_type: A `str`. The type for the spacing of the time steps. We support three types:
329
+ - 'logSNR': uniform logSNR for the time steps, **recommended for DPM-Solver**.
330
+ - 'time_uniform': uniform time for the time steps. (Used in DDIM and DDPM.)
331
+ - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
332
+ t_T: A `float`. The starting time of the sampling (default is T).
333
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
334
+ N: An `int`. The number of time-step intervals (the returned tensor has N + 1 points).
335
+ device: A torch device.
336
+ Returns:
337
+ A pytorch tensor of the time steps, with the shape (N + 1,).
338
+ """
339
+ if skip_type == 'logSNR':
340
+ lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
341
+ lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
342
+ logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
343
+ # print(torch.min(torch.abs(logSNR_steps - self.noise_schedule.marginal_lambda(self.noise_schedule.inverse_lambda(logSNR_steps)))).item())
344
+ return self.noise_schedule.inverse_lambda(logSNR_steps)
345
+ elif skip_type == 't2':
346
+ t_order = 2
347
+ t = torch.linspace(t_T**(1. / t_order), t_0**(1. / t_order), N + 1).pow(t_order).to(device)
348
+ return t
349
+ elif skip_type == 'time_uniform':
350
+ return torch.linspace(t_T, t_0, N + 1).to(device)
351
+ elif skip_type == 'time_quadratic':
352
+ t = torch.linspace(t_0, t_T, 10000000).to(device)
353
+ quadratic_t = torch.sqrt(t)
354
+ quadratic_steps = torch.linspace(quadratic_t[0], quadratic_t[-1], N + 1).to(device)
355
+ return torch.flip(torch.cat([t[torch.searchsorted(quadratic_t, quadratic_steps)[:-1]], t_T * torch.ones((1,)).to(device)], dim=0), dims=[0])
356
+ else:
357
+ raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
358
+
359
+ def get_time_steps_for_dpm_solver_fast(self, skip_type, t_T, t_0, steps, order, device):
360
+ """
361
+ Compute the intermediate time steps and the order of each step for sampling by DPM-Solver-fast.
362
+
363
+ We recommend DPM-Solver-fast for fast sampling of DPMs. Given a fixed number of function evaluations by `steps`,
364
+ the sampling procedure by DPM-Solver-fast is:
365
+ - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
366
+ - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
367
+ - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
368
+ - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
369
+
370
+ ============================================
371
+ Args:
372
+ t_T: A `float`. The starting time of the sampling (default is T).
373
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
374
+ steps: A `int`. The total number of function evaluations (NFE).
375
+ device: A torch device.
376
+ Returns:
377
+ orders: A list of the solver order of each step.
378
+ timesteps: A pytorch tensor of the time steps, with the shape of (K + 1,).
379
+ """
380
+ if order == 3:
381
+ K = steps // 3 + 1
382
+ if steps % 3 == 0:
383
+ orders = [3,] * (K - 2) + [2, 1]
384
+ elif steps % 3 == 1:
385
+ orders = [3,] * (K - 1) + [1]
386
+ else:
387
+ orders = [3,] * (K - 1) + [2]
388
+ timesteps = self.get_time_steps(skip_type, t_T, t_0, K, device)
389
+ return orders, timesteps
390
+ elif order == 2:
391
+ K = steps // 2
392
+ if steps % 2 == 0:
393
+ orders = [2,] * K
394
+ else:
395
+ orders = [2,] * K + [1]
396
+ timesteps = self.get_time_steps(skip_type, t_T, t_0, K, device)
397
+ return orders, timesteps
398
+ else:
399
+ raise ValueError("order must be 2 or 3, got {}".format(order))
400
+
401
+ def denoise_fn(self, x, s, noise_s=None):
402
+ ns = self.noise_schedule
403
+ dims = len(x.shape) - 1
404
+ log_alpha_s = ns.marginal_log_mean_coeff(s)
405
+ sigma_s = ns.marginal_std(s)
406
+
407
+ if noise_s is None:
408
+ noise_s = self.model_fn(x, s)
409
+ x_0 = (
410
+ (x - sigma_s[(...,) + (None,)*dims] * noise_s) / torch.exp(log_alpha_s)[(...,) + (None,)*dims]
411
+ )
412
+ return x_0
413
+
414
+ def dpm_solver_first_update(self, x, s, t, noise_s=None, return_noise=False):
415
+ """
416
+ A single step for DPM-Solver-1.
417
+
418
+ Args:
419
+ x: A pytorch tensor. The initial value at time `s`.
420
+ s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
421
+ t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
422
+ return_noise: A `bool`. If true, also return the predicted noise at time `s`.
423
+ Returns:
424
+ x_t: A pytorch tensor. The approximated solution at time `t`.
425
+ """
426
+ ns = self.noise_schedule
427
+ dims = len(x.shape) - 1
428
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
429
+ h = lambda_t - lambda_s
430
+ log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t)
431
+ sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t)
432
+ alpha_t = torch.exp(log_alpha_t)
433
+
434
+ if self.predict_x0:
435
+ phi_1 = (torch.exp(-h) - 1.) / (-1.)
436
+ if noise_s is None:
437
+ noise_s = self.model_fn(x, s)
438
+ x_t = (
439
+ (sigma_t / sigma_s)[(...,) + (None,)*dims] * x
440
+ + (alpha_t * phi_1)[(...,) + (None,)*dims] * noise_s
441
+ )
442
+ if return_noise:
443
+ return x_t, {'noise_s': noise_s}
444
+ else:
445
+ return x_t
446
+ else:
447
+ phi_1 = torch.expm1(h)
448
+ if noise_s is None:
449
+ noise_s = self.model_fn(x, s)
450
+ x_t = (
451
+ torch.exp(log_alpha_t - log_alpha_s)[(...,) + (None,)*dims] * x
452
+ - (sigma_t * phi_1)[(...,) + (None,)*dims] * noise_s
453
+ )
454
+ if return_noise:
455
+ return x_t, {'noise_s': noise_s}
456
+ else:
457
+ return x_t
458
+
459
+ def dpm_solver_second_update(self, x, s, t, r1=0.5, noise_s=None, return_noise=False, solver_type='dpm_solver'):
460
+ """
461
+ A single step for DPM-Solver-2.
462
+
463
+ Args:
464
+ x: A pytorch tensor. The initial value at time `s`.
465
+ s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
466
+ t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
467
+ r1: A `float`. The hyperparameter of the second-order solver. We recommend the default setting `0.5`.
468
+ noise_s: A pytorch tensor. The predicted noise at time `s`.
469
+ If `noise_s` is None, we compute the predicted noise by `x` and `s`; otherwise we directly use it.
470
+ return_noise: A `bool`. If true, also return the predicted noise at time `s` and `s1` (the intermediate time).
471
+ Returns:
472
+ x_t: A pytorch tensor. The approximated solution at time `t`.
473
+ """
474
+ if r1 is None:
475
+ r1 = 0.5
476
+ ns = self.noise_schedule
477
+ dims = len(x.shape) - 1
478
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
479
+ h = lambda_t - lambda_s
480
+ lambda_s1 = lambda_s + r1 * h
481
+ s1 = ns.inverse_lambda(lambda_s1)
482
+ log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(t)
483
+ sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t)
484
+ alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t)
485
+
486
+ if self.predict_x0:
487
+ phi_11 = torch.expm1(-r1 * h)
488
+ phi_1 = torch.expm1(-h)
489
+
490
+ if noise_s is None:
491
+ noise_s = self.model_fn(x, s)
492
+ x_s1 = (
493
+ (sigma_s1 / sigma_s)[(...,) + (None,)*dims] * x
494
+ - (alpha_s1 * phi_11)[(...,) + (None,)*dims] * noise_s
495
+ )
496
+ noise_s1 = self.model_fn(x_s1, s1)
497
+ if solver_type == 'dpm_solver':
498
+ x_t = (
499
+ (sigma_t / sigma_s)[(...,) + (None,)*dims] * x
500
+ - (alpha_t * phi_1)[(...,) + (None,)*dims] * noise_s
501
+ - (0.5 / r1) * (alpha_t * phi_1)[(...,) + (None,)*dims] * (noise_s1 - noise_s)
502
+ )
503
+ elif solver_type == 'taylor':
504
+ x_t = (
505
+ (sigma_t / sigma_s)[(...,) + (None,)*dims] * x
506
+ - (alpha_t * phi_1)[(...,) + (None,)*dims] * noise_s
507
+ + (1. / r1) * (alpha_t * ((torch.exp(-h) - 1.) / h + 1.))[(...,) + (None,)*dims] * (noise_s1 - noise_s)
508
+ )
509
+ else:
510
+ raise ValueError("solver_type must be either dpm_solver or taylor, got {}".format(solver_type))
511
+ else:
512
+ phi_11 = torch.expm1(r1 * h)
513
+ phi_1 = torch.expm1(h)
514
+
515
+ if noise_s is None:
516
+ noise_s = self.model_fn(x, s)
517
+ x_s1 = (
518
+ torch.exp(log_alpha_s1 - log_alpha_s)[(...,) + (None,)*dims] * x
519
+ - (sigma_s1 * phi_11)[(...,) + (None,)*dims] * noise_s
520
+ )
521
+ noise_s1 = self.model_fn(x_s1, s1)
522
+ if solver_type == 'dpm_solver':
523
+ x_t = (
524
+ torch.exp(log_alpha_t - log_alpha_s)[(...,) + (None,)*dims] * x
525
+ - (sigma_t * phi_1)[(...,) + (None,)*dims] * noise_s
526
+ - (0.5 / r1) * (sigma_t * phi_1)[(...,) + (None,)*dims] * (noise_s1 - noise_s)
527
+ )
528
+ elif solver_type == 'taylor':
529
+ x_t = (
530
+ torch.exp(log_alpha_t - log_alpha_s)[(...,) + (None,)*dims] * x
531
+ - (sigma_t * phi_1)[(...,) + (None,)*dims] * noise_s
532
+ - (1. / r1) * (sigma_t * ((torch.exp(h) - 1.) / h - 1.))[(...,) + (None,)*dims] * (noise_s1 - noise_s)
533
+ )
534
+ else:
535
+ raise ValueError("solver_type must be either dpm_solver or taylor, got {}".format(solver_type))
536
+ if return_noise:
537
+ return x_t, {'noise_s': noise_s, 'noise_s1': noise_s1}
538
+ else:
539
+ return x_t
540
+
541
+
542
+ def dpm_multistep_second_update(self, x, noise_prev_list, t_prev_list, t, solver_type="dpm_solver"):
543
+ ns = self.noise_schedule
544
+ dims = len(x.shape) - 1
545
+ noise_prev_1, noise_prev_0 = noise_prev_list
546
+ t_prev_1, t_prev_0 = t_prev_list
547
+ lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
548
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
549
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
550
+ alpha_t = torch.exp(log_alpha_t)
551
+
552
+ h_0 = lambda_prev_0 - lambda_prev_1
553
+ h = lambda_t - lambda_prev_0
554
+ r0 = h_0 / h
555
+ D1_0 = (1. / r0)[(...,) + (None,)*dims] * (noise_prev_0 - noise_prev_1)
556
+ if self.predict_x0:
557
+ if solver_type == 'taylor':
558
+ x_t = (
559
+ (sigma_t / sigma_prev_0)[(...,) + (None,)*dims] * x
560
+ - (alpha_t * (torch.exp(-h) - 1.))[(...,) + (None,)*dims] * noise_prev_0
561
+ + (alpha_t * ((torch.exp(-h) - 1.) / h + 1.))[(...,) + (None,)*dims] * D1_0
562
+ )
563
+ elif solver_type == 'dpm_solver':
564
+ x_t = (
565
+ (sigma_t / sigma_prev_0)[(...,) + (None,)*dims] * x
566
+ - (alpha_t * (torch.exp(-h) - 1.))[(...,) + (None,)*dims] * noise_prev_0
567
+ - 0.5 * (alpha_t * (torch.exp(-h) - 1.))[(...,) + (None,)*dims] * D1_0
568
+ )
569
+ else:
570
+ if solver_type == 'taylor':
571
+ x_t = (
572
+ torch.exp(log_alpha_t - log_alpha_prev_0)[(...,) + (None,)*dims] * x
573
+ - (sigma_t * (torch.exp(h) - 1.))[(...,) + (None,)*dims] * noise_prev_0
574
+ - (sigma_t * ((torch.exp(h) - 1.) / h - 1.))[(...,) + (None,)*dims] * D1_0
575
+ )
576
+ elif solver_type == 'dpm_solver':
577
+ x_t = (
578
+ torch.exp(log_alpha_t - log_alpha_prev_0)[(...,) + (None,)*dims] * x
579
+ - (sigma_t * (torch.exp(h) - 1.))[(...,) + (None,)*dims] * noise_prev_0
580
+ - 0.5 * (sigma_t * (torch.exp(h) - 1.))[(...,) + (None,)*dims] * D1_0
581
+ )
582
+ return x_t
583
+
584
+
585
+ def dpm_multistep_third_update(self, x, noise_prev_list, t_prev_list, t, solver_type='dpm_solver'):
586
+ ns = self.noise_schedule
587
+ dims = len(x.shape) - 1
588
+ noise_prev_2, noise_prev_1, noise_prev_0 = noise_prev_list
589
+ t_prev_2, t_prev_1, t_prev_0 = t_prev_list
590
+ lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda(t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t)
591
+ log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
592
+ sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
593
+ alpha_t = torch.exp(log_alpha_t)
594
+
595
+ h_1 = lambda_prev_1 - lambda_prev_2
596
+ h_0 = lambda_prev_0 - lambda_prev_1
597
+ h = lambda_t - lambda_prev_0
598
+ r0, r1 = h_0 / h, h_1 / h
599
+ D1_0 = (1. / r0)[(...,) + (None,)*dims] * (noise_prev_0 - noise_prev_1)
600
+ D1_1 = (1. / r1)[(...,) + (None,)*dims] * (noise_prev_1 - noise_prev_2)
601
+ D1 = D1_0 + (r0 / (r0 + r1))[(...,) + (None,)*dims] * (D1_0 - D1_1)
602
+ D2 = (1. / (r0 + r1))[(...,) + (None,)*dims] * (D1_0 - D1_1)
603
+ if self.predict_x0:
604
+ x_t = (
605
+ (sigma_t / sigma_prev_0)[(...,) + (None,)*dims] * x
606
+ - (alpha_t * (torch.exp(-h) - 1.))[(...,) + (None,)*dims] * noise_prev_0
607
+ + (alpha_t * ((torch.exp(-h) - 1.) / h + 1.))[(...,) + (None,)*dims] * D1
608
+ - (alpha_t * ((torch.exp(-h) - 1. + h) / h**2 - 0.5))[(...,) + (None,)*dims] * D2
609
+ )
610
+ else:
611
+ x_t = (
612
+ torch.exp(log_alpha_t - log_alpha_prev_0)[(...,) + (None,)*dims] * x
613
+ - (sigma_t * (torch.exp(h) - 1.))[(...,) + (None,)*dims] * noise_prev_0
614
+ - (sigma_t * ((torch.exp(h) - 1.) / h - 1.))[(...,) + (None,)*dims] * D1
615
+ - (sigma_t * ((torch.exp(h) - 1. - h) / h**2 - 0.5))[(...,) + (None,)*dims] * D2
616
+ )
617
+ return x_t
618
+
619
+ def dpm_solver_third_update(self, x, s, t, r1=1./3., r2=2./3., noise_s=None, noise_s1=None, noise_s2=None, return_noise=False, solver_type='dpm_solver'):
620
+ """
621
+ A single step for DPM-Solver-3.
622
+
623
+ Args:
624
+ x: A pytorch tensor. The initial value at time `s`.
625
+ s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
626
+ t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
627
+ r1: A `float`. The hyperparameter of the third-order solver. We recommend the default setting `1 / 3`.
628
+ r2: A `float`. The hyperparameter of the third-order solver. We recommend the default setting `2 / 3`.
629
+ noise_s: A pytorch tensor. The predicted noise at time `s`.
630
+ If `noise_s` is None, we compute the predicted noise by `x` and `s`; otherwise we directly use it.
631
+ noise_s1: A pytorch tensor. The predicted noise at time `s1` (the intermediate time given by `r1`).
632
+ If `noise_s1` is None, we compute the predicted noise by `s1`; otherwise we directly use it.
633
+ Returns:
634
+ x_t: A pytorch tensor. The approximated solution at time `t`.
635
+ """
636
+ if r1 is None:
637
+ r1 = 1. / 3.
638
+ if r2 is None:
639
+ r2 = 2. / 3.
640
+ ns = self.noise_schedule
641
+ dims = len(x.shape) - 1
642
+ lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t)
643
+ h = lambda_t - lambda_s
644
+ lambda_s1 = lambda_s + r1 * h
645
+ lambda_s2 = lambda_s + r2 * h
646
+ s1 = ns.inverse_lambda(lambda_s1)
647
+ s2 = ns.inverse_lambda(lambda_s2)
648
+ log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t)
649
+ sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(s2), ns.marginal_std(t)
650
+ alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t)
651
+
652
+ if self.predict_x0:
653
+ phi_11 = torch.expm1(-r1 * h)
654
+ phi_12 = torch.expm1(-r2 * h)
655
+ phi_1 = torch.expm1(-h)
656
+ phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1.
657
+ phi_2 = phi_1 / h + 1.
658
+ phi_3 = phi_2 / h - 0.5
659
+
660
+ if noise_s is None:
661
+ noise_s = self.model_fn(x, s)
662
+ if noise_s1 is None:
663
+ x_s1 = (
664
+ (sigma_s1 / sigma_s)[(...,) + (None,)*dims] * x
665
+ - (alpha_s1 * phi_11)[(...,) + (None,)*dims] * noise_s
666
+ )
667
+ noise_s1 = self.model_fn(x_s1, s1)
668
+ if noise_s2 is None:
669
+ x_s2 = (
670
+ (sigma_s2 / sigma_s)[(...,) + (None,)*dims] * x
671
+ - (alpha_s2 * phi_12)[(...,) + (None,)*dims] * noise_s
672
+ + r2 / r1 * (alpha_s2 * phi_22)[(...,) + (None,)*dims] * (noise_s1 - noise_s)
673
+ )
674
+ noise_s2 = self.model_fn(x_s2, s2)
675
+ if solver_type == 'dpm_solver':
676
+ x_t = (
677
+ (sigma_t / sigma_s)[(...,) + (None,)*dims] * x
678
+ - (alpha_t * phi_1)[(...,) + (None,)*dims] * noise_s
679
+ + (1. / r2) * (alpha_t * phi_2)[(...,) + (None,)*dims] * (noise_s2 - noise_s)
680
+ )
681
+ elif solver_type == 'taylor':
682
+ D1_0 = (1. / r1) * (noise_s1 - noise_s)
683
+ D1_1 = (1. / r2) * (noise_s2 - noise_s)
684
+ D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
685
+ D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
686
+ x_t = (
687
+ (sigma_t / sigma_s)[(...,) + (None,)*dims] * x
688
+ - (alpha_t * phi_1)[(...,) + (None,)*dims] * noise_s
689
+ + (alpha_t * phi_2)[(...,) + (None,)*dims] * D1
690
+ - (alpha_t * phi_3)[(...,) + (None,)*dims] * D2
691
+ )
692
+ else:
693
+ raise ValueError("solver_type must be either dpm_solver or taylor, got {}".format(solver_type))
694
+ else:
695
+ phi_11 = torch.expm1(r1 * h)
696
+ phi_12 = torch.expm1(r2 * h)
697
+ phi_1 = torch.expm1(h)
698
+ phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1.
699
+ phi_2 = phi_1 / h - 1.
700
+ phi_3 = phi_2 / h - 0.5
701
+
702
+ if noise_s is None:
703
+ noise_s = self.model_fn(x, s)
704
+ if noise_s1 is None:
705
+ x_s1 = (
706
+ torch.exp(log_alpha_s1 - log_alpha_s)[(...,) + (None,)*dims] * x
707
+ - (sigma_s1 * phi_11)[(...,) + (None,)*dims] * noise_s
708
+ )
709
+ noise_s1 = self.model_fn(x_s1, s1)
710
+ if noise_s2 is None:
711
+ x_s2 = (
712
+ torch.exp(log_alpha_s2 - log_alpha_s)[(...,) + (None,)*dims] * x
713
+ - (sigma_s2 * phi_12)[(...,) + (None,)*dims] * noise_s
714
+ - r2 / r1 * (sigma_s2 * phi_22)[(...,) + (None,)*dims] * (noise_s1 - noise_s)
715
+ )
716
+ noise_s2 = self.model_fn(x_s2, s2)
717
+ if solver_type == 'dpm_solver':
718
+ x_t = (
719
+ torch.exp(log_alpha_t - log_alpha_s)[(...,) + (None,)*dims] * x
720
+ - (sigma_t * phi_1)[(...,) + (None,)*dims] * noise_s
721
+ - (1. / r2) * (sigma_t * phi_2)[(...,) + (None,)*dims] * (noise_s2 - noise_s)
722
+ )
723
+ elif solver_type == 'taylor':
724
+ D1_0 = (1. / r1) * (noise_s1 - noise_s)
725
+ D1_1 = (1. / r2) * (noise_s2 - noise_s)
726
+ D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1)
727
+ D2 = 2. * (D1_1 - D1_0) / (r2 - r1)
728
+ x_t = (
729
+ torch.exp(log_alpha_t - log_alpha_s)[(...,) + (None,)*dims] * x
730
+ - (sigma_t * phi_1)[(...,) + (None,)*dims] * noise_s
731
+ - (sigma_t * phi_2)[(...,) + (None,)*dims] * D1
732
+ - (sigma_t * phi_3)[(...,) + (None,)*dims] * D2
733
+ )
734
+ else:
735
+ raise ValueError("solver_type must be either dpm_solver or taylor, got {}".format(solver_type))
736
+
737
+ if return_noise:
738
+ return x_t, {'noise_s': noise_s, 'noise_s1': noise_s1, 'noise_s2': noise_s2}
739
+ else:
740
+ return x_t
741
+
742
+ def dpm_solver_update(self, x, s, t, order, return_noise=False, solver_type='dpm_solver', r1=None, r2=None):
743
+ """
744
+ A single step for DPM-Solver of the given order `order`.
745
+
746
+ Args:
747
+ x: A pytorch tensor. The initial value at time `s`.
748
+ s: A pytorch tensor. The starting time, with the shape (x.shape[0],).
749
+ t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
750
+ order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
751
+ Returns:
752
+ x_t: A pytorch tensor. The approximated solution at time `t`.
753
+ """
754
+ if order == 1:
755
+ return self.dpm_solver_first_update(x, s, t, return_noise=return_noise)
756
+ elif order == 2:
757
+ return self.dpm_solver_second_update(x, s, t, return_noise=return_noise, solver_type=solver_type, r1=r1)
758
+ elif order == 3:
759
+ return self.dpm_solver_third_update(x, s, t, return_noise=return_noise, solver_type=solver_type, r1=r1, r2=r2)
760
+ else:
761
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
762
+
763
+ def dpm_multistep_update(self, x, noise_prev_list, t_prev_list, t, order, solver_type='taylor'):
764
+ """
765
+ A single multistep step for DPM-Solver of the given order `order`.
+
+ Args:
+ x: A pytorch tensor. The initial value at time `t_prev_list[-1]`.
+ noise_prev_list: A list of pytorch tensors. The previous model outputs (the most recent is last).
+ t_prev_list: A list of pytorch tensors. The previous time steps (the most recent is last), each with the shape (x.shape[0],).
+ t: A pytorch tensor. The ending time, with the shape (x.shape[0],).
+ order: An `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3.
772
+ Returns:
773
+ x_t: A pytorch tensor. The approximated solution at time `t`.
774
+ """
775
+ if order == 1:
776
+ return self.dpm_solver_first_update(x, t_prev_list[-1], t, noise_s=noise_prev_list[-1])
777
+ elif order == 2:
778
+ return self.dpm_multistep_second_update(x, noise_prev_list, t_prev_list, t, solver_type=solver_type)
779
+ elif order == 3:
780
+ return self.dpm_multistep_third_update(x, noise_prev_list, t_prev_list, t, solver_type=solver_type)
781
+ else:
782
+ raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order))
783
+
784
+ def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, solver_type='dpm_solver'):
785
+ """
786
+ The adaptive step size solver based on DPM-Solver.
787
+
788
+ Args:
789
+ x: A pytorch tensor. The initial value at time `t_T`.
790
+ order: A `int`. The (higher) order of the solver. We only support order == 2 or 3.
791
+ t_T: A `float`. The starting time of the sampling (default is T).
792
+ t_0: A `float`. The ending time of the sampling (default is epsilon).
793
+ h_init: A `float`. The initial step size (for logSNR).
794
+ atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, following [1].
795
+ rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05.
796
+ theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, following [1].
797
+ t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the
798
+ current time and `t_0` is less than `t_err`. The default setting is 1e-5.
799
+ Returns:
800
+ x_0: A pytorch tensor. The approximated solution at time `t_0`.
801
+
802
+ [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
803
+ """
804
+ ns = self.noise_schedule
805
+ s = t_T * torch.ones((x.shape[0],)).to(x)
806
+ lambda_s = ns.marginal_lambda(s)
807
+ lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x))
808
+ h = h_init * torch.ones_like(s).to(x)
809
+ x_prev = x
810
+ nfe = 0
811
+ if order == 2:
812
+ r1 = 0.5
813
+ lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_noise=True)
814
+ higher_update = lambda x, s, t, **kwargs: self.dpm_solver_second_update(x, s, t, r1=r1, solver_type=solver_type, **kwargs)
815
+ elif order == 3:
816
+ r1, r2 = 1. / 3., 2. / 3.
817
+ lower_update = lambda x, s, t: self.dpm_solver_second_update(x, s, t, r1=r1, return_noise=True, solver_type=solver_type)
818
+ higher_update = lambda x, s, t, **kwargs: self.dpm_solver_third_update(x, s, t, r1=r1, r2=r2, solver_type=solver_type, **kwargs)
819
+ else:
820
+ raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order))
821
+ while torch.abs((s - t_0)).mean() > t_err:
822
+ t = ns.inverse_lambda(lambda_s + h)
823
+ x_lower, lower_noise_kwargs = lower_update(x, s, t)
824
+ x_higher = higher_update(x, s, t, **lower_noise_kwargs)
825
+ delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev)))
826
+ norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True))
827
+ E = norm_fn((x_higher - x_lower) / delta).max()
828
+ if torch.all(E <= 1.):
829
+ x = x_higher
830
+ s = t
831
+ x_prev = x_lower
832
+ lambda_s = ns.marginal_lambda(s)
833
+ h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s)
834
+ nfe += order
835
+ print('adaptive solver nfe', nfe)
836
+ return x
837
+
838
+ def sample(self, x, steps=10, eps=1e-4, T=None, order=3, skip_type='time_uniform',
839
+ denoise=False, method='fast', solver_type='dpm_solver', atol=0.0078,
840
+ rtol=0.05, timesteps=None,
841
+ ):
842
+ """
843
+ Compute the sample at time `eps` by DPM-Solver, given the initial `x` at time `T`.
844
+
845
+ We support the following algorithms:
846
+
847
+ - Adaptive step size DPM-Solver (i.e. DPM-Solver-12 and DPM-Solver-23)
848
+
849
+ - Fixed order DPM-Solver (i.e. DPM-Solver-1, DPM-Solver-2 and DPM-Solver-3).
850
+
851
+ - Fast version of DPM-Solver (i.e. DPM-Solver-fast), which combines
852
+ different orders of DPM-Solver.
853
+
854
+ **We recommend DPM-Solver-fast for both fast sampling in few steps (<=20) and fast convergence in many steps (50 to 100).**
855
+
856
+ Choosing the algorithms:
857
+
858
+ - If `method` == 'adaptive':
859
+ We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`.
860
+ If `order`=2, we use DPM-Solver-12 which combines DPM-Solver-1 and DPM-Solver-2.
861
+ If `order`=3, we use DPM-Solver-23 which combines DPM-Solver-2 and DPM-Solver-3.
862
+ You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computation costs
863
+ (NFE) and the sample quality.
864
+
865
+ - If `method` == 'fast':
866
+ We use DPM-Solver-fast with number of function evaluations (NFE) = `steps` and maximum order `order`.
867
+ We use `skip_type` to determine the spacing of the time steps for DPM-Solver-fast.
868
+ Given a fixed NFE=`steps`, the sampling procedure by DPM-Solver-fast is:
869
+ - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling.
870
+ - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1.
871
+ - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1.
872
+ - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2.
873
+
874
+ - If `method` == 'singlestep':
875
+ We use DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE.
876
+ We support three types of `skip_type`:
877
+ - 'logSNR': uniform logSNR for the time steps, **recommended for DPM-Solver**.
878
+ - 'time_uniform': uniform time for the time steps. (Used in DDIM and DDPM.)
879
+ - 'time_quadratic': quadratic time for the time steps. (Used in DDIM.)
880
+
881
+ =====================================================
882
+ Args:
883
+ x: A pytorch tensor. The initial value at time `T` (a sample from the normal distribution).
884
+ steps: A `int`. The total number of function evaluations (NFE).
885
+ eps: A `float`. The ending time of the sampling.
886
+ We recommend `eps`=1e-3 when `steps` <= 15; and `eps`=1e-4 when `steps` > 15.
887
+ T: A `float`. The starting time of the sampling. Default is `None`.
888
+ If `T` is None, we use self.noise_schedule.T.
889
+ order: A `int`. The order of DPM-Solver.
890
+ skip_type: A `str`. The type for the spacing of the time steps. Default is 'time_uniform'.
891
+ method: A `str`. The sampling method: 'adaptive', 'fast' (recommended), 'multistep', or 'singlestep'.
893
+ atol: A `float`. The absolute tolerance of the adaptive step size solver.
894
+ rtol: A `float`. The relative tolerance of the adaptive step size solver.
895
+ Returns:
896
+ x_0: A pytorch tensor. The approximated solution at time `t_0`.
897
+
898
+ [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021.
899
+ """
900
+ t_0 = eps
901
+ t_T = self.noise_schedule.T if T is None else T
902
+ device = x.device
903
+ if method == 'adaptive':
904
+ with torch.no_grad():
905
+ x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, solver_type=solver_type)
906
+ elif method == 'multistep':
907
+ assert steps >= order
908
+ if timesteps is None:
909
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
910
+ assert timesteps.shape[0] - 1 == steps
911
+ with torch.no_grad():
912
+ vec_t = timesteps[0].expand((x.shape[0]))
913
+ noise_prev_list = [self.model_fn(x, vec_t)]
914
+ t_prev_list = [vec_t]
915
+ for init_order in range(1, order):
916
+ vec_t = timesteps[init_order].expand(x.shape[0])
917
+ x = self.dpm_multistep_update(x, noise_prev_list, t_prev_list, vec_t, init_order, solver_type=solver_type)
918
+ noise_prev_list.append(self.model_fn(x, vec_t))
919
+ t_prev_list.append(vec_t)
920
+ for step in range(order, steps + 1):
921
+ vec_t = timesteps[step].expand(x.shape[0])
922
+ x = self.dpm_multistep_update(x, noise_prev_list, t_prev_list, vec_t, order, solver_type=solver_type)
923
+ for i in range(order - 1):
924
+ t_prev_list[i] = t_prev_list[i + 1]
925
+ noise_prev_list[i] = noise_prev_list[i + 1]
926
+ t_prev_list[-1] = vec_t
927
+ if step < steps:
928
+ noise_prev_list[-1] = self.model_fn(x, vec_t)
929
+ elif method == 'fast':
930
+ orders, _ = self.get_time_steps_for_dpm_solver_fast(skip_type=skip_type, t_T=t_T, t_0=t_0, steps=steps, order=order, device=device)
931
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
932
+ with torch.no_grad():
933
+ i = 0
934
+ for order in orders:
935
+ vec_s, vec_t = torch.ones((x.shape[0],)).to(device) * timesteps[i], torch.ones((x.shape[0],)).to(device) * timesteps[i + order]
936
+ h = self.noise_schedule.marginal_lambda(timesteps[i + order]) - self.noise_schedule.marginal_lambda(timesteps[i])
937
+ r1 = None if order <= 1 else (self.noise_schedule.marginal_lambda(timesteps[i + 1]) - self.noise_schedule.marginal_lambda(timesteps[i])) / h
938
+ r2 = None if order <= 2 else (self.noise_schedule.marginal_lambda(timesteps[i + 2]) - self.noise_schedule.marginal_lambda(timesteps[i])) / h
939
+ x = self.dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type, r1=r1, r2=r2)
940
+ i += order
941
+ elif method == 'singlestep':
942
+ N_steps = steps // order
943
+ orders = [order,] * N_steps
944
+ timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=N_steps, device=device)
945
+ assert len(timesteps) - 1 == N_steps
946
+ with torch.no_grad():
947
+ for i, order in enumerate(orders):
948
+ vec_s, vec_t = torch.ones((x.shape[0],)).to(device) * timesteps[i], torch.ones((x.shape[0],)).to(device) * timesteps[i + 1]
949
+ x = self.dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type)
950
+ if denoise:
951
+ x = self.denoise_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
952
+ return x
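A hedged end-to-end sketch of how the classes in this file might be combined; the epsilon model below is a stand-in and the hyperparameters are illustrative, not the UniDiffuser configuration:

import torch

betas = torch.linspace(1e-4, 2e-2, 1000)
ns = NoiseScheduleVP(schedule='discrete', betas=betas)

def dummy_eps_model(x, t_input):                    # placeholder noise-prediction network
    return torch.randn_like(x)

model_fn = model_wrapper(dummy_eps_model, noise_schedule=ns,
                         time_input_type='1', total_N=1000)
solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False)

x_T = torch.randn(2, 3, 32, 32)                     # Gaussian noise at time T
x_0 = solver.sample(x_T, steps=20, eps=1e-4, order=3,
                    skip_type='time_uniform', method='fast',
                    solver_type='dpm_solver')       # approximate sample at time eps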
libs/autoencoder.py ADDED
@@ -0,0 +1,519 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from einops import rearrange
5
+
6
+
7
+ class LinearAttention(nn.Module):
8
+ def __init__(self, dim, heads=4, dim_head=32):
9
+ super().__init__()
10
+ self.heads = heads
11
+ hidden_dim = dim_head * heads
12
+ self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias = False)
13
+ self.to_out = nn.Conv2d(hidden_dim, dim, 1)
14
+
15
+ def forward(self, x):
16
+ b, c, h, w = x.shape
17
+ qkv = self.to_qkv(x)
18
+ q, k, v = rearrange(qkv, 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
19
+ k = k.softmax(dim=-1)
20
+ context = torch.einsum('bhdn,bhen->bhde', k, v)
21
+ out = torch.einsum('bhde,bhdn->bhen', context, q)
22
+ out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=h, w=w)
23
+ return self.to_out(out)
24
+
25
+
26
+ def nonlinearity(x):
27
+ # swish
28
+ return x*torch.sigmoid(x)
29
+
30
+
31
+ def Normalize(in_channels, num_groups=32):
32
+ return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
33
+
34
+
35
+ class Upsample(nn.Module):
36
+ def __init__(self, in_channels, with_conv):
37
+ super().__init__()
38
+ self.with_conv = with_conv
39
+ if self.with_conv:
40
+ self.conv = torch.nn.Conv2d(in_channels,
41
+ in_channels,
42
+ kernel_size=3,
43
+ stride=1,
44
+ padding=1)
45
+
46
+ def forward(self, x):
47
+ x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
48
+ if self.with_conv:
49
+ x = self.conv(x)
50
+ return x
51
+
52
+
53
+ class Downsample(nn.Module):
54
+ def __init__(self, in_channels, with_conv):
55
+ super().__init__()
56
+ self.with_conv = with_conv
57
+ if self.with_conv:
58
+ # no asymmetric padding in torch conv, must do it ourselves
59
+ self.conv = torch.nn.Conv2d(in_channels,
60
+ in_channels,
61
+ kernel_size=3,
62
+ stride=2,
63
+ padding=0)
64
+
65
+ def forward(self, x):
66
+ if self.with_conv:
67
+ pad = (0,1,0,1)
68
+ x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
69
+ x = self.conv(x)
70
+ else:
71
+ x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
72
+ return x
73
+
74
+
75
+ class ResnetBlock(nn.Module):
76
+ def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
77
+ dropout, temb_channels=512):
78
+ super().__init__()
79
+ self.in_channels = in_channels
80
+ out_channels = in_channels if out_channels is None else out_channels
81
+ self.out_channels = out_channels
82
+ self.use_conv_shortcut = conv_shortcut
83
+
84
+ self.norm1 = Normalize(in_channels)
85
+ self.conv1 = torch.nn.Conv2d(in_channels,
86
+ out_channels,
87
+ kernel_size=3,
88
+ stride=1,
89
+ padding=1)
90
+ if temb_channels > 0:
91
+ self.temb_proj = torch.nn.Linear(temb_channels,
92
+ out_channels)
93
+ self.norm2 = Normalize(out_channels)
94
+ self.dropout = torch.nn.Dropout(dropout)
95
+ self.conv2 = torch.nn.Conv2d(out_channels,
96
+ out_channels,
97
+ kernel_size=3,
98
+ stride=1,
99
+ padding=1)
100
+ if self.in_channels != self.out_channels:
101
+ if self.use_conv_shortcut:
102
+ self.conv_shortcut = torch.nn.Conv2d(in_channels,
103
+ out_channels,
104
+ kernel_size=3,
105
+ stride=1,
106
+ padding=1)
107
+ else:
108
+ self.nin_shortcut = torch.nn.Conv2d(in_channels,
109
+ out_channels,
110
+ kernel_size=1,
111
+ stride=1,
112
+ padding=0)
113
+
114
+ def forward(self, x, temb):
115
+ h = x
116
+ h = self.norm1(h)
117
+ h = nonlinearity(h)
118
+ h = self.conv1(h)
119
+
120
+ if temb is not None:
121
+ h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
122
+
123
+ h = self.norm2(h)
124
+ h = nonlinearity(h)
125
+ h = self.dropout(h)
126
+ h = self.conv2(h)
127
+
128
+ if self.in_channels != self.out_channels:
129
+ if self.use_conv_shortcut:
130
+ x = self.conv_shortcut(x)
131
+ else:
132
+ x = self.nin_shortcut(x)
133
+
134
+ return x+h
135
+
136
+
137
+ class LinAttnBlock(LinearAttention):
138
+ """to match AttnBlock usage"""
139
+ def __init__(self, in_channels):
140
+ super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
141
+
142
+
143
+ class AttnBlock(nn.Module):
144
+ def __init__(self, in_channels):
145
+ super().__init__()
146
+ self.in_channels = in_channels
147
+
148
+ self.norm = Normalize(in_channels)
149
+ self.q = torch.nn.Conv2d(in_channels,
150
+ in_channels,
151
+ kernel_size=1,
152
+ stride=1,
153
+ padding=0)
154
+ self.k = torch.nn.Conv2d(in_channels,
155
+ in_channels,
156
+ kernel_size=1,
157
+ stride=1,
158
+ padding=0)
159
+ self.v = torch.nn.Conv2d(in_channels,
160
+ in_channels,
161
+ kernel_size=1,
162
+ stride=1,
163
+ padding=0)
164
+ self.proj_out = torch.nn.Conv2d(in_channels,
165
+ in_channels,
166
+ kernel_size=1,
167
+ stride=1,
168
+ padding=0)
169
+
170
+
171
+ def forward(self, x):
172
+ h_ = x
173
+ h_ = self.norm(h_)
174
+ q = self.q(h_)
175
+ k = self.k(h_)
176
+ v = self.v(h_)
177
+
178
+ # compute attention
179
+ b,c,h,w = q.shape
180
+ q = q.reshape(b,c,h*w)
181
+ q = q.permute(0,2,1) # b,hw,c
182
+ k = k.reshape(b,c,h*w) # b,c,hw
183
+ w_ = torch.bmm(q,k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
184
+ w_ = w_ * (int(c)**(-0.5))
185
+ w_ = torch.nn.functional.softmax(w_, dim=2)
186
+
187
+ # attend to values
188
+ v = v.reshape(b,c,h*w)
189
+ w_ = w_.permute(0,2,1) # b,hw,hw (first hw of k, second of q)
190
+ h_ = torch.bmm(v,w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
191
+ h_ = h_.reshape(b,c,h,w)
192
+
193
+ h_ = self.proj_out(h_)
194
+
195
+ return x+h_
196
+
197
+
198
+ def make_attn(in_channels, attn_type="vanilla"):
199
+ assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
200
+ print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
201
+ if attn_type == "vanilla":
202
+ return AttnBlock(in_channels)
203
+ elif attn_type == "none":
204
+ return nn.Identity(in_channels)
205
+ else:
206
+ return LinAttnBlock(in_channels)
207
+
208
+
209
+ class Encoder(nn.Module):
210
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
211
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
212
+ resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
213
+ **ignore_kwargs):
214
+ super().__init__()
215
+ if use_linear_attn: attn_type = "linear"
216
+ self.ch = ch
217
+ self.temb_ch = 0
218
+ self.num_resolutions = len(ch_mult)
219
+ self.num_res_blocks = num_res_blocks
220
+ self.resolution = resolution
221
+ self.in_channels = in_channels
222
+
223
+ # downsampling
224
+ self.conv_in = torch.nn.Conv2d(in_channels,
225
+ self.ch,
226
+ kernel_size=3,
227
+ stride=1,
228
+ padding=1)
229
+
230
+ curr_res = resolution
231
+ in_ch_mult = (1,)+tuple(ch_mult)
232
+ self.in_ch_mult = in_ch_mult
233
+ self.down = nn.ModuleList()
234
+ for i_level in range(self.num_resolutions):
235
+ block = nn.ModuleList()
236
+ attn = nn.ModuleList()
237
+ block_in = ch*in_ch_mult[i_level]
238
+ block_out = ch*ch_mult[i_level]
239
+ for i_block in range(self.num_res_blocks):
240
+ block.append(ResnetBlock(in_channels=block_in,
241
+ out_channels=block_out,
242
+ temb_channels=self.temb_ch,
243
+ dropout=dropout))
244
+ block_in = block_out
245
+ if curr_res in attn_resolutions:
246
+ attn.append(make_attn(block_in, attn_type=attn_type))
247
+ down = nn.Module()
248
+ down.block = block
249
+ down.attn = attn
250
+ if i_level != self.num_resolutions-1:
251
+ down.downsample = Downsample(block_in, resamp_with_conv)
252
+ curr_res = curr_res // 2
253
+ self.down.append(down)
254
+
255
+ # middle
256
+ self.mid = nn.Module()
257
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
258
+ out_channels=block_in,
259
+ temb_channels=self.temb_ch,
260
+ dropout=dropout)
261
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
262
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
263
+ out_channels=block_in,
264
+ temb_channels=self.temb_ch,
265
+ dropout=dropout)
266
+
267
+ # end
268
+ self.norm_out = Normalize(block_in)
269
+ self.conv_out = torch.nn.Conv2d(block_in,
270
+ 2*z_channels if double_z else z_channels,
271
+ kernel_size=3,
272
+ stride=1,
273
+ padding=1)
274
+
275
+ def forward(self, x):
276
+ # timestep embedding
277
+ temb = None
278
+
279
+ # downsampling
280
+ hs = [self.conv_in(x)]
281
+ for i_level in range(self.num_resolutions):
282
+ for i_block in range(self.num_res_blocks):
283
+ h = self.down[i_level].block[i_block](hs[-1], temb)
284
+ if len(self.down[i_level].attn) > 0:
285
+ h = self.down[i_level].attn[i_block](h)
286
+ hs.append(h)
287
+ if i_level != self.num_resolutions-1:
288
+ hs.append(self.down[i_level].downsample(hs[-1]))
289
+
290
+ # middle
291
+ h = hs[-1]
292
+ h = self.mid.block_1(h, temb)
293
+ h = self.mid.attn_1(h)
294
+ h = self.mid.block_2(h, temb)
295
+
296
+ # end
297
+ h = self.norm_out(h)
298
+ h = nonlinearity(h)
299
+ h = self.conv_out(h)
300
+ return h
301
+
302
+
303
+ class Decoder(nn.Module):
304
+ def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
305
+ attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
306
+ resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
307
+ attn_type="vanilla", **ignorekwargs):
308
+ super().__init__()
309
+ if use_linear_attn: attn_type = "linear"
310
+ self.ch = ch
311
+ self.temb_ch = 0
312
+ self.num_resolutions = len(ch_mult)
313
+ self.num_res_blocks = num_res_blocks
314
+ self.resolution = resolution
315
+ self.in_channels = in_channels
316
+ self.give_pre_end = give_pre_end
317
+ self.tanh_out = tanh_out
318
+
319
+ # compute in_ch_mult, block_in and curr_res at lowest res
320
+ in_ch_mult = (1,)+tuple(ch_mult)
321
+ block_in = ch*ch_mult[self.num_resolutions-1]
322
+ curr_res = resolution // 2**(self.num_resolutions-1)
323
+ self.z_shape = (1,z_channels,curr_res,curr_res)
324
+ print("Working with z of shape {} = {} dimensions.".format(
325
+ self.z_shape, np.prod(self.z_shape)))
326
+
327
+ # z to block_in
328
+ self.conv_in = torch.nn.Conv2d(z_channels,
329
+ block_in,
330
+ kernel_size=3,
331
+ stride=1,
332
+ padding=1)
333
+
334
+ # middle
335
+ self.mid = nn.Module()
336
+ self.mid.block_1 = ResnetBlock(in_channels=block_in,
337
+ out_channels=block_in,
338
+ temb_channels=self.temb_ch,
339
+ dropout=dropout)
340
+ self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
341
+ self.mid.block_2 = ResnetBlock(in_channels=block_in,
342
+ out_channels=block_in,
343
+ temb_channels=self.temb_ch,
344
+ dropout=dropout)
345
+
346
+ # upsampling
347
+ self.up = nn.ModuleList()
348
+ for i_level in reversed(range(self.num_resolutions)):
349
+ block = nn.ModuleList()
350
+ attn = nn.ModuleList()
351
+ block_out = ch*ch_mult[i_level]
352
+ for i_block in range(self.num_res_blocks+1):
353
+ block.append(ResnetBlock(in_channels=block_in,
354
+ out_channels=block_out,
355
+ temb_channels=self.temb_ch,
356
+ dropout=dropout))
357
+ block_in = block_out
358
+ if curr_res in attn_resolutions:
359
+ attn.append(make_attn(block_in, attn_type=attn_type))
360
+ up = nn.Module()
361
+ up.block = block
362
+ up.attn = attn
363
+ if i_level != 0:
364
+ up.upsample = Upsample(block_in, resamp_with_conv)
365
+ curr_res = curr_res * 2
366
+ self.up.insert(0, up) # prepend to get consistent order
367
+
368
+ # end
369
+ self.norm_out = Normalize(block_in)
370
+ self.conv_out = torch.nn.Conv2d(block_in,
371
+ out_ch,
372
+ kernel_size=3,
373
+ stride=1,
374
+ padding=1)
375
+
376
+ def forward(self, z):
377
+ #assert z.shape[1:] == self.z_shape[1:]
378
+ self.last_z_shape = z.shape
379
+
380
+ # timestep embedding
381
+ temb = None
382
+
383
+ # z to block_in
384
+ h = self.conv_in(z)
385
+
386
+ # middle
387
+ h = self.mid.block_1(h, temb)
388
+ h = self.mid.attn_1(h)
389
+ h = self.mid.block_2(h, temb)
390
+
391
+ # upsampling
392
+ for i_level in reversed(range(self.num_resolutions)):
393
+ for i_block in range(self.num_res_blocks+1):
394
+ h = self.up[i_level].block[i_block](h, temb)
395
+ if len(self.up[i_level].attn) > 0:
396
+ h = self.up[i_level].attn[i_block](h)
397
+ if i_level != 0:
398
+ h = self.up[i_level].upsample(h)
399
+
400
+ # end
401
+ if self.give_pre_end:
402
+ return h
403
+
404
+ h = self.norm_out(h)
405
+ h = nonlinearity(h)
406
+ h = self.conv_out(h)
407
+ if self.tanh_out:
408
+ h = torch.tanh(h)
409
+ return h
410
+
411
+
412
+ class FrozenAutoencoderKL(nn.Module):
413
+ def __init__(self, ddconfig, embed_dim, pretrained_path, scale_factor=0.18215):
414
+ super().__init__()
415
+ print(f'Create autoencoder with scale_factor={scale_factor}')
416
+ self.encoder = Encoder(**ddconfig)
417
+ self.decoder = Decoder(**ddconfig)
418
+ assert ddconfig["double_z"]
419
+ self.quant_conv = torch.nn.Conv2d(2 * ddconfig["z_channels"], 2 * embed_dim, 1)
420
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
421
+ self.embed_dim = embed_dim
422
+ self.scale_factor = scale_factor
423
+ m, u = self.load_state_dict(torch.load(pretrained_path, map_location='cpu'))
424
+ assert len(m) == 0 and len(u) == 0
425
+ self.eval()
426
+ self.requires_grad_(False)
427
+
428
+ def encode_moments(self, x):
429
+ h = self.encoder(x)
430
+ moments = self.quant_conv(h)
431
+ return moments
432
+
433
+ def sample(self, moments):
434
+ mean, logvar = torch.chunk(moments, 2, dim=1)
435
+ logvar = torch.clamp(logvar, -30.0, 20.0)
436
+ std = torch.exp(0.5 * logvar)
437
+ z = mean + std * torch.randn_like(mean)
438
+ z = self.scale_factor * z
439
+ return z
440
+
441
+ def encode(self, x):
442
+ moments = self.encode_moments(x)
443
+ z = self.sample(moments)
444
+ return z
445
+
446
+ def decode(self, z):
447
+ z = (1. / self.scale_factor) * z
448
+ z = self.post_quant_conv(z)
449
+ dec = self.decoder(z)
450
+ return dec
451
+
452
+ def forward(self, inputs, fn):
453
+ if fn == 'encode_moments':
454
+ return self.encode_moments(inputs)
455
+ elif fn == 'encode':
456
+ return self.encode(inputs)
457
+ elif fn == 'decode':
458
+ return self.decode(inputs)
459
+ else:
460
+ raise NotImplementedError
461
+
462
+
463
+ def get_model(pretrained_path, scale_factor=0.18215):
464
+ ddconfig = dict(
465
+ double_z=True,
466
+ z_channels=4,
467
+ resolution=256,
468
+ in_channels=3,
469
+ out_ch=3,
470
+ ch=128,
471
+ ch_mult=[1, 2, 4, 4],
472
+ num_res_blocks=2,
473
+ attn_resolutions=[],
474
+ dropout=0.0
475
+ )
476
+ return FrozenAutoencoderKL(ddconfig, 4, pretrained_path, scale_factor)
477
+
478
+
479
+ def main():
480
+ import torchvision.transforms as transforms
481
+ from torchvision.utils import save_image
482
+ import os
483
+ from PIL import Image
484
+
485
+ model = get_model('assets/stable-diffusion/autoencoder_kl.pth')
486
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
487
+ model = model.to(device)
488
+
489
+ scale_factor = 0.18215
490
+ T = transforms.Compose([transforms.Resize(256), transforms.CenterCrop(256), transforms.ToTensor()])
491
+ path = 'imgs'
492
+ fnames = os.listdir(path)
493
+ for fname in fnames:
494
+ p = os.path.join(path, fname)
495
+ img = Image.open(p)
496
+ img = T(img)
497
+ img = img * 2. - 1
498
+ img = img[None, ...]
499
+ img = img.to(device)
500
+
501
+ # with torch.cuda.amp.autocast():
502
+ # moments = model.encode_moments(img)
503
+ # mean, logvar = torch.chunk(moments, 2, dim=1)
504
+ # logvar = torch.clamp(logvar, -30.0, 20.0)
505
+ # std = torch.exp(0.5 * logvar)
506
+ # zs = [(mean + std * torch.randn_like(mean)) * scale_factor for _ in range(4)]
507
+ # recons = [model.decode(z) for z in zs]
508
+
509
+ with torch.cuda.amp.autocast():
510
+ print('test encode & decode')
511
+ recons = [model.decode(model.encode(img)) for _ in range(4)]
512
+
513
+ out = torch.cat([img, *recons], dim=0)
514
+ out = (out + 1) * 0.5
515
+ save_image(out, f'recons_{fname}')
516
+
517
+
518
+ if __name__ == "__main__":
519
+ main()
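
The main() above is a quick self-test of the frozen KL autoencoder. As a minimal, hedged usage sketch (the checkpoint path is the one hard-coded in main() above and must exist locally; shapes follow the ddconfig in get_model, where a 256x256 RGB image maps to a 4x32x32 latent):

import torch
from libs.autoencoder import get_model

# Sketch only: 'assets/stable-diffusion/autoencoder_kl.pth' is the path assumed by main() above.
model = get_model('assets/stable-diffusion/autoencoder_kl.pth')
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)

img = torch.rand(1, 3, 256, 256, device=device) * 2. - 1.  # dummy RGB image scaled to [-1, 1]
with torch.no_grad():
    z = model(img, fn='encode')   # sampled latent of shape [1, 4, 32, 32], already multiplied by scale_factor
    rec = model(z, fn='decode')   # reconstruction of shape [1, 3, 256, 256], roughly in [-1, 1]
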
libs/caption_decoder.py ADDED
@@ -0,0 +1,283 @@
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as nnf
6
+
7
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
8
+ from transformers import default_data_collator
9
+ from transformers import EarlyStoppingCallback
10
+
11
+ data_collator = default_data_collator
12
+ es = EarlyStoppingCallback(early_stopping_patience=5)
13
+ import json
14
+ import argparse
15
+ from typing import Union, Optional
16
+ from collections import OrderedDict
17
+
18
+
19
+ # %% model initial
20
+ class ClipCaptionModel(nn.Module):
21
+ """A GPT-2 caption decoder: encode_prefix/decode_prefix map CLIP text features to and from an optional low-dimensional bottleneck, and the GPT-2 LM head decodes the prefix into text.
22
+ """
23
+
24
+ def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
25
+ return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)
26
+
27
+ def forward(self, tokens: torch.Tensor, prefix: torch.Tensor, mask: Optional[torch.Tensor] = None,
28
+ labels: Optional[torch.Tensor] = None):
29
+ """
30
+ : param tokens: (Tensor) [N x max_seq_len] eg. [4 X 33]
31
+ : param prefix: (Tensor) [N x prefix_length x 768] eg. [4 x 77 x 768]
32
+ : param mask: (Tensor) [N x (prefix_length + max_seq_len) x 768] eg. [4 x 110 x768]
33
+
34
+ : attribute embedding_text: (Tensor) [N x max_seq_len x 768] eg. [4 x 33 x 768]
35
+ : attribute embedding_cat: (Tensor) [N x (prefix_length + max_seq_len) x 768] eg. [4 x 110 x 768]
36
+ """
37
+ embedding_text = self.gpt.transformer.wte(tokens)
38
+ hidden = self.encode_prefix(prefix)
39
+ prefix = self.decode_prefix(hidden)
40
+ embedding_cat = torch.cat((prefix, embedding_text), dim=1)
41
+
42
+ if labels is not None:
43
+ dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
44
+ labels = torch.cat((dummy_token, tokens), dim=1)
45
+ out = self.gpt(inputs_embeds=embedding_cat, labels=labels, attention_mask=mask)
46
+ if self.hidden_dim is not None:
47
+ return out, hidden
48
+ else:
49
+ return out
50
+
51
+ def encode_decode_prefix(self, prefix):
52
+ return self.decode_prefix(self.encode_prefix(prefix))
53
+
54
+ def __init__(self, prefix_length: int, hidden_dim=None):
55
+ super(ClipCaptionModel, self).__init__()
56
+ self.prefix_length = prefix_length
57
+ eos = '<|EOS|>'
58
+ special_tokens_dict = {'eos_token': eos}
59
+ base_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
60
+ base_tokenizer.add_special_tokens(special_tokens_dict)
61
+ self.gpt = GPT2LMHeadModel.from_pretrained('gpt2', eos_token_id=base_tokenizer.eos_token_id)
62
+ self.gpt.resize_token_embeddings(len(base_tokenizer))
63
+
64
+ self.hidden_dim = hidden_dim
65
+ self.encode_prefix = nn.Linear(768, hidden_dim) if hidden_dim is not None else nn.Identity()
66
+ self.decode_prefix = nn.Linear(hidden_dim, 768) if hidden_dim is not None else nn.Identity()
67
+
68
+
69
+
70
+
71
+ def load_model(config_path: str, epoch_or_latest: Union[str, int] = '_latest'):
72
+ with open(config_path) as f:
73
+ config = json.load(f)
74
+ parser = argparse.ArgumentParser()
75
+ parser.set_defaults(**config)
76
+ args = parser.parse_args()
77
+ if type(epoch_or_latest) is int:
78
+ epoch_or_latest = f"-{epoch_or_latest:03d}"
79
+ model_path = os.path.join(args.out_dir, f"{args.prefix}{epoch_or_latest}.pt")
80
+ model = ClipCaptionModel(args.prefix_length)
81
+ if os.path.isfile(model_path):
82
+ print(f"loading model from {model_path}")
83
+ model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
84
+ else:
85
+ print(f"{model_path} does not exist")
86
+ return model, parser
87
+
88
+
89
+ def generate_beam(
90
+ model,
91
+ tokenizer,
92
+ beam_size: int = 5,
93
+ prompt=None,
94
+ embed=None,
95
+ entry_length=67,
96
+ temperature=1.0,
97
+ stop_token: str = '<|EOS|>',
98
+ ):
99
+ model.eval()
100
+ stop_token_index = tokenizer.encode(stop_token)[0]
101
+ tokens = None
102
+ scores = None
103
+ device = next(model.parameters()).device
104
+ seq_lengths = torch.ones(beam_size, device=device)
105
+ is_stopped = torch.zeros(beam_size, device=device, dtype=torch.bool)
106
+ with torch.no_grad():
107
+ if embed is not None:
108
+ generated = embed
109
+ else:
110
+ if tokens is None:
111
+ tokens = torch.tensor(tokenizer.encode(prompt))
112
+ tokens = tokens.unsqueeze(0).to(device)
113
+ generated = model.gpt.transformer.wte(tokens)
114
+ # pbar = tqdm(range(entry_length))
115
+ # pbar.set_description("generating text ...")
116
+ for i in range(entry_length):
117
+ # print(generated.shape)
118
+ outputs = model.gpt(inputs_embeds=generated)
119
+ logits = outputs.logits
120
+ logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
121
+ logits = logits.softmax(-1).log()
122
+ if scores is None:
123
+ scores, next_tokens = logits.topk(beam_size, -1)
124
+ generated = generated.expand(beam_size, *generated.shape[1:])
125
+ next_tokens, scores = next_tokens.permute(1, 0), scores.squeeze(0)
126
+ if tokens is None:
127
+ tokens = next_tokens
128
+ else:
129
+ tokens = tokens.expand(beam_size, *tokens.shape[1:])
130
+ tokens = torch.cat((tokens, next_tokens), dim=1)
131
+ else:
132
+ logits[is_stopped] = -float(np.inf)
133
+ logits[is_stopped, 0] = 0
134
+ scores_sum = scores[:, None] + logits
135
+ seq_lengths[~is_stopped] += 1
136
+ scores_sum_average = scores_sum / seq_lengths[:, None]
137
+ scores_sum_average, next_tokens = scores_sum_average.view(-1).topk(
138
+ beam_size, -1
139
+ )
140
+ next_tokens_source = next_tokens // scores_sum.shape[1]
141
+ seq_lengths = seq_lengths[next_tokens_source]
142
+ next_tokens = next_tokens % scores_sum.shape[1]
143
+ next_tokens = next_tokens.unsqueeze(1)
144
+ tokens = tokens[next_tokens_source]
145
+ tokens = torch.cat((tokens, next_tokens), dim=1)
146
+ generated = generated[next_tokens_source]
147
+ scores = scores_sum_average * seq_lengths
148
+ is_stopped = is_stopped[next_tokens_source]
149
+ next_token_embed = model.gpt.transformer.wte(next_tokens.squeeze()).view(
150
+ generated.shape[0], 1, -1
151
+ )
152
+ generated = torch.cat((generated, next_token_embed), dim=1)
153
+ is_stopped = is_stopped + next_tokens.eq(stop_token_index).squeeze()
154
+ if is_stopped.all():
155
+ break
156
+ scores = scores / seq_lengths
157
+ output_list = tokens.cpu().numpy()
158
+ output_texts = [
159
+ tokenizer.decode(output[: int(length)], skip_special_tokens=True)
160
+ for output, length in zip(output_list, seq_lengths)
161
+ ]
162
+ order = scores.argsort(descending=True)
163
+ output_texts = [output_texts[i] for i in order]
164
+ model.train()
165
+ return output_texts
166
+
167
+
168
+ def generate2(
169
+ model,
170
+ tokenizer,
171
+ tokens=None,
172
+ prompt=None,
173
+ embed=None,
174
+ entry_count=1,
175
+ entry_length=67, # maximum number of words
176
+ top_p=0.8,
177
+ temperature=1.0,
178
+ stop_token: str = '<|EOS|>',
179
+ ):
180
+ model.eval()
181
+ generated_num = 0
182
+ generated_list = []
183
+ stop_token_index = tokenizer.encode(stop_token)[0]
184
+ filter_value = -float("Inf")
185
+ device = next(model.parameters()).device
186
+
187
+ with torch.no_grad():
188
+
189
+ for entry_idx in range(entry_count):
190
+ if embed is not None:
191
+ generated = embed
192
+ else:
193
+ if tokens is None:
194
+ tokens = torch.tensor(tokenizer.encode(prompt))
195
+ tokens = tokens.unsqueeze(0).to(device)
196
+
197
+ generated = model.gpt.transformer.wte(tokens)
198
+
199
+ for i in range(entry_length):
200
+
201
+ outputs = model.gpt(inputs_embeds=generated)
202
+ logits = outputs.logits
203
+ logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)
204
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
205
+ cumulative_probs = torch.cumsum(
206
+ nnf.softmax(sorted_logits, dim=-1), dim=-1
207
+ )
208
+ sorted_indices_to_remove = cumulative_probs > top_p
209
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
210
+ ..., :-1
211
+ ].clone()
212
+ sorted_indices_to_remove[..., 0] = 0
213
+
214
+ indices_to_remove = sorted_indices[sorted_indices_to_remove]
215
+ logits[:, indices_to_remove] = filter_value
216
+ next_token = torch.argmax(logits, -1).unsqueeze(0)
217
+ next_token_embed = model.gpt.transformer.wte(next_token)
218
+ if tokens is None:
219
+ tokens = next_token
220
+ else:
221
+ tokens = torch.cat((tokens, next_token), dim=1)
222
+ generated = torch.cat((generated, next_token_embed), dim=1)
223
+ if stop_token_index == next_token.item():
224
+ break
225
+
226
+ output_list = list(tokens.squeeze().cpu().numpy())
227
+ output_text = tokenizer.decode(output_list)
228
+ generated_list.append(output_text)
229
+
230
+ return generated_list[0]
231
+
232
+
233
+ class CaptionDecoder(object):
234
+ def __init__(self, device, pretrained_path, hidden_dim=-1):
235
+ if hidden_dim < 0:
236
+ hidden_dim = None
237
+ # tokenizer initialize
238
+ eos = '<|EOS|>'
239
+ special_tokens_dict = {'eos_token': eos}
240
+ self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
241
+ self.tokenizer.add_special_tokens(special_tokens_dict)
242
+
243
+ # model initialize
244
+ feature_length = 77
245
+ # modelFile = "assets/caption_decoder/coco_v2_latest.pt"
246
+ self.caption_model = ClipCaptionModel(feature_length, hidden_dim=hidden_dim)
247
+ # print("Load Model...")
248
+ ckpt = torch.load(pretrained_path, map_location='cpu')
249
+ state_dict = OrderedDict()
250
+ for k, v in ckpt.items():
251
+ new_k = k[7:]  # drop the first 7 characters of each key (e.g. a 'module.' prefix from DataParallel checkpoints)
252
+ state_dict[new_k] = v
253
+ mk, uk = self.caption_model.load_state_dict(state_dict, strict=False)
254
+ assert len(mk) == 0
255
+ assert all([name.startswith('clip') for name in uk])
256
+ self.caption_model.eval()
257
+ self.caption_model.to(device)
258
+ self.caption_model.requires_grad_(False)
259
+ self.device = device
260
+
261
+ def encode_prefix(self, features):
262
+ return self.caption_model.encode_prefix(features)
263
+
264
+ def generate_captions(self, features): # the low dimension representation of clip feature
265
+ """
266
+ generate captions given features
267
+ : param features : (tensor([B x L x D]))
268
+ : return generated_text: (list([L]))
269
+ """
270
+
271
+ # generate config
272
+ use_beam_search = True
273
+
274
+ features = torch.split(features, 1, dim=0)
275
+ generated_captions = []
276
+ with torch.no_grad():
277
+ for feature in features:
278
+ feature = self.caption_model.decode_prefix(feature.to(self.device)) # back to the clip feature
279
+ if use_beam_search:
280
+ generated_captions.append(generate_beam(self.caption_model, self.tokenizer, embed=feature)[0])
281
+ else:
282
+ generated_captions.append(generate2(self.caption_model, self.tokenizer, embed=feature))
283
+ return generated_captions
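
CaptionDecoder is the inference-time wrapper used by the sampler below: encode_prefix compresses 768-dim CLIP text features into the low-dimensional text latent, and generate_captions lifts that latent back with decode_prefix and decodes it with beam search. A minimal sketch (the checkpoint path is the one mentioned in the commented-out modelFile line above and is a placeholder; hidden_dim=64 is an assumption matching the low-dimensional text latent used by the sampler, not a value read from this file):

import torch
from libs.caption_decoder import CaptionDecoder

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Placeholder checkpoint path; hidden_dim=64 is assumed.
decoder = CaptionDecoder(device, pretrained_path='assets/caption_decoder/coco_v2_latest.pt', hidden_dim=64)

clip_text = torch.randn(2, 77, 768, device=device)  # e.g. FrozenCLIPEmbedder output
prefix = decoder.encode_prefix(clip_text)            # [2, 77, 64] low-dimensional text latent
captions = decoder.generate_captions(prefix)         # list of 2 generated strings
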
libs/clip.py ADDED
@@ -0,0 +1,38 @@
+ import torch.nn as nn
+ from transformers import CLIPTokenizer, CLIPTextModel
+
+
+ class AbstractEncoder(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def encode(self, *args, **kwargs):
+ raise NotImplementedError
+
+
+ class FrozenCLIPEmbedder(AbstractEncoder):
+ """Uses the CLIP transformer encoder for text (from Hugging Face)"""
+ def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77):
+ super().__init__()
+ self.tokenizer = CLIPTokenizer.from_pretrained(version)
+ self.transformer = CLIPTextModel.from_pretrained(version)
+ self.device = device
+ self.max_length = max_length
+ self.freeze()
+
+ def freeze(self):
+ self.transformer = self.transformer.eval()
+ for param in self.parameters():
+ param.requires_grad = False
+
+ def forward(self, text):
+ batch_encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, return_length=True,
+ return_overflowing_tokens=False, padding="max_length", return_tensors="pt")
+ tokens = batch_encoding["input_ids"].to(self.device)
+ outputs = self.transformer(input_ids=tokens)
+
+ z = outputs.last_hidden_state
+ return z
+
+ def encode(self, text):
+ return self(text)
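
FrozenCLIPEmbedder simply tokenizes a list of prompts to 77 tokens and returns the CLIP text transformer's last hidden state. A small sketch (note the constructor defaults to device="cuda"; pass device="cpu" on CPU-only machines, and call .to(device) yourself if the transformer should be moved):

from libs.clip import FrozenCLIPEmbedder

clip_text_model = FrozenCLIPEmbedder(device='cpu')  # weights are frozen in __init__ via freeze()
features = clip_text_model.encode(['a photo of a cat', 'an oil painting of a castle'])
print(features.shape)  # torch.Size([2, 77, 768])
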
libs/timm.py ADDED
@@ -0,0 +1,112 @@
1
+ # code from timm 0.3.2
2
+ import torch
3
+ import torch.nn as nn
4
+ import math
5
+ import warnings
6
+
7
+
8
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
9
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
10
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
11
+ def norm_cdf(x):
12
+ # Computes standard normal cumulative distribution function
13
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
14
+
15
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
16
+ warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
17
+ "The distribution of values may be incorrect.",
18
+ stacklevel=2)
19
+
20
+ with torch.no_grad():
21
+ # Values are generated by using a truncated uniform distribution and
22
+ # then using the inverse CDF for the normal distribution.
23
+ # Get upper and lower cdf values
24
+ l = norm_cdf((a - mean) / std)
25
+ u = norm_cdf((b - mean) / std)
26
+
27
+ # Uniformly fill tensor with values from [l, u], then translate to
28
+ # [2l-1, 2u-1].
29
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
30
+
31
+ # Use inverse cdf transform for normal distribution to get truncated
32
+ # standard normal
33
+ tensor.erfinv_()
34
+
35
+ # Transform to proper mean, std
36
+ tensor.mul_(std * math.sqrt(2.))
37
+ tensor.add_(mean)
38
+
39
+ # Clamp to ensure it's in the proper range
40
+ tensor.clamp_(min=a, max=b)
41
+ return tensor
42
+
43
+
44
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
45
+ # type: (Tensor, float, float, float, float) -> Tensor
46
+ r"""Fills the input Tensor with values drawn from a truncated
47
+ normal distribution. The values are effectively drawn from the
48
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
49
+ with values outside :math:`[a, b]` redrawn until they are within
50
+ the bounds. The method used for generating the random values works
51
+ best when :math:`a \leq \text{mean} \leq b`.
52
+ Args:
53
+ tensor: an n-dimensional `torch.Tensor`
54
+ mean: the mean of the normal distribution
55
+ std: the standard deviation of the normal distribution
56
+ a: the minimum cutoff value
57
+ b: the maximum cutoff value
58
+ Examples:
59
+ >>> w = torch.empty(3, 5)
60
+ >>> nn.init.trunc_normal_(w)
61
+ """
62
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
63
+
64
+
65
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
66
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
67
+
68
+ This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
69
+ the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
70
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
71
+ changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
72
+ 'survival rate' as the argument.
73
+
74
+ """
75
+ if drop_prob == 0. or not training:
76
+ return x
77
+ keep_prob = 1 - drop_prob
78
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
79
+ random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
80
+ random_tensor.floor_() # binarize
81
+ output = x.div(keep_prob) * random_tensor
82
+ return output
83
+
84
+
85
+ class DropPath(nn.Module):
86
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
87
+ """
88
+ def __init__(self, drop_prob=None):
89
+ super(DropPath, self).__init__()
90
+ self.drop_prob = drop_prob
91
+
92
+ def forward(self, x):
93
+ return drop_path(x, self.drop_prob, self.training)
94
+
95
+
96
+ class Mlp(nn.Module):
97
+ def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
98
+ super().__init__()
99
+ out_features = out_features or in_features
100
+ hidden_features = hidden_features or in_features
101
+ self.fc1 = nn.Linear(in_features, hidden_features)
102
+ self.act = act_layer()
103
+ self.fc2 = nn.Linear(hidden_features, out_features)
104
+ self.drop = nn.Dropout(drop)
105
+
106
+ def forward(self, x):
107
+ x = self.fc1(x)
108
+ x = self.act(x)
109
+ x = self.drop(x)
110
+ x = self.fc2(x)
111
+ x = self.drop(x)
112
+ return x
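
These three helpers vendored from timm 0.3.2 (trunc_normal_, DropPath, Mlp) are the only pieces the U-ViT definitions below depend on. A short sketch of their behaviour, with arbitrary shapes:

import torch
from libs.timm import trunc_normal_, DropPath, Mlp

w = trunc_normal_(torch.empty(3, 5), std=0.02)  # in-place init, values truncated to the default [a, b] = [-2, 2]

mlp = Mlp(in_features=768, hidden_features=3072)
x = torch.randn(4, 16, 768)
y = mlp(x)                  # same shape as x: [4, 16, 768]

drop = DropPath(drop_prob=0.1)
drop.train()                # DropPath is the identity in eval mode or when drop_prob == 0
out = x + drop(y)           # stochastic-depth residual update
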
libs/uvit_multi_post_ln.py ADDED
@@ -0,0 +1,277 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+ from .timm import trunc_normal_, DropPath, Mlp
5
+ import einops
6
+ import torch.utils.checkpoint
7
+ import torch.nn.functional as F
8
+
9
+ if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
10
+ ATTENTION_MODE = 'flash'
11
+ else:
12
+ try:
13
+ import xformers
14
+ import xformers.ops
15
+ ATTENTION_MODE = 'xformers'
16
+ except ImportError:
17
+ ATTENTION_MODE = 'math'
18
+ print(f'attention mode is {ATTENTION_MODE}')
19
+
20
+
21
+ def timestep_embedding(timesteps, dim, max_period=10000):
22
+ """
23
+ Create sinusoidal timestep embeddings.
24
+
25
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
26
+ These may be fractional.
27
+ :param dim: the dimension of the output.
28
+ :param max_period: controls the minimum frequency of the embeddings.
29
+ :return: an [N x dim] Tensor of positional embeddings.
30
+ """
31
+ half = dim // 2
32
+ freqs = torch.exp(
33
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
34
+ ).to(device=timesteps.device)
35
+ args = timesteps[:, None].float() * freqs[None]
36
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
37
+ if dim % 2:
38
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
39
+ return embedding
40
+
41
+
42
+ def patchify(imgs, patch_size):
43
+ x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
44
+ return x
45
+
46
+
47
+ def unpatchify(x, in_chans):
48
+ patch_size = int((x.shape[2] // in_chans) ** 0.5)
49
+ h = w = int(x.shape[1] ** .5)
50
+ assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2]
51
+ x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size)
52
+ return x
53
+
54
+
55
+ def interpolate_pos_emb(pos_emb, old_shape, new_shape):
56
+ pos_emb = einops.rearrange(pos_emb, 'B (H W) C -> B C H W', H=old_shape[0], W=old_shape[1])
57
+ pos_emb = F.interpolate(pos_emb, new_shape, mode='bilinear')
58
+ pos_emb = einops.rearrange(pos_emb, 'B C H W -> B (H W) C')
59
+ return pos_emb
60
+
61
+
62
+ class Attention(nn.Module):
63
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
64
+ super().__init__()
65
+ self.num_heads = num_heads
66
+ head_dim = dim // num_heads
67
+ self.scale = qk_scale or head_dim ** -0.5
68
+
69
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
70
+ self.attn_drop = nn.Dropout(attn_drop)
71
+ self.proj = nn.Linear(dim, dim)
72
+ self.proj_drop = nn.Dropout(proj_drop)
73
+
74
+ def forward(self, x):
75
+ B, L, C = x.shape
76
+
77
+ qkv = self.qkv(x)
78
+ if ATTENTION_MODE == 'flash':
79
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads).float()
80
+ q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
81
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
82
+ x = einops.rearrange(x, 'B H L D -> B L (H D)')
83
+ elif ATTENTION_MODE == 'xformers':
84
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads)
85
+ q, k, v = qkv[0], qkv[1], qkv[2] # B L H D
86
+ x = xformers.ops.memory_efficient_attention(q, k, v)
87
+ x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads)
88
+ elif ATTENTION_MODE == 'math':
89
+ with torch.amp.autocast(device_type='cuda', enabled=False):
90
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads).float()
91
+ q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
92
+ attn = (q @ k.transpose(-2, -1)) * self.scale
93
+ attn = attn.softmax(dim=-1)
94
+ attn = self.attn_drop(attn)
95
+ x = (attn @ v).transpose(1, 2).reshape(B, L, C)
96
+ else:
97
+ raise NotImplementedError
98
+
99
+ x = self.proj(x)
100
+ x = self.proj_drop(x)
101
+ return x
102
+
103
+
104
+ class Block(nn.Module):
105
+
106
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
107
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False):
108
+ super().__init__()
109
+ self.norm1 = norm_layer(dim) if skip else None
110
+ self.norm2 = norm_layer(dim)
111
+
112
+ self.attn = Attention(
113
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
114
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
115
+ self.norm3 = norm_layer(dim)
116
+ mlp_hidden_dim = int(dim * mlp_ratio)
117
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
118
+ self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
119
+ self.use_checkpoint = use_checkpoint
120
+
121
+ def forward(self, x, skip=None):
122
+ if self.use_checkpoint:
123
+ return torch.utils.checkpoint.checkpoint(self._forward, x, skip)
124
+ else:
125
+ return self._forward(x, skip)
126
+
127
+ def _forward(self, x, skip=None):
128
+ if self.skip_linear is not None:
129
+ x = self.skip_linear(torch.cat([x, skip], dim=-1))
130
+ x = self.norm1(x)
131
+ x = x + self.drop_path(self.attn(x))
132
+ x = self.norm2(x)
133
+
134
+ x = x + self.drop_path(self.mlp(x))
135
+ x = self.norm3(x)
136
+
137
+ return x
138
+
139
+
140
+ class PatchEmbed(nn.Module):
141
+ """ Image to Patch Embedding
142
+ """
143
+ def __init__(self, patch_size, in_chans=3, embed_dim=768):
144
+ super().__init__()
145
+ self.patch_size = patch_size
146
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
147
+
148
+ def forward(self, x):
149
+ B, C, H, W = x.shape
150
+ assert H % self.patch_size == 0 and W % self.patch_size == 0
151
+ x = self.proj(x).flatten(2).transpose(1, 2)
152
+ return x
153
+
154
+
155
+ class UViT(nn.Module):
156
+ def __init__(self, img_size, in_chans, patch_size, embed_dim=768, depth=12,
157
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, pos_drop_rate=0., drop_rate=0., attn_drop_rate=0.,
158
+ norm_layer=nn.LayerNorm, mlp_time_embed=False, use_checkpoint=False,
159
+ text_dim=None, num_text_tokens=None, clip_img_dim=None):
160
+ super().__init__()
161
+ self.in_chans = in_chans
162
+ self.patch_size = patch_size
163
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
164
+
165
+ self.patch_embed = PatchEmbed(patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
166
+ self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size # the default img size
167
+ assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0
168
+ self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size)
169
+
170
+ self.time_img_embed = nn.Sequential(
171
+ nn.Linear(embed_dim, 4 * embed_dim),
172
+ nn.SiLU(),
173
+ nn.Linear(4 * embed_dim, embed_dim),
174
+ ) if mlp_time_embed else nn.Identity()
175
+
176
+ self.time_text_embed = nn.Sequential(
177
+ nn.Linear(embed_dim, 4 * embed_dim),
178
+ nn.SiLU(),
179
+ nn.Linear(4 * embed_dim, embed_dim),
180
+ ) if mlp_time_embed else nn.Identity()
181
+
182
+ self.text_embed = nn.Linear(text_dim, embed_dim)
183
+ self.text_out = nn.Linear(embed_dim, text_dim)
184
+
185
+ self.clip_img_embed = nn.Linear(clip_img_dim, embed_dim)
186
+ self.clip_img_out = nn.Linear(embed_dim, clip_img_dim)
187
+
188
+ self.num_text_tokens = num_text_tokens
189
+ self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches
190
+
191
+ self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim))
192
+ self.pos_drop = nn.Dropout(p=pos_drop_rate)
193
+
194
+ self.in_blocks = nn.ModuleList([
195
+ Block(
196
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
197
+ drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, use_checkpoint=use_checkpoint)
198
+ for _ in range(depth // 2)])
199
+
200
+ self.mid_block = Block(
201
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
202
+ drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, use_checkpoint=use_checkpoint)
203
+
204
+ self.out_blocks = nn.ModuleList([
205
+ Block(
206
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
207
+ drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, skip=True, use_checkpoint=use_checkpoint)
208
+ for _ in range(depth // 2)])
209
+
210
+ self.norm = norm_layer(embed_dim)
211
+ self.patch_dim = patch_size ** 2 * in_chans
212
+ self.decoder_pred = nn.Linear(embed_dim, self.patch_dim, bias=True)
213
+
214
+ trunc_normal_(self.pos_embed, std=.02)
215
+ self.apply(self._init_weights)
216
+
217
+ def _init_weights(self, m):
218
+ if isinstance(m, nn.Linear):
219
+ trunc_normal_(m.weight, std=.02)
220
+ if isinstance(m, nn.Linear) and m.bias is not None:
221
+ nn.init.constant_(m.bias, 0)
222
+ elif isinstance(m, nn.LayerNorm):
223
+ nn.init.constant_(m.bias, 0)
224
+ nn.init.constant_(m.weight, 1.0)
225
+
226
+ @torch.jit.ignore
227
+ def no_weight_decay(self):
228
+ return {'pos_embed'}
229
+
230
+ def forward(self, img, clip_img, text, t_img, t_text):
231
+ _, _, H, W = img.shape
232
+
233
+ img = self.patch_embed(img)
234
+
235
+ t_img_token = self.time_img_embed(timestep_embedding(t_img, self.embed_dim))
236
+ t_img_token = t_img_token.unsqueeze(dim=1)
237
+ t_text_token = self.time_text_embed(timestep_embedding(t_text, self.embed_dim))
238
+ t_text_token = t_text_token.unsqueeze(dim=1)
239
+
240
+ text = self.text_embed(text)
241
+ clip_img = self.clip_img_embed(clip_img)
242
+ x = torch.cat((t_img_token, t_text_token, text, clip_img, img), dim=1)
243
+
244
+ num_text_tokens, num_img_tokens = text.size(1), img.size(1)
245
+
246
+ if H == self.img_size[0] and W == self.img_size[1]:
247
+ pos_embed = self.pos_embed
248
+ else: # interpolate the positional embedding when the input image is not of the default shape
249
+ pos_embed_others, pos_embed_patches = torch.split(self.pos_embed, [1 + 1 + num_text_tokens + 1, self.num_patches], dim=1)
250
+ pos_embed_patches = interpolate_pos_emb(pos_embed_patches, (self.img_size[0] // self.patch_size, self.img_size[1] // self.patch_size),
251
+ (H // self.patch_size, W // self.patch_size))
252
+ pos_embed = torch.cat((pos_embed_others, pos_embed_patches), dim=1)
253
+
254
+ x = x + pos_embed
255
+ x = self.pos_drop(x)
256
+
257
+ skips = []
258
+ for blk in self.in_blocks:
259
+ x = blk(x)
260
+ skips.append(x)
261
+
262
+ x = self.mid_block(x)
263
+
264
+ for blk in self.out_blocks:
265
+ x = blk(x, skips.pop())
266
+
267
+ x = self.norm(x)
268
+
269
+ t_img_token_out, t_text_token_out, text_out, clip_img_out, img_out = x.split((1, 1, num_text_tokens, 1, num_img_tokens), dim=1)
270
+
271
+ img_out = self.decoder_pred(img_out)
272
+ img_out = unpatchify(img_out, self.in_chans)
273
+
274
+ clip_img_out = self.clip_img_out(clip_img_out)
275
+
276
+ text_out = self.text_out(text_out)
277
+ return img_out, clip_img_out, text_out
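
Inside this UViT the token sequence is laid out as [t_img, t_text, text tokens, clip_img, image patches], i.e. num_tokens = 1 + 1 + num_text_tokens + 1 + num_patches, and the forward pass returns one denoising output per modality. A minimal instantiation sketch; the sizes below (64x64x4 latents, 77 text tokens of width 64, a 512-dim CLIP image feature, a small depth) are illustrative assumptions, not values taken from a config in this upload:

import torch
from libs.uvit_multi_post_ln import UViT

nnet = UViT(img_size=64, in_chans=4, patch_size=2, embed_dim=512, depth=4, num_heads=8,
            mlp_time_embed=True, text_dim=64, num_text_tokens=77, clip_img_dim=512)

img = torch.randn(2, 4, 64, 64)      # VAE latent
clip_img = torch.randn(2, 1, 512)    # CLIP image feature
text = torch.randn(2, 77, 64)        # low-dimensional text latent
t_img = torch.randint(0, 1000, (2,))
t_text = torch.randint(0, 1000, (2,))

img_out, clip_img_out, text_out = nnet(img, clip_img, text, t_img, t_text)
# img_out: [2, 4, 64, 64], clip_img_out: [2, 1, 512], text_out: [2, 77, 64]
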
libs/uvit_multi_post_ln_v1.py ADDED
@@ -0,0 +1,285 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+ from .timm import trunc_normal_, DropPath, Mlp
5
+ import einops
6
+ import torch.utils.checkpoint
7
+ import torch.nn.functional as F
8
+
9
+ if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
10
+ ATTENTION_MODE = 'flash'
11
+ else:
12
+ try:
13
+ import xformers
14
+ import xformers.ops
15
+ ATTENTION_MODE = 'xformers'
16
+ except ImportError:
17
+ ATTENTION_MODE = 'math'
18
+ print(f'attention mode is {ATTENTION_MODE}')
19
+
20
+
21
+ def timestep_embedding(timesteps, dim, max_period=10000):
22
+ """
23
+ Create sinusoidal timestep embeddings.
24
+
25
+ :param timesteps: a 1-D Tensor of N indices, one per batch element.
26
+ These may be fractional.
27
+ :param dim: the dimension of the output.
28
+ :param max_period: controls the minimum frequency of the embeddings.
29
+ :return: an [N x dim] Tensor of positional embeddings.
30
+ """
31
+ half = dim // 2
32
+ freqs = torch.exp(
33
+ -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
34
+ ).to(device=timesteps.device)
35
+ args = timesteps[:, None].float() * freqs[None]
36
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
37
+ if dim % 2:
38
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
39
+ return embedding
40
+
41
+
42
+ def patchify(imgs, patch_size):
43
+ x = einops.rearrange(imgs, 'B C (h p1) (w p2) -> B (h w) (p1 p2 C)', p1=patch_size, p2=patch_size)
44
+ return x
45
+
46
+
47
+ def unpatchify(x, in_chans):
48
+ patch_size = int((x.shape[2] // in_chans) ** 0.5)
49
+ h = w = int(x.shape[1] ** .5)
50
+ assert h * w == x.shape[1] and patch_size ** 2 * in_chans == x.shape[2]
51
+ x = einops.rearrange(x, 'B (h w) (p1 p2 C) -> B C (h p1) (w p2)', h=h, p1=patch_size, p2=patch_size)
52
+ return x
53
+
54
+
55
+ def interpolate_pos_emb(pos_emb, old_shape, new_shape):
56
+ pos_emb = einops.rearrange(pos_emb, 'B (H W) C -> B C H W', H=old_shape[0], W=old_shape[1])
57
+ pos_emb = F.interpolate(pos_emb, new_shape, mode='bilinear')
58
+ pos_emb = einops.rearrange(pos_emb, 'B C H W -> B (H W) C')
59
+ return pos_emb
60
+
61
+
62
+ class Attention(nn.Module):
63
+ def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
64
+ super().__init__()
65
+ self.num_heads = num_heads
66
+ head_dim = dim // num_heads
67
+ self.scale = qk_scale or head_dim ** -0.5
68
+
69
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
70
+ self.attn_drop = nn.Dropout(attn_drop)
71
+ self.proj = nn.Linear(dim, dim)
72
+ self.proj_drop = nn.Dropout(proj_drop)
73
+
74
+ def forward(self, x):
75
+ B, L, C = x.shape
76
+
77
+ qkv = self.qkv(x)
78
+ if ATTENTION_MODE == 'flash':
79
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads).float()
80
+ q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
81
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
82
+ x = einops.rearrange(x, 'B H L D -> B L (H D)')
83
+ elif ATTENTION_MODE == 'xformers':
84
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B L H D', K=3, H=self.num_heads)
85
+ q, k, v = qkv[0], qkv[1], qkv[2] # B L H D
86
+ x = xformers.ops.memory_efficient_attention(q, k, v)
87
+ x = einops.rearrange(x, 'B L H D -> B L (H D)', H=self.num_heads)
88
+ elif ATTENTION_MODE == 'math':
89
+ with torch.amp.autocast(device_type='cuda', enabled=False):
90
+ qkv = einops.rearrange(qkv, 'B L (K H D) -> K B H L D', K=3, H=self.num_heads).float()
91
+ q, k, v = qkv[0], qkv[1], qkv[2] # B H L D
92
+ attn = (q @ k.transpose(-2, -1)) * self.scale
93
+ attn = attn.softmax(dim=-1)
94
+ attn = self.attn_drop(attn)
95
+ x = (attn @ v).transpose(1, 2).reshape(B, L, C)
96
+ else:
97
+ raise NotImplementedError
98
+
99
+ x = self.proj(x)
100
+ x = self.proj_drop(x)
101
+ return x
102
+
103
+
104
+ class Block(nn.Module):
105
+
106
+ def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
107
+ drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, skip=False, use_checkpoint=False):
108
+ super().__init__()
109
+ self.norm1 = norm_layer(dim) if skip else None
110
+ self.norm2 = norm_layer(dim)
111
+
112
+ self.attn = Attention(
113
+ dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
114
+ self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
115
+ self.norm3 = norm_layer(dim)
116
+ mlp_hidden_dim = int(dim * mlp_ratio)
117
+ self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
118
+ self.skip_linear = nn.Linear(2 * dim, dim) if skip else None
119
+ self.use_checkpoint = use_checkpoint
120
+
121
+ def forward(self, x, skip=None):
122
+ if self.use_checkpoint:
123
+ return torch.utils.checkpoint.checkpoint(self._forward, x, skip)
124
+ else:
125
+ return self._forward(x, skip)
126
+
127
+ def _forward(self, x, skip=None):
128
+ if self.skip_linear is not None:
129
+ x = self.skip_linear(torch.cat([x, skip], dim=-1))
130
+ x = self.norm1(x)
131
+ x = x + self.drop_path(self.attn(x))
132
+ x = self.norm2(x)
133
+
134
+ x = x + self.drop_path(self.mlp(x))
135
+ x = self.norm3(x)
136
+
137
+ return x
138
+
139
+
140
+ class PatchEmbed(nn.Module):
141
+ """ Image to Patch Embedding
142
+ """
143
+ def __init__(self, patch_size, in_chans=3, embed_dim=768):
144
+ super().__init__()
145
+ self.patch_size = patch_size
146
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
147
+
148
+ def forward(self, x):
149
+ B, C, H, W = x.shape
150
+ assert H % self.patch_size == 0 and W % self.patch_size == 0
151
+ x = self.proj(x).flatten(2).transpose(1, 2)
152
+ return x
153
+
154
+
155
+ class UViT(nn.Module):
156
+ def __init__(self, img_size, in_chans, patch_size, embed_dim=768, depth=12,
157
+ num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, pos_drop_rate=0., drop_rate=0., attn_drop_rate=0.,
158
+ norm_layer=nn.LayerNorm, mlp_time_embed=False, use_checkpoint=False,
159
+ text_dim=None, num_text_tokens=None, clip_img_dim=None):
160
+ super().__init__()
161
+ self.in_chans = in_chans
162
+ self.patch_size = patch_size
163
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
164
+
165
+ self.patch_embed = PatchEmbed(patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
166
+ self.img_size = (img_size, img_size) if isinstance(img_size, int) else img_size # the default img size
167
+ assert self.img_size[0] % patch_size == 0 and self.img_size[1] % patch_size == 0
168
+ self.num_patches = (self.img_size[0] // patch_size) * (self.img_size[1] // patch_size)
169
+
170
+ self.time_img_embed = nn.Sequential(
171
+ nn.Linear(embed_dim, 4 * embed_dim),
172
+ nn.SiLU(),
173
+ nn.Linear(4 * embed_dim, embed_dim),
174
+ ) if mlp_time_embed else nn.Identity()
175
+
176
+ self.time_text_embed = nn.Sequential(
177
+ nn.Linear(embed_dim, 4 * embed_dim),
178
+ nn.SiLU(),
179
+ nn.Linear(4 * embed_dim, embed_dim),
180
+ ) if mlp_time_embed else nn.Identity()
181
+
182
+ self.text_embed = nn.Linear(text_dim, embed_dim)
183
+ self.text_out = nn.Linear(embed_dim, text_dim)
184
+
185
+ self.clip_img_embed = nn.Linear(clip_img_dim, embed_dim)
186
+ self.clip_img_out = nn.Linear(embed_dim, clip_img_dim)
187
+
188
+ self.num_text_tokens = num_text_tokens
189
+ self.num_tokens = 1 + 1 + num_text_tokens + 1 + self.num_patches
190
+
191
+ self.pos_embed = nn.Parameter(torch.zeros(1, self.num_tokens, embed_dim))
192
+ self.pos_drop = nn.Dropout(p=pos_drop_rate)
193
+
194
+ self.in_blocks = nn.ModuleList([
195
+ Block(
196
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
197
+ drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, use_checkpoint=use_checkpoint)
198
+ for _ in range(depth // 2)])
199
+
200
+ self.mid_block = Block(
201
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
202
+ drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, use_checkpoint=use_checkpoint)
203
+
204
+ self.out_blocks = nn.ModuleList([
205
+ Block(
206
+ dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
207
+ drop=drop_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer, skip=True, use_checkpoint=use_checkpoint)
208
+ for _ in range(depth // 2)])
209
+
210
+ self.norm = norm_layer(embed_dim)
211
+ self.patch_dim = patch_size ** 2 * in_chans
212
+ self.decoder_pred = nn.Linear(embed_dim, self.patch_dim, bias=True)
213
+
214
+ trunc_normal_(self.pos_embed, std=.02)
215
+ self.apply(self._init_weights)
216
+
217
+ self.token_embedding = nn.Embedding(2, embed_dim)
218
+ self.pos_embed_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
219
+
220
+ def _init_weights(self, m):
221
+ if isinstance(m, nn.Linear):
222
+ trunc_normal_(m.weight, std=.02)
223
+ if isinstance(m, nn.Linear) and m.bias is not None:
224
+ nn.init.constant_(m.bias, 0)
225
+ elif isinstance(m, nn.LayerNorm):
226
+ nn.init.constant_(m.bias, 0)
227
+ nn.init.constant_(m.weight, 1.0)
228
+
229
+ @torch.jit.ignore
230
+ def no_weight_decay(self):
231
+ return {'pos_embed'}
232
+
233
+ def forward(self, img, clip_img, text, t_img, t_text, data_type):
234
+ _, _, H, W = img.shape
235
+
236
+ img = self.patch_embed(img)
237
+
238
+ t_img_token = self.time_img_embed(timestep_embedding(t_img, self.embed_dim))
239
+ t_img_token = t_img_token.unsqueeze(dim=1)
240
+ t_text_token = self.time_text_embed(timestep_embedding(t_text, self.embed_dim))
241
+ t_text_token = t_text_token.unsqueeze(dim=1)
242
+
243
+ text = self.text_embed(text)
244
+ clip_img = self.clip_img_embed(clip_img)
245
+
246
+ token_embed = self.token_embedding(data_type).unsqueeze(dim=1)
247
+
248
+ x = torch.cat((t_img_token, t_text_token, token_embed, text, clip_img, img), dim=1)
249
+
250
+ num_text_tokens, num_img_tokens = text.size(1), img.size(1)
251
+
252
+ pos_embed = torch.cat(
253
+ [self.pos_embed[:, :1 + 1, :], self.pos_embed_token, self.pos_embed[:, 1 + 1:, :]], dim=1)
254
+ if H == self.img_size[0] and W == self.img_size[1]:
255
+ pass
256
+ else: # interpolate the positional embedding when the input image is not of the default shape
257
+ pos_embed_others, pos_embed_patches = torch.split(pos_embed, [1 + 1 + 1 + num_text_tokens + 1, self.num_patches], dim=1)
258
+ pos_embed_patches = interpolate_pos_emb(pos_embed_patches, (self.img_size[0] // self.patch_size, self.img_size[1] // self.patch_size),
259
+ (H // self.patch_size, W // self.patch_size))
260
+ pos_embed = torch.cat((pos_embed_others, pos_embed_patches), dim=1)
261
+
262
+ x = x + pos_embed
263
+ x = self.pos_drop(x)
264
+
265
+ skips = []
266
+ for blk in self.in_blocks:
267
+ x = blk(x)
268
+ skips.append(x)
269
+
270
+ x = self.mid_block(x)
271
+
272
+ for blk in self.out_blocks:
273
+ x = blk(x, skips.pop())
274
+
275
+ x = self.norm(x)
276
+
277
+ t_img_token_out, t_text_token_out, token_embed_out, text_out, clip_img_out, img_out = x.split((1, 1, 1, num_text_tokens, 1, num_img_tokens), dim=1)
278
+
279
+ img_out = self.decoder_pred(img_out)
280
+ img_out = unpatchify(img_out, self.in_chans)
281
+
282
+ clip_img_out = self.clip_img_out(clip_img_out)
283
+
284
+ text_out = self.text_out(text_out)
285
+ return img_out, clip_img_out, text_out
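
The v1 variant differs from the previous file only by the extra data-type token: a two-entry nn.Embedding plus its own positional embedding are spliced in after the two timestep tokens, so the sequence becomes [t_img, t_text, data_type, text tokens, clip_img, image patches] and forward takes an additional per-sample integer data_type. A sketch of the call under the same assumed sizes as the previous example:

import torch
from libs.uvit_multi_post_ln_v1 import UViT

nnet = UViT(img_size=64, in_chans=4, patch_size=2, embed_dim=512, depth=4, num_heads=8,
            mlp_time_embed=True, text_dim=64, num_text_tokens=77, clip_img_dim=512)

img = torch.randn(2, 4, 64, 64)
clip_img = torch.randn(2, 1, 512)
text = torch.randn(2, 77, 64)
t_img = torch.randint(0, 1000, (2,))
t_text = torch.randint(0, 1000, (2,))
data_type = torch.zeros(2, dtype=torch.long)  # index into the two learned data-type embeddings

img_out, clip_img_out, text_out = nnet(img, clip_img, text, t_img, t_text, data_type)
# same output shapes as before: [2, 4, 64, 64], [2, 1, 512], [2, 77, 64]
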
unidiffuser/sample_v1.py ADDED
@@ -0,0 +1,437 @@
1
+ import ml_collections
2
+ import torch
3
+ import random
4
+ from absl import logging
5
+ import einops
6
+ from torchvision.utils import save_image, make_grid
7
+ import torchvision.transforms as standard_transforms
8
+ import numpy as np
9
+ import clip
10
+ from PIL import Image
11
+ import time
12
+ import os
13
+
14
+ from libs.autoencoder import get_model
15
+ from libs.clip import FrozenCLIPEmbedder
16
+ from dpm_solver_pp import NoiseScheduleVP, DPM_Solver
17
+ from utils import center_crop, set_logger, get_nnet
18
+
19
+
20
+ def stable_diffusion_beta_schedule(linear_start=0.00085, linear_end=0.0120, n_timestep=1000):
21
+ _betas = (
22
+ torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
23
+ )
24
+ return _betas.numpy()
25
+
26
+
27
+ def prepare_contexts(config, clip_text_model, clip_img_model, clip_img_model_preprocess, autoencoder):
28
+ resolution = config.z_shape[-1] * 8
29
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
30
+
31
+ contexts = torch.randn(config.n_samples, 77, config.clip_text_dim).to(device)
32
+ img_contexts = torch.randn(config.n_samples, 2 * config.z_shape[0], config.z_shape[1], config.z_shape[2])
33
+ clip_imgs = torch.randn(config.n_samples, 1, config.clip_img_dim)
34
+
35
+ if config.mode in ['t2i', 't2i2t']:
36
+ prompts = [config.prompt] * config.n_samples
37
+ contexts = clip_text_model.encode(prompts)
38
+
39
+ elif config.mode in ['i2t', 'i2t2i']:
40
+ from PIL import Image
41
+ img_contexts = []
42
+ clip_imgs = []
43
+
44
+ def get_img_feature(image):
45
+ image = np.array(image).astype(np.uint8)
46
+ image = center_crop(resolution, resolution, image)
47
+ clip_img_feature = clip_img_model.encode_image(clip_img_model_preprocess(Image.fromarray(image)).unsqueeze(0).to(device))
48
+
49
+ image = (image / 127.5 - 1.0).astype(np.float32)
50
+ image = einops.rearrange(image, 'h w c -> 1 c h w')
51
+ image = torch.tensor(image, device=device)
52
+ moments = autoencoder.encode_moments(image)
53
+
54
+ return clip_img_feature, moments
55
+
56
+ image = Image.open(config.img).convert('RGB')
57
+ clip_img, img_context = get_img_feature(image)
58
+
59
+ img_contexts.append(img_context)
60
+ clip_imgs.append(clip_img)
61
+ img_contexts = img_contexts * config.n_samples
62
+ clip_imgs = clip_imgs * config.n_samples
63
+
64
+ img_contexts = torch.concat(img_contexts, dim=0)
65
+ clip_imgs = torch.stack(clip_imgs, dim=0)
66
+
67
+ return contexts, img_contexts, clip_imgs
68
+
69
+
70
+ def unpreprocess(v): # to B C H W and [0, 1]
71
+ v = 0.5 * (v + 1.)
72
+ v.clamp_(0., 1.)
73
+ return v
74
+
75
+
76
+ def set_seed(seed: int):
77
+ random.seed(seed)
78
+ np.random.seed(seed)
79
+ torch.manual_seed(seed)
80
+ torch.cuda.manual_seed_all(seed)
81
+
82
+
83
+ def evaluate(config):
84
+ if config.get('benchmark', False):
85
+ torch.backends.cudnn.benchmark = True
86
+ torch.backends.cudnn.deterministic = False
87
+
88
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
89
+ set_seed(config.seed)
90
+
91
+ config = ml_collections.FrozenConfigDict(config)
92
+ set_logger(log_level='info')
93
+
94
+ _betas = stable_diffusion_beta_schedule()
95
+ N = len(_betas)
96
+
97
+ nnet = get_nnet(**config.nnet)
98
+ logging.info(f'load nnet from {config.nnet_path}')
99
+ nnet.load_state_dict(torch.load(config.nnet_path, map_location='cpu'))
100
+ nnet.to(device)
101
+ nnet.eval()
102
+
103
+ use_caption_decoder = config.text_dim < config.clip_text_dim or config.mode != 't2i'
104
+ if use_caption_decoder:
105
+ from libs.caption_decoder import CaptionDecoder
106
+ caption_decoder = CaptionDecoder(device=device, **config.caption_decoder)
107
+ else:
108
+ caption_decoder = None
109
+
110
+ clip_text_model = FrozenCLIPEmbedder(device=device)
111
+ clip_text_model.eval()
112
+ clip_text_model.to(device)
113
+
114
+ autoencoder = get_model(**config.autoencoder)
115
+ autoencoder.to(device)
116
+
117
+ clip_img_model, clip_img_model_preprocess = clip.load("ViT-B/32", device=device, jit=False)
118
+
119
+ empty_context = clip_text_model.encode([''])[0]
120
+
121
+ def split(x):
122
+ C, H, W = config.z_shape
123
+ z_dim = C * H * W
124
+ z, clip_img = x.split([z_dim, config.clip_img_dim], dim=1)
125
+ z = einops.rearrange(z, 'B (C H W) -> B C H W', C=C, H=H, W=W)
126
+ clip_img = einops.rearrange(clip_img, 'B (L D) -> B L D', L=1, D=config.clip_img_dim)
127
+ return z, clip_img
128
+
129
+
130
+ def combine(z, clip_img):
131
+ z = einops.rearrange(z, 'B C H W -> B (C H W)')
132
+ clip_img = einops.rearrange(clip_img, 'B L D -> B (L D)')
133
+ return torch.concat([z, clip_img], dim=-1)
134
+
135
+
136
+ def t2i_nnet(x, timesteps, text): # text is the low dimension version of the text clip embedding
137
+ """
138
+ 1. calculate the conditional model output
139
+ 2. calculate unconditional model output
140
+ config.sample.t2i_cfg_mode == 'empty_token': using the original cfg with the empty string
141
+ config.sample.t2i_cfg_mode == 'true_uncond': using the unconditional model learned by our method
142
+ 3. return linear combination of conditional output and unconditional output
143
+ """
144
+ z, clip_img = split(x)
145
+
146
+ t_text = torch.zeros(timesteps.size(0), dtype=torch.int, device=device)
147
+
148
+ z_out, clip_img_out, text_out = nnet(z, clip_img, text=text, t_img=timesteps, t_text=t_text,
149
+ data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type)
150
+ x_out = combine(z_out, clip_img_out)
151
+
152
+ if config.sample.scale == 0.:
153
+ return x_out
154
+
155
+ if config.sample.t2i_cfg_mode == 'empty_token':
156
+ _empty_context = einops.repeat(empty_context, 'L D -> B L D', B=x.size(0))
157
+ if use_caption_decoder:
158
+ _empty_context = caption_decoder.encode_prefix(_empty_context)
159
+ z_out_uncond, clip_img_out_uncond, text_out_uncond = nnet(z, clip_img, text=_empty_context, t_img=timesteps, t_text=t_text,
160
+ data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type)
161
+ x_out_uncond = combine(z_out_uncond, clip_img_out_uncond)
162
+ elif config.sample.t2i_cfg_mode == 'true_uncond':
163
+ text_N = torch.randn_like(text) # 3 other possible choices
164
+ z_out_uncond, clip_img_out_uncond, text_out_uncond = nnet(z, clip_img, text=text_N, t_img=timesteps, t_text=torch.ones_like(timesteps) * N,
165
+ data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type)
166
+ x_out_uncond = combine(z_out_uncond, clip_img_out_uncond)
167
+ else:
168
+ raise NotImplementedError
169
+
170
+ return x_out + config.sample.scale * (x_out - x_out_uncond)
171
+
172
+
173
+ def i_nnet(x, timesteps):
174
+ z, clip_img = split(x)
175
+ text = torch.randn(x.size(0), 77, config.text_dim, device=device)
176
+ t_text = torch.ones_like(timesteps) * N
177
+ z_out, clip_img_out, text_out = nnet(z, clip_img, text=text, t_img=timesteps, t_text=t_text,
178
+ data_type=torch.zeros_like(t_text, device=device, dtype=torch.int) + config.data_type)
179
+ x_out = combine(z_out, clip_img_out)
180
+ return x_out
181
+
182
+ def t_nnet(x, timesteps):
183
+ z = torch.randn(x.size(0), *config.z_shape, device=device)
184
+ clip_img = torch.randn(x.size(0), 1, config.clip_img_dim, device=device)
185
+ z_out, clip_img_out, text_out = nnet(z, clip_img, text=x, t_img=torch.ones_like(timesteps) * N, t_text=timesteps,
186
+ data_type=torch.zeros_like(timesteps, device=device, dtype=torch.int) + config.data_type)
187
+ return text_out
188
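+ # i_nnet and t_nnet implement unconditional image-only / text-only generation: the other modality
+ # is fed as pure noise at the maximum timestep N, which the joint model treats as "no condition".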
+
189
+ def i2t_nnet(x, timesteps, z, clip_img):
190
+ """
191
+ 1. calculate the conditional model output
192
+ 2. calculate unconditional model output
193
+ 3. return linear combination of conditional output and unconditional output
194
+ """
195
+ t_img = torch.zeros(timesteps.size(0), dtype=torch.int, device=device)
196
+
197
+ z_out, clip_img_out, text_out = nnet(z, clip_img, text=x, t_img=t_img, t_text=timesteps,
198
+ data_type=torch.zeros_like(t_img, device=device, dtype=torch.int) + config.data_type)
199
+
200
+ if config.sample.scale == 0.:
201
+ return text_out
202
+
203
+ z_N = torch.randn_like(z) # 3 other possible choices
204
+ clip_img_N = torch.randn_like(clip_img)
205
+ z_out_uncond, clip_img_out_uncond, text_out_uncond = nnet(z_N, clip_img_N, text=x, t_img=torch.ones_like(timesteps) * N, t_text=timesteps,
206
+ data_type=torch.zeros_like(timesteps, device=device, dtype=torch.int) + config.data_type)
207
+
208
+ return text_out + config.sample.scale * (text_out - text_out_uncond)
209
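+ # i2t_nnet mirrors t2i_nnet for the image-to-text direction: the conditioning image branch stays
+ # clean at t_img=0 while the text branch diffuses, and guidance extrapolates against a pass whose
+ # image inputs are replaced with noise at timestep N.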
+
210
+ def split_joint(x):
211
+ C, H, W = config.z_shape
212
+ z_dim = C * H * W
213
+ z, clip_img, text = x.split([z_dim, config.clip_img_dim, 77 * config.text_dim], dim=1)
214
+ z = einops.rearrange(z, 'B (C H W) -> B C H W', C=C, H=H, W=W)
215
+ clip_img = einops.rearrange(clip_img, 'B (L D) -> B L D', L=1, D=config.clip_img_dim)
216
+ text = einops.rearrange(text, 'B (L D) -> B L D', L=77, D=config.text_dim)
217
+ return z, clip_img, text
218
+
219
+ def combine_joint(z, clip_img, text):
220
+ z = einops.rearrange(z, 'B C H W -> B (C H W)')
221
+ clip_img = einops.rearrange(clip_img, 'B L D -> B (L D)')
222
+ text = einops.rearrange(text, 'B L D -> B (L D)')
223
+ return torch.concat([z, clip_img, text], dim=-1)
224
+
225
+ def joint_nnet(x, timesteps):
226
+ z, clip_img, text = split_joint(x)
227
+ z_out, clip_img_out, text_out = nnet(z, clip_img, text=text, t_img=timesteps, t_text=timesteps,
228
+ data_type=torch.zeros_like(timesteps, device=device, dtype=torch.int) + config.data_type)
229
+ x_out = combine_joint(z_out, clip_img_out, text_out)
230
+
231
+ if config.sample.scale == 0.:
232
+ return x_out
233
+
234
+ z_noise = torch.randn(x.size(0), *config.z_shape, device=device)
235
+ clip_img_noise = torch.randn(x.size(0), 1, config.clip_img_dim, device=device)
236
+ text_noise = torch.randn(x.size(0), 77, config.text_dim, device=device)
237
+
238
+ _, _, text_out_uncond = nnet(z_noise, clip_img_noise, text=text, t_img=torch.ones_like(timesteps) * N, t_text=timesteps,
239
+ data_type=torch.zeros_like(timesteps, device=device, dtype=torch.int) + config.data_type)
240
+ z_out_uncond, clip_img_out_uncond, _ = nnet(z, clip_img, text=text_noise, t_img=timesteps, t_text=torch.ones_like(timesteps) * N,
241
+ data_type=torch.zeros_like(timesteps, device=device, dtype=torch.int) + config.data_type)
242
+
243
+ x_out_uncond = combine_joint(z_out_uncond, clip_img_out_uncond, text_out_uncond)
244
+
245
+ return x_out + config.sample.scale * (x_out - x_out_uncond)
246
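+ # Joint-mode guidance runs two extra passes: one replaces the image inputs with noise at timestep N
+ # to get a text prediction that ignores the image, the other replaces the text with noise to get an
+ # image prediction that ignores the text; combined they form the x_out_uncond used above.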
+
247
+ @torch.cuda.amp.autocast()
248
+ def encode(_batch):
249
+ return autoencoder.encode(_batch)
250
+
251
+ @torch.cuda.amp.autocast()
252
+ def decode(_batch):
253
+ return autoencoder.decode(_batch)
254
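+ # encode()/decode() wrap the KL autoencoder under CUDA autocast (mixed precision) to reduce memory;
+ # decode() maps sampled latents z back to pixel space.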
+
255
+
256
+ logging.info(config.sample)
257
+ logging.info(f'N={N}')
258
+
259
+ contexts, img_contexts, clip_imgs = prepare_contexts(config, clip_text_model, clip_img_model, clip_img_model_preprocess, autoencoder)
260
+
261
+ # contexts: the clip embedding of the conditioned texts (from prepare_contexts)
262
+ contexts_low_dim = contexts if not use_caption_decoder else caption_decoder.encode_prefix(contexts) # the low dimensional version of the contexts, which is the input to the nnet
263
+
264
+ # img_contexts is the autoencoder moment of the conditioned images
265
+ z_img = autoencoder.sample(img_contexts)
266
+ # clip_imgs: the clip embedding of the conditioned images
267
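+ # When use_caption_decoder is enabled, the 768-dim CLIP text embeddings are projected down to
+ # config.text_dim = 64 per token via caption_decoder.encode_prefix before entering the nnet;
+ # caption_decoder.generate_captions later decodes generated 64-dim text features back into strings.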
+
268
+ if config.mode in ['t2i', 't2i2t']:
269
+ _n_samples = contexts_low_dim.size(0)
270
+ elif config.mode in ['i2t', 'i2t2i']:
271
+ _n_samples = img_contexts.size(0)
272
+ else:
273
+ _n_samples = config.n_samples
274
+
275
+
276
+ def sample_fn(mode, **kwargs):
277
+
278
+ _z_init = torch.randn(_n_samples, *config.z_shape, device=device)
279
+ _clip_img_init = torch.randn(_n_samples, 1, config.clip_img_dim, device=device)
280
+ _text_init = torch.randn(_n_samples, 77, config.text_dim, device=device)
281
+ if mode == 'joint':
282
+ _x_init = combine_joint(_z_init, _clip_img_init, _text_init)
283
+ elif mode in ['t2i', 'i']:
284
+ _x_init = combine(_z_init, _clip_img_init)
285
+ elif mode in ['i2t', 't']:
286
+ _x_init = _text_init
287
+ noise_schedule = NoiseScheduleVP(schedule='discrete', betas=torch.tensor(_betas, device=device).float())
288
+
289
+ def model_fn(x, t_continuous):
290
+ t = t_continuous * N
291
+ if mode == 'joint':
292
+ return joint_nnet(x, t)
293
+ elif mode == 't2i':
294
+ return t2i_nnet(x, t, **kwargs)
295
+ elif mode == 'i2t':
296
+ return i2t_nnet(x, t, **kwargs)
297
+ elif mode == 'i':
298
+ return i_nnet(x, t)
299
+ elif mode == 't':
300
+ return t_nnet(x, t)
301
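+ # DPM-Solver works in continuous time on (0, 1]; model_fn rescales t_continuous by N to the
+ # discrete step index expected by the nnet. The call below integrates from T=1 down to eps=1/N
+ # in config.sample.sample_steps solver steps.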
+
302
+ dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True, thresholding=False)
303
+ with torch.no_grad():
304
+ with torch.autocast(device_type=device):
305
+ start_time = time.time()
306
+ x = dpm_solver.sample(_x_init, steps=config.sample.sample_steps, eps=1. / N, T=1.)
307
+ end_time = time.time()
308
+ print(f'\ngenerate {_n_samples} samples with {config.sample.sample_steps} steps takes {end_time - start_time:.2f}s')
309
+
310
+ # os.makedirs(config.output_path, exist_ok=True)
311
+ if mode == 'joint':
312
+ _z, _clip_img, _text = split_joint(x)
313
+ return _z, _clip_img, _text
314
+ elif mode in ['t2i', 'i']:
315
+ _z, _clip_img = split(x)
316
+ return _z, _clip_img
317
+ elif mode in ['i2t', 't']:
318
+ return x
319
+
320
+ output_images = None
321
+ output_text = None
322
+
323
+ if config.mode in ['joint']:
324
+ _z, _clip_img, _text = sample_fn(config.mode)
325
+ samples = unpreprocess(decode(_z))
326
+ prompts = caption_decoder.generate_captions(_text)
327
+ # Just get the first output image for now
328
+ output_images = samples
329
+ output_text = prompts
330
+
331
+ elif config.mode in ['t2i', 'i', 'i2t2i']:
332
+ if config.mode == 't2i':
333
+ _z, _clip_img = sample_fn(config.mode, text=contexts_low_dim) # conditioned on the text embedding
334
+ elif config.mode == 'i':
335
+ _z, _clip_img = sample_fn(config.mode)
336
+ elif config.mode == 'i2t2i':
337
+ _text = sample_fn('i2t', z=z_img, clip_img=clip_imgs) # conditioned on the image embedding
338
+ _z, _clip_img = sample_fn('t2i', text=_text)
339
+ samples = unpreprocess(decode(_z))
340
+ output_images = samples
341
+
342
+
343
+ elif config.mode in ['i2t', 't', 't2i2t']:
344
+ if config.mode == 'i2t':
345
+ _text = sample_fn(config.mode, z=z_img, clip_img=clip_imgs) # conditioned on the image embedding
346
+ elif config.mode == 't':
347
+ _text = sample_fn(config.mode)
348
+ elif config.mode == 't2i2t':
349
+ _z, _clip_img = sample_fn('t2i', text=contexts_low_dim)
350
+ _text = sample_fn('i2t', z=_z, clip_img=_clip_img)
351
+ samples = caption_decoder.generate_captions(_text)
352
+ output_text = samples
353
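+ # The round-trip modes chain two samplers: 'i2t2i' first captions the input image (i2t) and then
+ # regenerates an image from that caption (t2i); 't2i2t' first generates an image from the prompt
+ # and then captions it. Only the final modality is returned.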
+
354
+ print(f'\nGPU memory usage: {torch.cuda.max_memory_reserved() / 1024 ** 3:.2f} GB')
355
+ print(f'\nresults are saved in {os.path.join(config.output_path, config.mode)} :)')
356
+
357
+ # Convert sample images to PIL
358
+ if output_images is not None:
359
+ # convert each sample tensor to a PIL image and assign the result back so it is actually returned
360
+ output_images = [standard_transforms.ToPILImage()(sample) for sample in output_images]
361
+
362
+ return output_images, output_text
363
+
364
+
365
+
366
+ def d(**kwargs):
367
+ """Helper for creating a config dict."""
368
+ return ml_collections.ConfigDict(initial_dictionary=kwargs)
369
+
370
+
371
+ def get_config():
372
+ config = ml_collections.ConfigDict()
373
+
374
+ config.seed = 1234
375
+ config.pred = 'noise_pred'
376
+ config.z_shape = (4, 64, 64)
377
+ config.clip_img_dim = 512
378
+ config.clip_text_dim = 768
379
+ config.text_dim = 64 # reduce dimension
380
+ config.data_type = 1
381
+
382
+ config.autoencoder = d(
383
+ pretrained_path='models/autoencoder_kl.pth',
384
+ )
385
+
386
+ config.caption_decoder = d(
387
+ pretrained_path="models/caption_decoder.pth",
388
+ hidden_dim=config.get_ref('text_dim')
389
+ )
390
+
391
+ config.nnet = d(
392
+ name='uvit_multi_post_ln_v1',
393
+ img_size=64,
394
+ in_chans=4,
395
+ patch_size=2,
396
+ embed_dim=1536,
397
+ depth=30,
398
+ num_heads=24,
399
+ mlp_ratio=4,
400
+ qkv_bias=False,
401
+ pos_drop_rate=0.,
402
+ drop_rate=0.,
403
+ attn_drop_rate=0.,
404
+ mlp_time_embed=False,
405
+ text_dim=config.get_ref('text_dim'),
406
+ num_text_tokens=77,
407
+ clip_img_dim=config.get_ref('clip_img_dim'),
408
+ use_checkpoint=True
409
+ )
410
+
411
+ config.sample = d(
412
+ sample_steps=50,
413
+ scale=7.,
414
+ t2i_cfg_mode='true_uncond'
415
+ )
416
+
417
+ return config
418
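+ # Note: text_dim = 64 is the reduced per-token text width consumed by the U-ViT (see encode_prefix
+ # above), while clip_text_dim = 768 is the raw CLIP text embedding width. z_shape = (4, 64, 64) is
+ # the KL autoencoder latent; assuming the usual 8x-downsampling autoencoder, this corresponds to
+ # 512x512 RGB images.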
+
419
+
420
+ def sample(mode, prompt, image, sample_steps=50, scale=7.0, seed=None):
421
+ config = get_config()
422
+
423
+ config.nnet_path = "models/uvit_v1.pth"
424
+ config.n_samples = 1
425
+ config.nrow = 1
426
+
427
+ config.mode = mode
428
+ config.prompt = prompt
429
+ config.img = image
430
+
431
+ config.sample.sample_steps = sample_steps
432
+ config.sample.scale = scale
433
+ if seed is not None:
434
+ config.seed = seed
435
+
436
+ return evaluate(config) # hand (output_images, output_text) back to the caller
437
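+ # Hypothetical usage sketch (assumes the checkpoints referenced in get_config() exist under models/
+ # and that evaluate() / prepare_contexts() are defined earlier in this module). The prompt and the
+ # image path below are illustrative placeholders only:
+ #
+ #     images, texts = sample('t2i', prompt='an astronaut riding a horse', image=None)
+ #     _, captions = sample('i2t', prompt=None, image='path/to/input.jpg')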
+
utils.py ADDED
@@ -0,0 +1,80 @@
1
+ from absl import logging
2
+ import numpy as np
3
+ from PIL import Image, ImageDraw, ImageFont
4
+
5
+
6
+ def center_crop(width, height, img):
7
+ resample = Image.LANCZOS # resampling filter; Image.BOX would be the other supported choice here
8
+ crop = np.min(img.shape[:2])
9
+ img = img[(img.shape[0] - crop) // 2: (img.shape[0] + crop) // 2,
10
+ (img.shape[1] - crop) // 2: (img.shape[1] + crop) // 2] # center crop
11
+ try:
12
+ img = Image.fromarray(img, 'RGB')
13
+ except Exception: # fall back when the array cannot be read as 3-channel RGB (e.g. grayscale input)
14
+ img = Image.fromarray(img)
15
+ img = img.resize((width, height), resample) # resize the center crop from [crop, crop] to [width, height]
16
+
17
+ return np.array(img).astype(np.uint8)
18
+
19
+
20
+ def set_logger(log_level='info', fname=None):
21
+ import logging as _logging
22
+ handler = logging.get_absl_handler()
23
+ formatter = _logging.Formatter('%(asctime)s - %(filename)s - %(message)s')
24
+ handler.setFormatter(formatter)
25
+ logging.set_verbosity(log_level)
26
+ if fname is not None:
27
+ handler = _logging.FileHandler(fname)
28
+ handler.setFormatter(formatter)
29
+ logging.get_absl_logger().addHandler(handler)
30
+
31
+
32
+ def get_nnet(name, **kwargs):
33
+ if name == 'uvit_multi_post_ln':
34
+ from libs.uvit_multi_post_ln import UViT
35
+ return UViT(**kwargs)
36
+ elif name == 'uvit_multi_post_ln_v1':
37
+ from libs.uvit_multi_post_ln_v1 import UViT
38
+ return UViT(**kwargs)
39
+ else:
40
+ raise NotImplementedError(name)
41
+
42
+
43
+ def drawRoundRec(draw, color, x, y, w, h, r):
44
+ drawObject = draw
45
+
46
+ # draw the four corner circles (diameter r)
47
+ drawObject.ellipse((x, y, x + r, y + r), fill=color)
48
+ drawObject.ellipse((x + w - r, y, x + w, y + r), fill=color)
49
+ drawObject.ellipse((x, y + h - r, x + r, y + h), fill=color)
50
+ drawObject.ellipse((x + w - r, y + h - r, x + w, y + h), fill=color)
51
+
52
+ # fill the body with two overlapping rectangles
53
+ drawObject.rectangle((x + r / 2, y, x + w - (r / 2), y + h), fill=color)
54
+ drawObject.rectangle((x, y + r / 2, x + w, y + h - (r / 2)), fill=color)
55
+
56
+
57
+ def add_water(img, text='UniDiffuser', pos=3):
58
+ width, height = img.size
59
+ scale = 4
60
+ scale_size = 0.5
61
+ img = img.resize((width * scale, height * scale), Image.LANCZOS)
62
+ result = Image.new(img.mode, (width * scale, height * scale), color=(255, 255, 255))
63
+ result.paste(img, box=(0, 0))
64
+
65
+ delta_w = int(width * scale * 0.27 * scale_size) # text width
66
+ delta_h = width * scale * 0.05 * scale_size # text height
67
+ postions = np.array([[0, 0], [0, height * scale - delta_h], [width * scale - delta_w, 0],
68
+ [width * scale - delta_w, height * scale - delta_h]])
69
+ postion = postions[pos]
70
+ # draw the watermark text
71
+ draw = ImageDraw.Draw(result)
72
+ fillColor = (107, 92, 231)
73
+ setFont = ImageFont.truetype("assets/ArialBoldMT.ttf", int(width * scale * 0.05 * scale_size))
74
+ delta = 20 * scale_size
75
+ padding = 15 * scale_size
76
+ drawRoundRec(draw, (223, 230, 233), postion[0] - delta - padding, postion[1] - delta - padding,
77
+ w=delta_w + 2 * padding, h=delta_h + 2 * padding, r=50 * scale_size)
78
+ draw.text((postion[0] - delta, postion[1] - delta), text, font=setFont, fill=fillColor)
79
+
80
+ return result.resize((width, height), Image.LANCZOS)
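+ # add_water() stamps a small rounded label (default text 'UniDiffuser') into one corner of the
+ # image: pos indexes the corners as 0=top-left, 1=bottom-left, 2=top-right, 3=bottom-right;
+ # drawing happens at 4x resolution (scale=4) and the result is resized back down for smoother text.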