bubbliiiing committed on
Commit
43ed08d
1 Parent(s): 08038f7

add requirements

easyanimate/vae/ldm/models/__init__.py ADDED
File without changes
easyanimate/vae/ldm/models/autoencoder.py ADDED
@@ -0,0 +1,337 @@
1
+ import time
2
+ from contextlib import contextmanager
3
+
4
+ import pytorch_lightning as pl
5
+ import torch
6
+ import torch.nn.functional as F
7
+
8
+ from ..modules.diffusionmodules.model import Decoder, Encoder
9
+ from ..modules.distributions.distributions import DiagonalGaussianDistribution
10
+ from ..util import instantiate_from_config
11
+ from .enc_dec_pytorch import Decoder as Mag_Decoder
12
+ from .enc_dec_pytorch import Encoder as Mag_Encoder
13
+
14
+
15
+ class AutoencoderKLMagvit(pl.LightningModule):
16
+ def __init__(self,
17
+ ddconfig,
18
+ lossconfig,
19
+ embed_dim,
20
+ ckpt_path=None,
21
+ ignore_keys=[],
22
+ image_key="image",
23
+ colorize_nlabels=None,
24
+ monitor=None,
25
+ ):
26
+ super().__init__()
27
+ self.image_key = image_key
28
+ self.encoder = Mag_Encoder()
29
+ self.decoder = Mag_Decoder()
30
+ self.loss = instantiate_from_config(lossconfig)
31
+ self.quant_conv = torch.nn.Conv3d(16, 16, 1)
32
+ self.post_quant_conv = torch.nn.Conv3d(8, 8, 1)
33
+ self.embed_dim = embed_dim
34
+ if colorize_nlabels is not None:
35
+ assert type(colorize_nlabels)==int
36
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
37
+ if monitor is not None:
38
+ self.monitor = monitor
39
+ if ckpt_path is not None:
40
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
41
+
42
+ def init_from_ckpt(self, path, ignore_keys=list()):
43
+ sd = torch.load(path, map_location="cpu")["state_dict"]
44
+ keys = list(sd.keys())
45
+ for k in keys:
46
+ for ik in ignore_keys:
47
+ if k.startswith(ik):
48
+ print("Deleting key {} from state_dict.".format(k))
49
+ del sd[k]
50
+ self.load_state_dict(sd, strict=False)
51
+ print(f"Restored from {path}")
52
+
53
+ def encode(self, x):
54
+ h = self.encoder(x)
55
+ moments = self.quant_conv(h)
56
+ posterior = DiagonalGaussianDistribution(moments)
57
+ return posterior
58
+
59
+ def decode(self, z):
60
+ z = self.post_quant_conv(z)
61
+ dec = self.decoder(z)
62
+ return dec
63
+
64
+ def forward(self, input, sample_posterior=True):
65
+ if input.ndim==4:
66
+ input = input.unsqueeze(2)
67
+ posterior = self.encode(input)
68
+ if sample_posterior:
69
+ z = posterior.sample()
70
+ else:
71
+ z = posterior.mode()
72
+ dec = self.decode(z)
73
+ return dec, posterior
74
+
75
+ def get_input(self, batch, k):
76
+ x = batch[k]
77
+ if x.ndim==5:
78
+ x = x.permute(0, 4, 1, 2, 3).to(memory_format=torch.contiguous_format).float()
79
+ return x
80
+ if len(x.shape) == 3:
81
+ x = x[..., None]
82
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
83
+ return x
84
+
85
+ def training_step(self, batch, batch_idx, optimizer_idx):
86
+ # tic = time.time()
87
+ inputs = self.get_input(batch, self.image_key)
88
+ # print(f"get_input time {time.time() - tic}")
89
+ # tic = time.time()
90
+ reconstructions, posterior = self(inputs)
91
+ # print(f"model forward time {time.time() - tic}")
92
+
93
+ if optimizer_idx == 0:
94
+ # train encoder+decoder+logvar
95
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
96
+ last_layer=self.get_last_layer(), split="train")
97
+ self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
98
+ self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
99
+ # print(f"cal loss time {time.time() - tic}")
100
+ return aeloss
101
+
102
+ if optimizer_idx == 1:
103
+ # train the discriminator
104
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
105
+ last_layer=self.get_last_layer(), split="train")
106
+
107
+ self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
108
+ self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
109
+ # print(f"cal loss time {time.time() - tic}")
110
+ return discloss
111
+
112
+ def validation_step(self, batch, batch_idx):
113
+ with torch.no_grad():
114
+ inputs = self.get_input(batch, self.image_key)
115
+ reconstructions, posterior = self(inputs)
116
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
117
+ last_layer=self.get_last_layer(), split="val")
118
+
119
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
120
+ last_layer=self.get_last_layer(), split="val")
121
+
122
+ self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
123
+ self.log_dict(log_dict_ae)
124
+ self.log_dict(log_dict_disc)
125
+ return self.log_dict
126
+
127
+ def configure_optimizers(self):
128
+ lr = self.learning_rate
129
+ opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
130
+ list(self.decoder.parameters())+
131
+ list(self.quant_conv.parameters())+
132
+ list(self.post_quant_conv.parameters()),
133
+ lr=lr, betas=(0.5, 0.9))
134
+ opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
135
+ lr=lr, betas=(0.5, 0.9))
136
+ return [opt_ae, opt_disc], []
137
+
138
+ def get_last_layer(self):
139
+ return self.decoder.conv_out.weight
140
+
141
+ @torch.no_grad()
142
+ def log_images(self, batch, only_inputs=False, **kwargs):
143
+ log = dict()
144
+ x = self.get_input(batch, self.image_key)
145
+ x = x.to(self.device)
146
+ if not only_inputs:
147
+ xrec, posterior = self(x)
148
+ if x.shape[1] > 3:
149
+ # colorize with random projection
150
+ assert xrec.shape[1] > 3
151
+ x = self.to_rgb(x)
152
+ xrec = self.to_rgb(xrec)
153
+ log["samples"] = self.decode(torch.randn_like(posterior.sample()))
154
+ log["reconstructions"] = xrec
155
+ log["inputs"] = x
156
+ return log
157
+
158
+ def to_rgb(self, x):
159
+ assert self.image_key == "segmentation"
160
+ if not hasattr(self, "colorize"):
161
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
162
+ x = F.conv2d(x, weight=self.colorize)
163
+ x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
164
+ return x
165
+
166
+ class AutoencoderKL(pl.LightningModule):
167
+ def __init__(self,
168
+ ddconfig,
169
+ lossconfig,
170
+ embed_dim,
171
+ ckpt_path=None,
172
+ ignore_keys=[],
173
+ image_key="image",
174
+ colorize_nlabels=None,
175
+ monitor=None,
176
+ ):
177
+ super().__init__()
178
+ self.image_key = image_key
179
+ self.encoder = Encoder(**ddconfig)
180
+ self.decoder = Decoder(**ddconfig)
181
+ self.loss = instantiate_from_config(lossconfig)
182
+ assert ddconfig["double_z"]
183
+ self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
184
+ self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
185
+ self.embed_dim = embed_dim
186
+ if colorize_nlabels is not None:
187
+ assert type(colorize_nlabels)==int
188
+ self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
189
+ if monitor is not None:
190
+ self.monitor = monitor
191
+ if ckpt_path is not None:
192
+ self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
193
+
194
+ def init_from_ckpt(self, path, ignore_keys=list()):
195
+ sd = torch.load(path, map_location="cpu")["state_dict"]
196
+ keys = list(sd.keys())
197
+ for k in keys:
198
+ for ik in ignore_keys:
199
+ if k.startswith(ik):
200
+ print("Deleting key {} from state_dict.".format(k))
201
+ del sd[k]
202
+ self.load_state_dict(sd, strict=False)
203
+ print(f"Restored from {path}")
204
+
205
+ def encode(self, x):
206
+ h = self.encoder(x)
207
+ moments = self.quant_conv(h)
208
+ posterior = DiagonalGaussianDistribution(moments)
209
+ return posterior
210
+
211
+ def decode(self, z):
212
+ z = self.post_quant_conv(z)
213
+ dec = self.decoder(z)
214
+ return dec
215
+
216
+ def forward(self, input, sample_posterior=True):
217
+ posterior = self.encode(input)
218
+ if sample_posterior:
219
+ z = posterior.sample()
220
+ else:
221
+ z = posterior.mode()
222
+ dec = self.decode(z)
223
+ return dec, posterior
224
+
225
+ def get_input(self, batch, k):
226
+ x = batch[k]
227
+ if len(x.shape) == 3:
228
+ x = x[..., None]
229
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
230
+ return x
231
+
232
+ def training_step(self, batch, batch_idx, optimizer_idx):
233
+ # tic = time.time()
234
+ inputs = self.get_input(batch, self.image_key)
235
+ # print(f"get_input time {time.time() - tic}")
236
+ # tic = time.time()
237
+ reconstructions, posterior = self(inputs)
238
+ # print(f"model forward time {time.time() - tic}")
239
+ # tic = time.time()
240
+
241
+ if optimizer_idx == 0:
242
+ # train encoder+decoder+logvar
243
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
244
+ last_layer=self.get_last_layer(), split="train")
245
+ self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
246
+ self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
247
+ # print(f"cal loss time {time.time() - tic}")
248
+ return aeloss
249
+
250
+ if optimizer_idx == 1:
251
+ # train the discriminator
252
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
253
+ last_layer=self.get_last_layer(), split="train")
254
+
255
+ self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
256
+ self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
257
+ # print(f"cal loss time {time.time() - tic}")
258
+ return discloss
259
+
260
+ def validation_step(self, batch, batch_idx):
261
+ tic = time.time()
262
+ inputs = self.get_input(batch, self.image_key)
263
+ print(f"get_input time {time.time() - tic}")
264
+ tic = time.time()
265
+ reconstructions, posterior = self(inputs)
266
+ print(f"val forward time {time.time() - tic}")
267
+ tic = time.time()
268
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
269
+ last_layer=self.get_last_layer(), split="val")
270
+
271
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
272
+ last_layer=self.get_last_layer(), split="val")
273
+
274
+ self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
275
+ self.log_dict(log_dict_ae)
276
+ self.log_dict(log_dict_disc)
277
+ print(f"val end time {time.time() - tic}")
278
+ return self.log_dict
279
+
280
+ def configure_optimizers(self):
281
+ lr = self.learning_rate
282
+ opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
283
+ list(self.decoder.parameters())+
284
+ list(self.quant_conv.parameters())+
285
+ list(self.post_quant_conv.parameters()),
286
+ lr=lr, betas=(0.5, 0.9))
287
+ opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
288
+ lr=lr, betas=(0.5, 0.9))
289
+ return [opt_ae, opt_disc], []
290
+
291
+ def get_last_layer(self):
292
+ return self.decoder.conv_out.weight
293
+
294
+ @torch.no_grad()
295
+ def log_images(self, batch, only_inputs=False, **kwargs):
296
+ log = dict()
297
+ x = self.get_input(batch, self.image_key)
298
+ x = x.to(self.device)
299
+ if not only_inputs:
300
+ xrec, posterior = self(x)
301
+ if x.shape[1] > 3:
302
+ # colorize with random projection
303
+ assert xrec.shape[1] > 3
304
+ x = self.to_rgb(x)
305
+ xrec = self.to_rgb(xrec)
306
+ log["samples"] = self.decode(torch.randn_like(posterior.sample()))
307
+ log["reconstructions"] = xrec
308
+ log["inputs"] = x
309
+ return log
310
+
311
+ def to_rgb(self, x):
312
+ assert self.image_key == "segmentation"
313
+ if not hasattr(self, "colorize"):
314
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
315
+ x = F.conv2d(x, weight=self.colorize)
316
+ x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
317
+ return x
318
+
319
+
320
+ class IdentityFirstStage(torch.nn.Module):
321
+ def __init__(self, *args, vq_interface=False, **kwargs):
322
+ self.vq_interface = vq_interface # TODO: Should be true by default but check to not break older stuff
323
+ super().__init__()
324
+
325
+ def encode(self, x, *args, **kwargs):
326
+ return x
327
+
328
+ def decode(self, x, *args, **kwargs):
329
+ return x
330
+
331
+ def quantize(self, x, *args, **kwargs):
332
+ if self.vq_interface:
333
+ return x, None, [None, None, None]
334
+ return x
335
+
336
+ def forward(self, x, *args, **kwargs):
337
+ return x
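
For orientation (editorial sketch, not part of the commit): the `forward()` methods above hand the `quant_conv` output to `DiagonalGaussianDistribution`, which treats half of the channels as the mean and half as the log-variance of the latent posterior. A minimal, self-contained sketch of that sample-vs-mode logic, assuming the standard ldm convention of chunking the moments along the channel axis:

import torch

def sample_posterior(moments, sample_posterior=True):
    # moments: quant_conv output, (B, 2*z_channels, T, H, W); shapes are illustrative only
    mean, logvar = torch.chunk(moments, 2, dim=1)
    logvar = torch.clamp(logvar, -30.0, 20.0)
    if not sample_posterior:
        return mean                                    # posterior.mode()
    std = torch.exp(0.5 * logvar)
    return mean + std * torch.randn_like(mean)         # reparameterized posterior.sample()

moments = torch.randn(1, 16, 3, 8, 8)                  # 16 = 2 * 8 latent channels (AutoencoderKLMagvit)
z = sample_posterior(moments)                          # (1, 8, 3, 8, 8), fed to post_quant_conv / decoder
print(z.shape)
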
easyanimate/vae/ldm/models/enc_dec_pytorch.py ADDED
@@ -0,0 +1,234 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from torch import nn
4
+
5
+
6
+ def cast_tuple(t, length = 1):
7
+ return t if isinstance(t, tuple) else ((t,) * length)
8
+
9
+ def divisible_by(num, den):
10
+ return (num % den) == 0
11
+
12
+ def is_odd(n):
13
+ return not divisible_by(n, 2)
14
+
15
+ class CausalConv3d(nn.Module):
16
+ def __init__(
17
+ self,
18
+ chan_in,
19
+ chan_out,
20
+ kernel_size,
21
+ pad_mode = 'constant',
22
+ **kwargs
23
+ ):
24
+ super().__init__()
25
+ kernel_size = cast_tuple(kernel_size, 3)
26
+
27
+ time_kernel_size, height_kernel_size, width_kernel_size = kernel_size
28
+
29
+ assert is_odd(height_kernel_size) and is_odd(width_kernel_size)
30
+
31
+ dilation = kwargs.pop('dilation', 1)
32
+ stride = kwargs.pop('stride', 1)
33
+
34
+ self.pad_mode = pad_mode
35
+ time_pad = dilation * (time_kernel_size - 1) + (1 - stride)
36
+ height_pad = height_kernel_size // 2
37
+ width_pad = width_kernel_size // 2
38
+
39
+ self.time_pad = time_pad
40
+ self.time_causal_padding = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)
41
+
42
+ stride = (stride, 1, 1)
43
+ dilation = (dilation, 1, 1)
44
+ self.conv = nn.Conv3d(chan_in, chan_out, kernel_size, stride = stride, dilation = dilation, **kwargs)
45
+
46
+ def forward(self, x):
47
+ x = F.pad(x, self.time_causal_padding, mode = 'replicate')
48
+ return self.conv(x)
49
+
50
+ class Swish(nn.Module):
51
+ def __init__(self) -> None:
52
+ super().__init__()
53
+
54
+ def forward(self, x):
55
+ return x * torch.sigmoid(x)
56
+
57
+ class ResBlockX(nn.Module):
58
+ def __init__(self, inchannel) -> None:
59
+ super().__init__()
60
+ self.conv = nn.Sequential(
61
+ nn.GroupNorm(32, inchannel),
62
+ Swish(),
63
+ CausalConv3d(inchannel, inchannel, 3),
64
+ nn.GroupNorm(32, inchannel),
65
+ Swish(),
66
+ CausalConv3d(inchannel, inchannel, 3)
67
+ )
68
+
69
+ def forward(self, x):
70
+ return x + self.conv(x)
71
+
72
+ class ResBlockXY(nn.Module):
73
+ def __init__(self, inchannel, outchannel) -> None:
74
+ super().__init__()
75
+ self.conv = nn.Sequential(
76
+ nn.GroupNorm(32, inchannel),
77
+ Swish(),
78
+ CausalConv3d(inchannel, outchannel, 3),
79
+ nn.GroupNorm(32, outchannel),
80
+ Swish(),
81
+ CausalConv3d(outchannel, outchannel, 3)
82
+ )
83
+ self.conv_1 = nn.Conv3d(inchannel, outchannel, 1)
84
+
85
+ def forward(self, x):
86
+ return self.conv_1(x) + self.conv(x)
87
+
88
+ class PoolDown222(nn.Module):
89
+ def __init__(self) -> None:
90
+ super().__init__()
91
+ self.pool = nn.AvgPool3d(2, 2)
92
+
93
+ def forward(self, x):
94
+ x = F.pad(x, (0, 0, 0, 0, 1, 0), 'replicate')
95
+ return self.pool(x)
96
+
97
+ class PoolDown122(nn.Module):
98
+ def __init__(self) -> None:
99
+ super().__init__()
100
+ self.pool = nn.AvgPool3d((1, 2, 2), (1, 2, 2))
101
+
102
+ def forward(self, x):
103
+ return self.pool(x)
104
+
105
+ class Unpool222(nn.Module):
106
+ def __init__(self) -> None:
107
+ super().__init__()
108
+ self.up = nn.Upsample(scale_factor=2, mode='nearest')
109
+
110
+ def forward(self, x):
111
+ x = self.up(x)
112
+ return x[:, :, 1:]
113
+
114
+ class Unpool122(nn.Module):
115
+ def __init__(self) -> None:
116
+ super().__init__()
117
+ self.up = nn.Upsample(scale_factor=(1, 2, 2), mode='nearest')
118
+
119
+ def forward(self, x):
120
+ x = self.up(x)
121
+ return x
122
+
123
+ class ResBlockDown(nn.Module):
124
+ def __init__(self, inchannel, outchannel) -> None:
125
+ super().__init__()
126
+ self.block = nn.Sequential(
127
+ CausalConv3d(inchannel, outchannel, 3),
128
+ nn.LeakyReLU(inplace=True),
129
+ PoolDown222(),
130
+ CausalConv3d(outchannel, outchannel, 3),
131
+ nn.LeakyReLU(inplace=True)
132
+ )
133
+ self.res = nn.Sequential(
134
+ PoolDown222(),
135
+ nn.Conv3d(inchannel, outchannel, 1)
136
+ )
137
+
138
+ def forward(self, x):
139
+ return self.res(x) + self.block(x)
140
+
141
+
142
+ class Discriminator(nn.Module):
143
+ def __init__(self) -> None:
144
+ super().__init__()
145
+ self.block = nn.Sequential(
146
+ CausalConv3d(3, 64, 3),
147
+ nn.LeakyReLU(inplace=True),
148
+ ResBlockDown(64, 128),
149
+ ResBlockDown(128, 256),
150
+ ResBlockDown(256, 256),
151
+ ResBlockDown(256, 256),
152
+ ResBlockDown(256, 256),
153
+ CausalConv3d(256, 256, 3),
154
+ nn.LeakyReLU(inplace=True),
155
+ nn.AdaptiveAvgPool3d(1),
156
+ nn.Flatten(),
157
+ nn.Linear(256, 256),
158
+ nn.LeakyReLU(inplace=True),
159
+ nn.Linear(256, 1)
160
+ )
161
+
162
+ def forward(self, x):
163
+ if x.ndim==4:
164
+ x = x.unsqueeze(2)
165
+ return self.block(x)
166
+
167
+
168
+
169
+ class Encoder(nn.Module):
170
+ def __init__(self) -> None:
171
+ super().__init__()
172
+ self.encoder = nn.Sequential(
173
+ CausalConv3d(3, 64, 3),
174
+ ResBlockX(64),
175
+ ResBlockX(64),
176
+ PoolDown222(),
177
+ ResBlockXY(64, 128),
178
+ ResBlockX(128),
179
+ PoolDown222(),
180
+ ResBlockX(128),
181
+ ResBlockX(128),
182
+ PoolDown122(),
183
+ ResBlockXY(128, 256),
184
+ ResBlockX(256),
185
+ ResBlockX(256),
186
+ ResBlockX(256),
187
+ nn.GroupNorm(32, 256),
188
+ Swish(),
189
+ nn.Conv3d(256, 16, 1)
190
+ )
191
+
192
+ def forward(self, x):
193
+ return self.encoder(x)
194
+
195
+ class Decoder(nn.Module):
196
+ def __init__(self) -> None:
197
+ super().__init__()
198
+ self.decoder = nn.Sequential(
199
+ CausalConv3d(8, 256, 3),
200
+ ResBlockX(256),
201
+ ResBlockX(256),
202
+ ResBlockX(256),
203
+ ResBlockX(256),
204
+ Unpool122(),
205
+ CausalConv3d(256, 256, 3),
206
+ ResBlockXY(256, 128),
207
+ ResBlockX(128),
208
+ Unpool222(),
209
+ CausalConv3d(128, 128, 3),
210
+ ResBlockX(128),
211
+ ResBlockX(128),
212
+ Unpool222(),
213
+ CausalConv3d(128, 128, 3),
214
+ ResBlockXY(128, 64),
215
+ ResBlockX(64),
216
+ nn.GroupNorm(32, 64),
217
+ Swish(),
218
+ CausalConv3d(64, 64, 3)
219
+ )
220
+ self.conv_out = nn.Conv3d(64, 3, 1)
221
+
222
+ def forward(self, x):
223
+ return self.conv_out(self.decoder(x))
224
+
225
+
226
+ if __name__=='__main__':
227
+ encoder = Encoder()
228
+ decoder = Decoder()
229
+ dis = Discriminator()
230
+ x = torch.randn((1, 3, 1, 64, 64))
231
+ embedding = encoder(x)
232
+ y = decoder(embedding[:, :8]) # encoder emits 16 channels (mean/logvar); the decoder expects 8
233
+ tmp = torch.randn((1, 4, 1, 64, 64))
234
+ print(embedding.shape, y.shape)
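
As a side note (editorial sketch, not part of the commit): `CausalConv3d` above pads the temporal axis only on the left, so every output frame depends only on the current and earlier input frames, while height and width keep the usual symmetric padding. The arithmetic for a (3, 3, 3) kernel with stride and dilation 1:

import torch
import torch.nn.functional as F

# F.pad orders its tuple as (w_left, w_right, h_top, h_bottom, t_front, t_back).
time_pad, height_pad, width_pad = 2, 1, 1                  # dilation*(kt-1)+(1-stride), kh//2, kw//2
pad = (width_pad, width_pad, height_pad, height_pad, time_pad, 0)

x = torch.randn(1, 3, 5, 16, 16)                           # (B, C, T, H, W)
x_padded = F.pad(x, pad, mode='replicate')                 # -> (1, 3, 7, 18, 18)
y = torch.nn.Conv3d(3, 8, kernel_size=3)(x_padded)         # -> (1, 8, 5, 16, 16): T, H, W preserved
print(y.shape)
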
easyanimate/vae/ldm/models/omnigen_casual3dcnn.py ADDED
@@ -0,0 +1,321 @@
1
+ import itertools
2
+ from dataclasses import dataclass
3
+ from typing import Optional
4
+
5
+ import numpy as np
6
+ import pytorch_lightning as pl
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+
11
+ from ..util import instantiate_from_config
12
+ from .omnigen_enc_dec import Decoder as omnigen_Mag_Decoder
13
+ from .omnigen_enc_dec import Encoder as omnigen_Mag_Encoder
14
+
15
+
16
+ class DiagonalGaussianDistribution:
17
+ def __init__(
18
+ self,
19
+ mean: torch.Tensor,
20
+ logvar: torch.Tensor,
21
+ deterministic: bool = False,
22
+ ):
23
+ self.mean = mean
24
+ self.logvar = torch.clamp(logvar, -30.0, 20.0)
25
+ self.deterministic = deterministic
26
+
27
+ if deterministic:
28
+ self.var = self.std = torch.zeros_like(self.mean)
29
+ else:
30
+ self.std = torch.exp(0.5 * self.logvar)
31
+ self.var = torch.exp(self.logvar)
32
+
33
+ def sample(self, generator = None) -> torch.FloatTensor:
34
+ x = torch.randn(
35
+ self.mean.shape,
36
+ generator=generator,
37
+ device=self.mean.device,
38
+ dtype=self.mean.dtype,
39
+ )
40
+ return self.mean + self.std * x
41
+
42
+ def mode(self):
43
+ return self.mean
44
+
45
+ def kl(self, other: Optional["DiagonalGaussianDistribution"] = None) -> torch.Tensor:
46
+ dims = list(range(1, self.mean.ndim))
47
+
48
+ if self.deterministic:
49
+ return torch.Tensor([0.0])
50
+ else:
51
+ if other is None:
52
+ return 0.5 * torch.sum(
53
+ torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
54
+ dim=dims,
55
+ )
56
+ else:
57
+ return 0.5 * torch.sum(
58
+ torch.pow(self.mean - other.mean, 2) / other.var
59
+ + self.var / other.var
60
+ - 1.0
61
+ - self.logvar
62
+ + other.logvar,
63
+ dim=dims,
64
+ )
65
+
66
+ def nll(self, sample: torch.Tensor) -> torch.Tensor:
67
+ dims = list(range(1, self.mean.ndim))
68
+
69
+ if self.deterministic:
70
+ return torch.Tensor([0.0])
71
+
72
+ logtwopi = np.log(2.0 * np.pi)
73
+ return 0.5 * torch.sum(
74
+ logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
75
+ dim=dims,
76
+ )
77
+
78
+ @dataclass
79
+ class EncoderOutput:
80
+ latent_dist: DiagonalGaussianDistribution
81
+
82
+ @dataclass
83
+ class DecoderOutput:
84
+ sample: torch.Tensor
85
+
86
+ def str_eval(item):
87
+ if type(item) == str:
88
+ return eval(item)
89
+ else:
90
+ return item
91
+
92
+ class AutoencoderKLMagvit_fromOmnigen(pl.LightningModule):
93
+ def __init__(
94
+ self,
95
+ in_channels: int = 3,
96
+ out_channels: int = 3,
97
+ ch = 128,
98
+ ch_mult = [ 1,2,4,4 ],
99
+ use_gc_blocks = None,
100
+ down_block_types: tuple = None,
101
+ up_block_types: tuple = None,
102
+ mid_block_type: str = "MidBlock3D",
103
+ mid_block_use_attention: bool = True,
104
+ mid_block_attention_type: str = "3d",
105
+ mid_block_num_attention_heads: int = 1,
106
+ layers_per_block: int = 2,
107
+ act_fn: str = "silu",
108
+ num_attention_heads: int = 1,
109
+ latent_channels: int = 4,
110
+ norm_num_groups: int = 32,
111
+ image_key="image",
112
+ monitor=None,
113
+ ckpt_path=None,
114
+ lossconfig=None,
115
+ slice_compression_vae=False,
116
+ mini_batch_encoder=9,
117
+ mini_batch_decoder=3,
118
+ train_decoder_only=False,
119
+ ):
120
+ super().__init__()
121
+ self.image_key = image_key
122
+ down_block_types = str_eval(down_block_types)
123
+ up_block_types = str_eval(up_block_types)
124
+ self.encoder = omnigen_Mag_Encoder(
125
+ in_channels=in_channels,
126
+ out_channels=latent_channels,
127
+ down_block_types=down_block_types,
128
+ ch = ch,
129
+ ch_mult = ch_mult,
130
+ use_gc_blocks=use_gc_blocks,
131
+ mid_block_type=mid_block_type,
132
+ mid_block_use_attention=mid_block_use_attention,
133
+ mid_block_attention_type=mid_block_attention_type,
134
+ mid_block_num_attention_heads=mid_block_num_attention_heads,
135
+ layers_per_block=layers_per_block,
136
+ norm_num_groups=norm_num_groups,
137
+ act_fn=act_fn,
138
+ num_attention_heads=num_attention_heads,
139
+ double_z=True,
140
+ slice_compression_vae=slice_compression_vae,
141
+ mini_batch_encoder=mini_batch_encoder,
142
+ )
143
+
144
+ self.decoder = omnigen_Mag_Decoder(
145
+ in_channels=latent_channels,
146
+ out_channels=out_channels,
147
+ up_block_types=up_block_types,
148
+ ch = ch,
149
+ ch_mult = ch_mult,
150
+ use_gc_blocks=use_gc_blocks,
151
+ mid_block_type=mid_block_type,
152
+ mid_block_use_attention=mid_block_use_attention,
153
+ mid_block_attention_type=mid_block_attention_type,
154
+ mid_block_num_attention_heads=mid_block_num_attention_heads,
155
+ layers_per_block=layers_per_block,
156
+ norm_num_groups=norm_num_groups,
157
+ act_fn=act_fn,
158
+ num_attention_heads=num_attention_heads,
159
+ slice_compression_vae=slice_compression_vae,
160
+ mini_batch_decoder=mini_batch_decoder,
161
+ )
162
+
163
+ self.quant_conv = nn.Conv3d(2 * latent_channels, 2 * latent_channels, kernel_size=1)
164
+ self.post_quant_conv = nn.Conv3d(latent_channels, latent_channels, kernel_size=1)
165
+
166
+ self.mini_batch_encoder = mini_batch_encoder
167
+ self.mini_batch_decoder = mini_batch_decoder
168
+ self.train_decoder_only = train_decoder_only
169
+ if train_decoder_only:
170
+ self.encoder.requires_grad_(False)
171
+ self.quant_conv.requires_grad_(False)
172
+ if monitor is not None:
173
+ self.monitor = monitor
174
+ if ckpt_path is not None:
175
+ self.init_from_ckpt(ckpt_path, ignore_keys=["loss"])
176
+ if lossconfig is not None:
177
+ self.loss = instantiate_from_config(lossconfig)
178
+
179
+ def init_from_ckpt(self, path, ignore_keys=list()):
180
+ if path.endswith("safetensors"):
181
+ from safetensors.torch import load_file, safe_open
182
+ sd = load_file(path)
183
+ else:
184
+ sd = torch.load(path, map_location="cpu")
185
+ if "state_dict" in list(sd.keys()):
186
+ sd = sd["state_dict"]
187
+ keys = list(sd.keys())
188
+ for k in keys:
189
+ for ik in ignore_keys:
190
+ if k.startswith(ik):
191
+ print("Deleting key {} from state_dict.".format(k))
192
+ del sd[k]
193
+ self.load_state_dict(sd, strict=False) # loss.item can be ignored successfully
194
+ print(f"Restored from {path}")
195
+
196
+ def encode(self, x: torch.Tensor) -> EncoderOutput:
197
+ h = self.encoder(x)
198
+
199
+ moments: torch.Tensor = self.quant_conv(h)
200
+ mean, logvar = moments.chunk(2, dim=1)
201
+ posterior = DiagonalGaussianDistribution(mean, logvar)
202
+
203
+ # return EncoderOutput(latent_dist=posterior)
204
+ return posterior
205
+
206
+ def decode(self, z: torch.Tensor) -> DecoderOutput:
207
+ z = self.post_quant_conv(z)
208
+
209
+ decoded = self.decoder(z)
210
+
211
+ # return DecoderOutput(sample=decoded)
212
+ return decoded
213
+
214
+
215
+ def forward(self, input, sample_posterior=True):
216
+ if input.ndim==4:
217
+ input = input.unsqueeze(2)
218
+ posterior = self.encode(input)
219
+ if sample_posterior:
220
+ z = posterior.sample()
221
+ else:
222
+ z = posterior.mode()
223
+ # print("stt latent shape", z.shape)
224
+ dec = self.decode(z)
225
+ return dec, posterior
226
+
227
+ def get_input(self, batch, k):
228
+ x = batch[k]
229
+ if x.ndim==5:
230
+ x = x.permute(0, 4, 1, 2, 3).to(memory_format=torch.contiguous_format).float()
231
+ return x
232
+ if len(x.shape) == 3:
233
+ x = x[..., None]
234
+ x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
235
+ return x
236
+
237
+ def training_step(self, batch, batch_idx, optimizer_idx):
238
+ # tic = time.time()
239
+ inputs = self.get_input(batch, self.image_key)
240
+ # print(f"get_input time {time.time() - tic}")
241
+ # tic = time.time()
242
+ reconstructions, posterior = self(inputs)
243
+ # print(f"model forward time {time.time() - tic}")
244
+
245
+ if optimizer_idx == 0:
246
+ # train encoder+decoder+logvar
247
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
248
+ last_layer=self.get_last_layer(), split="train")
249
+ self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
250
+ self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
251
+ # print(f"cal loss time {time.time() - tic}")
252
+ return aeloss
253
+
254
+ if optimizer_idx == 1:
255
+ # train the discriminator
256
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
257
+ last_layer=self.get_last_layer(), split="train")
258
+
259
+ self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
260
+ self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
261
+ # print(f"cal loss time {time.time() - tic}")
262
+ return discloss
263
+
264
+ def validation_step(self, batch, batch_idx):
265
+ with torch.no_grad():
266
+ inputs = self.get_input(batch, self.image_key)
267
+ reconstructions, posterior = self(inputs)
268
+ aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
269
+ last_layer=self.get_last_layer(), split="val")
270
+
271
+ discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
272
+ last_layer=self.get_last_layer(), split="val")
273
+
274
+ self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
275
+ self.log_dict(log_dict_ae)
276
+ self.log_dict(log_dict_disc)
277
+ return self.log_dict
278
+
279
+ def configure_optimizers(self):
280
+ lr = self.learning_rate
281
+ if self.train_decoder_only:
282
+ opt_ae = torch.optim.Adam(list(self.decoder.parameters())+
283
+ list(self.post_quant_conv.parameters()),
284
+ lr=lr, betas=(0.5, 0.9))
285
+ else:
286
+ opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
287
+ list(self.decoder.parameters())+
288
+ list(self.quant_conv.parameters())+
289
+ list(self.post_quant_conv.parameters()),
290
+ lr=lr, betas=(0.5, 0.9))
291
+ opt_disc = torch.optim.Adam(list(self.loss.discriminator3d.parameters()) + list(self.loss.discriminator.parameters()),
292
+ lr=lr, betas=(0.5, 0.9))
293
+ return [opt_ae, opt_disc], []
294
+
295
+ def get_last_layer(self):
296
+ return self.decoder.conv_out.weight
297
+
298
+ @torch.no_grad()
299
+ def log_images(self, batch, only_inputs=False, **kwargs):
300
+ log = dict()
301
+ x = self.get_input(batch, self.image_key)
302
+ x = x.to(self.device)
303
+ if not only_inputs:
304
+ xrec, posterior = self(x)
305
+ if x.shape[1] > 3:
306
+ # colorize with random projection
307
+ assert xrec.shape[1] > 3
308
+ x = self.to_rgb(x)
309
+ xrec = self.to_rgb(xrec)
310
+ log["samples"] = self.decode(torch.randn_like(posterior.sample()))
311
+ log["reconstructions"] = xrec
312
+ log["inputs"] = x
313
+ return log
314
+
315
+ def to_rgb(self, x):
316
+ assert self.image_key == "segmentation"
317
+ if not hasattr(self, "colorize"):
318
+ self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
319
+ x = F.conv2d(x, weight=self.colorize)
320
+ x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
321
+ return x
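
For reference (editorial note, not part of the commit): `DiagonalGaussianDistribution.kl()` above is the closed-form KL divergence between a diagonal Gaussian and the standard normal, summed over every non-batch dimension. A quick numerical sketch with illustrative shapes:

import torch

# KL( N(mu, sigma^2) || N(0, 1) ) = 0.5 * sum( mu^2 + sigma^2 - 1 - log sigma^2 )
mean = torch.randn(2, 4, 1, 8, 8)
logvar = torch.randn(2, 4, 1, 8, 8).clamp(-30.0, 20.0)
var = logvar.exp()

kl = 0.5 * torch.sum(mean.pow(2) + var - 1.0 - logvar, dim=list(range(1, mean.ndim)))
print(kl.shape)   # one value per batch element: torch.Size([2])
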
easyanimate/vae/ldm/models/omnigen_enc_dec.py ADDED
@@ -0,0 +1,396 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from ..modules.vaemodules.activations import get_activation
5
+ from ..modules.vaemodules.common import CausalConv3d
6
+ from ..modules.vaemodules.down_blocks import get_down_block
7
+ from ..modules.vaemodules.mid_blocks import get_mid_block
8
+ from ..modules.vaemodules.up_blocks import get_up_block
9
+
10
+
11
+ class Encoder(nn.Module):
12
+ r"""
13
+ The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation.
14
+
15
+ Args:
16
+ in_channels (`int`, *optional*, defaults to 3):
17
+ The number of input channels.
18
+ out_channels (`int`, *optional*, defaults to 8):
19
+ The number of output channels.
20
+ down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("SpatialDownBlock3D",)`):
21
+ The types of down blocks to use.
22
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
23
+ The number of output channels for each block.
24
+ use_gc_blocks (`Tuple[bool, ...]`, *optional*, defaults to `None`):
25
+ Whether to use global context blocks for each down block.
26
+ mid_block_type (`str`, *optional*, defaults to `"MidBlock3D"`):
27
+ The type of mid block to use.
28
+ layers_per_block (`int`, *optional*, defaults to 2):
29
+ The number of layers per block.
30
+ norm_num_groups (`int`, *optional*, defaults to 32):
31
+ The number of groups for normalization.
32
+ act_fn (`str`, *optional*, defaults to `"silu"`):
33
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
34
+ num_attention_heads (`int`, *optional*, defaults to 1):
35
+ The number of attention heads to use.
36
+ double_z (`bool`, *optional*, defaults to `True`):
37
+ Whether to double the number of output channels for the last block.
38
+ """
39
+
40
+ def __init__(
41
+ self,
42
+ in_channels: int = 3,
43
+ out_channels: int = 8,
44
+ down_block_types = ("SpatialDownBlock3D",),
45
+ ch = 128,
46
+ ch_mult = [1,2,4,4,],
47
+ use_gc_blocks = None,
48
+ mid_block_type: str = "MidBlock3D",
49
+ mid_block_use_attention: bool = True,
50
+ mid_block_attention_type: str = "3d",
51
+ mid_block_num_attention_heads: int = 1,
52
+ layers_per_block: int = 2,
53
+ norm_num_groups: int = 32,
54
+ act_fn: str = "silu",
55
+ num_attention_heads: int = 1,
56
+ double_z: bool = True,
57
+ slice_compression_vae: bool = False,
58
+ mini_batch_encoder: int = 9,
59
+ verbose = False,
60
+ ):
61
+ super().__init__()
62
+ block_out_channels = [ch * i for i in ch_mult]
63
+ assert len(down_block_types) == len(block_out_channels), (
64
+ "Number of down block types must match number of block output channels."
65
+ )
66
+ if use_gc_blocks is not None:
67
+ assert len(use_gc_blocks) == len(down_block_types), (
68
+ "Number of GC blocks must match number of down block types."
69
+ )
70
+ else:
71
+ use_gc_blocks = [False] * len(down_block_types)
72
+ self.conv_in = CausalConv3d(
73
+ in_channels,
74
+ block_out_channels[0],
75
+ kernel_size=3,
76
+ )
77
+
78
+ self.down_blocks = nn.ModuleList([])
79
+
80
+ output_channels = block_out_channels[0]
81
+ for i, down_block_type in enumerate(down_block_types):
82
+ input_channels = output_channels
83
+ output_channels = block_out_channels[i]
84
+ is_final_block = (i == len(block_out_channels) - 1)
85
+ down_block = get_down_block(
86
+ down_block_type,
87
+ in_channels=input_channels,
88
+ out_channels=output_channels,
89
+ num_layers=layers_per_block,
90
+ act_fn=act_fn,
91
+ norm_num_groups=norm_num_groups,
92
+ norm_eps=1e-6,
93
+ num_attention_heads=num_attention_heads,
94
+ add_gc_block=use_gc_blocks[i],
95
+ add_downsample=not is_final_block,
96
+ )
97
+ self.down_blocks.append(down_block)
98
+
99
+ self.mid_block = get_mid_block(
100
+ mid_block_type,
101
+ in_channels=block_out_channels[-1],
102
+ num_layers=layers_per_block,
103
+ act_fn=act_fn,
104
+ norm_num_groups=norm_num_groups,
105
+ norm_eps=1e-6,
106
+ add_attention=mid_block_use_attention,
107
+ attention_type=mid_block_attention_type,
108
+ num_attention_heads=mid_block_num_attention_heads,
109
+ )
110
+
111
+ self.conv_norm_out = nn.GroupNorm(
112
+ num_channels=block_out_channels[-1],
113
+ num_groups=norm_num_groups,
114
+ eps=1e-6,
115
+ )
116
+ self.conv_act = get_activation(act_fn)
117
+
118
+ conv_out_channels = 2 * out_channels if double_z else out_channels
119
+ self.conv_out = CausalConv3d(block_out_channels[-1], conv_out_channels, kernel_size=3)
120
+
121
+ self.slice_compression_vae = slice_compression_vae
122
+ self.mini_batch_encoder = mini_batch_encoder
123
+ self.features_share = False
124
+ self.verbose = verbose
125
+
126
+ def set_padding_one_frame(self):
127
+ def _set_padding_one_frame(name, module):
128
+ if hasattr(module, 'padding_flag'):
129
+ if self.verbose:
130
+ print('Set pad mode for module[%s] type=%s' % (name, str(type(module))))
131
+ module.padding_flag = 1
132
+ for sub_name, sub_mod in module.named_children():
133
+ _set_padding_one_frame(sub_name, sub_mod)
134
+ for name, module in self.named_children():
135
+ _set_padding_one_frame(name, module)
136
+
137
+ def set_padding_more_frame(self):
138
+ def _set_padding_more_frame(name, module):
139
+ if hasattr(module, 'padding_flag'):
140
+ if self.verbose:
141
+ print('Set pad mode for module[%s] type=%s' % (name, str(type(module))))
142
+ module.padding_flag = 2
143
+ for sub_name, sub_mod in module.named_children():
144
+ _set_padding_more_frame(sub_name, sub_mod)
145
+ for name, module in self.named_children():
146
+ _set_padding_more_frame(name, module)
147
+
148
+ def single_forward(self, x: torch.Tensor, previous_features: torch.Tensor, after_features: torch.Tensor) -> torch.Tensor:
149
+ # x: (B, C, T, H, W)
150
+ if self.features_share and previous_features is not None and after_features is None:
151
+ x = torch.concat([previous_features, x], 2)
152
+ elif self.features_share and previous_features is None and after_features is not None:
153
+ x = torch.concat([x, after_features], 2)
154
+ elif self.features_share and previous_features is not None and after_features is not None:
155
+ x = torch.concat([previous_features, x, after_features], 2)
156
+
157
+ x = self.conv_in(x)
158
+
159
+ for down_block in self.down_blocks:
160
+ x = down_block(x)
161
+
162
+ x = self.mid_block(x)
163
+
164
+ x = self.conv_norm_out(x)
165
+ x = self.conv_act(x)
166
+ x = self.conv_out(x)
167
+
168
+ if self.features_share and previous_features is not None and after_features is None:
169
+ x = x[:, :, 1:]
170
+ elif self.features_share and previous_features is None and after_features is not None:
171
+ x = x[:, :, :2]
172
+ elif self.features_share and previous_features is not None and after_features is not None:
173
+ x = x[:, :, 1:3]
174
+ return x
175
+
176
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
177
+ if self.slice_compression_vae:
178
+ _, _, f, _, _ = x.size()
179
+ if f % 2 != 0:
180
+ self.set_padding_one_frame()
181
+ first_frames = self.single_forward(x[:, :, 0:1, :, :], None, None)
182
+ self.set_padding_more_frame()
183
+
184
+ new_pixel_values = [first_frames]
185
+ start_index = 1
186
+ else:
187
+ self.set_padding_more_frame()
188
+ new_pixel_values = []
189
+ start_index = 0
190
+
191
+ previous_features = None
192
+ for i in range(start_index, x.shape[2], self.mini_batch_encoder):
193
+ after_features = x[:, :, i + self.mini_batch_encoder: i + self.mini_batch_encoder + 4, :, :] if i + self.mini_batch_encoder < x.shape[2] else None
194
+ next_frames = self.single_forward(x[:, :, i: i + self.mini_batch_encoder, :, :], previous_features, after_features)
195
+ previous_features = x[:, :, i + self.mini_batch_encoder - 4: i + self.mini_batch_encoder, :, :]
196
+ new_pixel_values.append(next_frames)
197
+ new_pixel_values = torch.cat(new_pixel_values, dim=2)
198
+ else:
199
+ new_pixel_values = self.single_forward(x, None, None)
200
+ return new_pixel_values
201
+
202
+ class Decoder(nn.Module):
203
+ r"""
204
+ The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample.
205
+
206
+ Args:
207
+ in_channels (`int`, *optional*, defaults to 8):
208
+ The number of input channels.
209
+ out_channels (`int`, *optional*, defaults to 3):
210
+ The number of output channels.
211
+ up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("SpatialUpBlock3D",)`):
212
+ The types of up blocks to use.
213
+ block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`):
214
+ The number of output channels for each block.
215
+ use_gc_blocks (`Tuple[bool, ...]`, *optional*, defaults to `None`):
216
+ Whether to use global context blocks for each down block.
217
+ mid_block_type (`str`, *optional*, defaults to `"MidBlock3D"`):
218
+ The type of mid block to use.
219
+ layers_per_block (`int`, *optional*, defaults to 2):
220
+ The number of layers per block.
221
+ norm_num_groups (`int`, *optional*, defaults to 32):
222
+ The number of groups for normalization.
223
+ act_fn (`str`, *optional*, defaults to `"silu"`):
224
+ The activation function to use. See `~diffusers.models.activations.get_activation` for available options.
225
+ num_attention_heads (`int`, *optional*, defaults to 1):
226
+ The number of attention heads to use.
227
+ """
228
+
229
+ def __init__(
230
+ self,
231
+ in_channels: int = 8,
232
+ out_channels: int = 3,
233
+ up_block_types = ("SpatialUpBlock3D",),
234
+ ch = 128,
235
+ ch_mult = [1,2,4,4,],
236
+ use_gc_blocks = None,
237
+ mid_block_type: str = "MidBlock3D",
238
+ mid_block_use_attention: bool = True,
239
+ mid_block_attention_type: str = "3d",
240
+ mid_block_num_attention_heads: int = 1,
241
+ layers_per_block: int = 2,
242
+ norm_num_groups: int = 32,
243
+ act_fn: str = "silu",
244
+ num_attention_heads: int = 1,
245
+ slice_compression_vae: bool = False,
246
+ mini_batch_decoder: int = 3,
247
+ verbose = False,
248
+ ):
249
+ super().__init__()
250
+ block_out_channels = [ch * i for i in ch_mult]
251
+ assert len(up_block_types) == len(block_out_channels), (
252
+ "Number of up block types must match number of block output channels."
253
+ )
254
+ if use_gc_blocks is not None:
255
+ assert len(use_gc_blocks) == len(up_block_types), (
256
+ "Number of GC blocks must match number of up block types."
257
+ )
258
+ else:
259
+ use_gc_blocks = [False] * len(up_block_types)
260
+
261
+ self.conv_in = CausalConv3d(
262
+ in_channels,
263
+ block_out_channels[-1],
264
+ kernel_size=3,
265
+ )
266
+
267
+ self.mid_block = get_mid_block(
268
+ mid_block_type,
269
+ in_channels=block_out_channels[-1],
270
+ num_layers=layers_per_block,
271
+ act_fn=act_fn,
272
+ norm_num_groups=norm_num_groups,
273
+ norm_eps=1e-6,
274
+ add_attention=mid_block_use_attention,
275
+ attention_type=mid_block_attention_type,
276
+ num_attention_heads=mid_block_num_attention_heads,
277
+ )
278
+
279
+ self.up_blocks = nn.ModuleList([])
280
+
281
+ reversed_block_out_channels = list(reversed(block_out_channels))
282
+ output_channels = reversed_block_out_channels[0]
283
+ for i, up_block_type in enumerate(up_block_types):
284
+ input_channels = output_channels
285
+ output_channels = reversed_block_out_channels[i]
286
+ # is_first_block = i == 0
287
+ is_final_block = i == len(block_out_channels) - 1
288
+
289
+ up_block = get_up_block(
290
+ up_block_type,
291
+ in_channels=input_channels,
292
+ out_channels=output_channels,
293
+ num_layers=layers_per_block + 1,
294
+ act_fn=act_fn,
295
+ norm_num_groups=norm_num_groups,
296
+ norm_eps=1e-6,
297
+ num_attention_heads=num_attention_heads,
298
+ add_gc_block=use_gc_blocks[i],
299
+ add_upsample=not is_final_block,
300
+ )
301
+ self.up_blocks.append(up_block)
302
+
303
+ self.conv_norm_out = nn.GroupNorm(
304
+ num_channels=block_out_channels[0],
305
+ num_groups=norm_num_groups,
306
+ eps=1e-6,
307
+ )
308
+ self.conv_act = get_activation(act_fn)
309
+
310
+ self.conv_out = CausalConv3d(block_out_channels[0], out_channels, kernel_size=3)
311
+
312
+ self.slice_compression_vae = slice_compression_vae
313
+ self.mini_batch_decoder = mini_batch_decoder
314
+ self.features_share = True
315
+ self.verbose = verbose
316
+
317
+ def set_padding_one_frame(self):
318
+ def _set_padding_one_frame(name, module):
319
+ if hasattr(module, 'padding_flag'):
320
+ if self.verbose:
321
+ print('Set pad mode for module[%s] type=%s' % (name, str(type(module))))
322
+ module.padding_flag = 1
323
+ for sub_name, sub_mod in module.named_children():
324
+ _set_padding_one_frame(sub_name, sub_mod)
325
+ for name, module in self.named_children():
326
+ _set_padding_one_frame(name, module)
327
+
328
+ def set_padding_more_frame(self):
329
+ def _set_padding_more_frame(name, module):
330
+ if hasattr(module, 'padding_flag'):
331
+ if self.verbose:
332
+ print('Set pad mode for module[%s] type=%s' % (name, str(type(module))))
333
+ module.padding_flag = 2
334
+ for sub_name, sub_mod in module.named_children():
335
+ _set_padding_more_frame(sub_name, sub_mod)
336
+ for name, module in self.named_children():
337
+ _set_padding_more_frame(name, module)
338
+
339
+ def single_forward(self, x: torch.Tensor, previous_features: torch.Tensor, after_features: torch.Tensor) -> torch.Tensor:
340
+ # x: (B, C, T, H, W)
341
+ if self.features_share and previous_features is not None and after_features is None:
342
+ b, c, t, h, w = x.size()
343
+ x = torch.concat([previous_features, x], 2)
344
+ x = self.conv_in(x)
345
+ x = self.mid_block(x)
346
+ x = x[:, :, -t:]
347
+ elif self.features_share and previous_features is None and after_features is not None:
348
+ b, c, t, h, w = x.size()
349
+ x = torch.concat([x, after_features], 2)
350
+ x = self.conv_in(x)
351
+ x = self.mid_block(x)
352
+ x = x[:, :, :t]
353
+ elif self.features_share and previous_features is not None and after_features is not None:
354
+ _, _, t_1, _, _ = previous_features.size()
355
+ _, _, t_2, _, _ = x.size()
356
+ x = torch.concat([previous_features, x, after_features], 2)
357
+ x = self.conv_in(x)
358
+ x = self.mid_block(x)
359
+ x = x[:, :, t_1:(t_1 + t_2)]
360
+ else:
361
+ x = self.conv_in(x)
362
+ x = self.mid_block(x)
363
+
364
+ for up_block in self.up_blocks:
365
+ x = up_block(x)
366
+
367
+ x = self.conv_norm_out(x)
368
+ x = self.conv_act(x)
369
+ x = self.conv_out(x)
370
+
371
+ return x
372
+
373
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
374
+ if self.slice_compression_vae:
375
+ _, _, f, _, _ = x.size()
376
+ if f % 2 != 0:
377
+ self.set_padding_one_frame()
378
+ first_frames = self.single_forward(x[:, :, 0:1, :, :], None, None)
379
+ self.set_padding_more_frame()
380
+ new_pixel_values = [first_frames]
381
+ start_index = 1
382
+ else:
383
+ self.set_padding_more_frame()
384
+ new_pixel_values = []
385
+ start_index = 0
386
+
387
+ previous_features = None
388
+ for i in range(start_index, x.shape[2], self.mini_batch_decoder):
389
+ after_features = x[:, :, i + self.mini_batch_decoder: i + 2 * self.mini_batch_decoder, :, :] if i + self.mini_batch_decoder < x.shape[2] else None
390
+ next_frames = self.single_forward(x[:, :, i: i + self.mini_batch_decoder, :, :], previous_features, after_features)
391
+ previous_features = x[:, :, i: i + self.mini_batch_decoder, :, :]
392
+ new_pixel_values.append(next_frames)
393
+ new_pixel_values = torch.cat(new_pixel_values, dim=2)
394
+ else:
395
+ new_pixel_values = self.single_forward(x, None, None)
396
+ return new_pixel_values
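
Editorial sketch (not part of the commit): when `slice_compression_vae` is enabled, both `forward()` methods above stream long clips through the network in fixed-size temporal chunks, handling an odd frame count by pushing frame 0 through on its own first. The chunking pattern, with illustrative sizes:

import torch

def chunk_time(x, mini_batch):
    # Mirrors the slicing in Encoder.forward / Decoder.forward above (context frames omitted).
    chunks, start = [], 0
    if x.shape[2] % 2 != 0:              # odd frame count: the first frame goes through alone
        chunks.append(x[:, :, 0:1])
        start = 1
    for i in range(start, x.shape[2], mini_batch):
        chunks.append(x[:, :, i:i + mini_batch])
    return chunks

x = torch.randn(1, 3, 17, 32, 32)        # 17 frames
parts = chunk_time(x, mini_batch=8)
print([p.shape[2] for p in parts])        # [1, 8, 8]
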
easyanimate/video_caption/datasets/put preprocess datasets here.txt ADDED
File without changes