fffiloni committed
Commit b85e577
1 Parent(s): d89e409

Update models/controlnet.py

Files changed (1)
  1. models/controlnet.py +2 -44
models/controlnet.py CHANGED
@@ -43,10 +43,9 @@ class ControlNetOutput(BaseOutput):
     down_block_res_samples: Tuple[torch.Tensor]
     mid_block_res_sample: torch.Tensor
 
-"""
+
 class ControlNetConditioningEmbedding(nn.Module):
     """
-    """
     Quoting from https://arxiv.org/abs/2302.05543: "Stable Diffusion uses a pre-processing method similar to VQ-GAN
     [11] to convert the entire dataset of 512 × 512 images into smaller 64 × 64 “latent images” for stabilized
     training. This requires ControlNets to convert image-based conditions to 64 × 64 feature space to match the
@@ -54,7 +53,7 @@ class ControlNetConditioningEmbedding(nn.Module):
     (activated by ReLU, channels are 16, 32, 64, 128, initialized with Gaussian weights, trained jointly with the full
     model) to encode image-space conditions ... into feature maps ..."
     """
-    """
+
 
     def __init__(
         self,
@@ -89,48 +88,7 @@ class ControlNetConditioningEmbedding(nn.Module):
         embedding = self.conv_out(embedding)
 
         return embedding
-"""
-
-class ControlNetConditioningEmbedding(nn.Module):
-    def __init__(
-        self,
-        conditioning_embedding_channels: int,
-        conditioning_channels: int = 3,
-        block_out_channels: Tuple[int] = (16, 32, 96, 256),
-    ):
-        super().__init__()
 
-        self.conv_in = InflatedConv3d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
-        self.bn_in = nn.BatchNorm3d(block_out_channels[0])
-
-        self.blocks = nn.ModuleList([])
-        self.bns = nn.ModuleList([])
-
-        for i in range(len(block_out_channels) - 1):
-            channel_in = block_out_channels[i]
-            channel_out = block_out_channels[i + 1]
-            self.blocks.append(InflatedConv3d(channel_in, channel_in, kernel_size=3, padding=1))
-            self.bns.append(nn.BatchNorm3d(channel_in))
-            self.blocks.append(InflatedConv3d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
-            self.bns.append(nn.BatchNorm3d(channel_out))
-
-        self.conv_out = zero_module(
-            InflatedConv3d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
-        )
-
-    def forward(self, conditioning):
-        embedding = self.conv_in(conditioning)
-        embedding = self.bn_in(embedding)
-        embedding = F.silu(embedding)
-
-        for block, bn in zip(self.blocks, self.bns):
-            embedding = block(embedding)
-            embedding = bn(embedding)
-            embedding = F.silu(embedding)
-
-        embedding = self.conv_out(embedding)
-
-        return embedding
 
 class ControlNetModel3D(ModelMixin, ConfigMixin):
     _supports_gradient_checkpointing = True
 
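For reference, the class this commit keeps is the string-quoted ControlNetConditioningEmbedding above; most of its body falls outside the hunks shown. Below is a minimal sketch of that conditioning embedding following the upstream diffusers implementation of the same class. It is an illustration, not this file's exact contents: the 2D nn.Conv2d layers and the zero_module helper shown here are assumptions (the repository's variant appears to use InflatedConv3d instead, as the deleted code suggests, so the same weights can run on video latents).

from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F


def zero_module(module: nn.Module) -> nn.Module:
    # Assumed helper: zero-initialize every parameter so the final projection starts as a no-op.
    for p in module.parameters():
        nn.init.zeros_(p)
    return module


class ControlNetConditioningEmbedding2D(nn.Module):
    # 2D sketch of the conditioning embedding described in the docstring quote:
    # it downsamples an image-space condition (e.g. 512x512) to the 64x64 latent
    # resolution and projects it to the UNet's feature width.
    def __init__(
        self,
        conditioning_embedding_channels: int,
        conditioning_channels: int = 3,
        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
    ):
        super().__init__()
        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)

        self.blocks = nn.ModuleList([])
        for i in range(len(block_out_channels) - 1):
            channel_in = block_out_channels[i]
            channel_out = block_out_channels[i + 1]
            self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
            self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))

        # Zero convolution: the control branch contributes nothing at initialization.
        self.conv_out = zero_module(
            nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
        )

    def forward(self, conditioning: torch.Tensor) -> torch.Tensor:
        embedding = F.silu(self.conv_in(conditioning))
        for block in self.blocks:
            embedding = F.silu(block(embedding))
        embedding = self.conv_out(embedding)
        return embedding


if __name__ == "__main__":
    embed = ControlNetConditioningEmbedding2D(conditioning_embedding_channels=320)
    cond = torch.randn(1, 3, 512, 512)  # image-space condition
    print(embed(cond).shape)            # torch.Size([1, 320, 64, 64])

The zero-initialized conv_out follows ControlNet's "zero convolution" idea: at the start of training the conditioning branch adds nothing to the frozen UNet's activations, so fine-tuning begins from the base model's behavior.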