init repo
This view is limited to 50 files because it contains too many changes.
- .gitattributes +1 -0
- FGT_codes/FGT/checkpoint/config.yaml +34 -0
- FGT_codes/FGT/checkpoint/fgt.pth.tar +3 -0
- FGT_codes/FGT/config/data_info.yaml +11 -0
- FGT_codes/FGT/config/davis_name2len.pkl +3 -0
- FGT_codes/FGT/config/davis_name2len_train.pkl +3 -0
- FGT_codes/FGT/config/davis_name2len_val.pkl +3 -0
- FGT_codes/FGT/config/train.yaml +93 -0
- FGT_codes/FGT/config/valid_config.yaml +8 -0
- FGT_codes/FGT/config/youtubevos_name2len.pkl +3 -0
- FGT_codes/FGT/data/__init__.py +49 -0
- FGT_codes/FGT/data/train_dataset.py +165 -0
- FGT_codes/FGT/data/util/MaskModel.py +123 -0
- FGT_codes/FGT/data/util/STTN_mask.py +244 -0
- FGT_codes/FGT/data/util/__init__.py +28 -0
- FGT_codes/FGT/data/util/flow_utils/__init__.py +0 -0
- FGT_codes/FGT/data/util/flow_utils/flow_reversal.py +77 -0
- FGT_codes/FGT/data/util/flow_utils/region_fill.py +142 -0
- FGT_codes/FGT/data/util/freeform_masks.py +266 -0
- FGT_codes/FGT/data/util/mask_generators.py +217 -0
- FGT_codes/FGT/data/util/readers.py +527 -0
- FGT_codes/FGT/data/util/util.py +259 -0
- FGT_codes/FGT/data/util/utils.py +158 -0
- FGT_codes/FGT/flowCheckPoint/config.yaml +11 -0
- FGT_codes/FGT/flowCheckPoint/lafc_single.pth.tar +3 -0
- FGT_codes/FGT/inputs.py +83 -0
- FGT_codes/FGT/metrics/__init__.py +31 -0
- FGT_codes/FGT/metrics/psnr.py +10 -0
- FGT_codes/FGT/metrics/ssim.py +46 -0
- FGT_codes/FGT/models/BaseNetwork.py +46 -0
- FGT_codes/FGT/models/__init__.py +0 -0
- FGT_codes/FGT/models/__pycache__/BaseNetwork.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/__pycache__/__init__.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/__pycache__/model.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/lafc_single.py +114 -0
- FGT_codes/FGT/models/model.py +284 -0
- FGT_codes/FGT/models/temporal_patch_gan.py +76 -0
- FGT_codes/FGT/models/transformer_base/__init__.py +0 -0
- FGT_codes/FGT/models/transformer_base/__pycache__/__init__.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/transformer_base/__pycache__/attention_base.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/transformer_base/__pycache__/attention_flow.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/transformer_base/__pycache__/ffn_base.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/transformer_base/attention_base.py +106 -0
- FGT_codes/FGT/models/transformer_base/attention_flow.py +171 -0
- FGT_codes/FGT/models/transformer_base/ffn_base.py +114 -0
- FGT_codes/FGT/models/utils/RAFT/utils/__init__.py +0 -0
- FGT_codes/FGT/models/utils/RAFT/utils/utils.py +82 -0
- FGT_codes/FGT/models/utils/__init__.py +0 -0
- FGT_codes/FGT/models/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- FGT_codes/FGT/models/utils/__pycache__/network_blocks_2d.cpython-39.pyc +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 *.pth.tar filter=lfs diff=lfs merge=lfs -text
+*.o filter=lfs diff=lfs merge=lfs -text
FGT_codes/FGT/checkpoint/config.yaml
ADDED
@@ -0,0 +1,34 @@
+PASSMASK: 1
+alpha: 0.3
+ape: 1
+cnum: 64
+conv_type: vanilla
+dist_cnum: 32
+drop: 0
+frame_hidden: 512
+gd: 4
+in_channel: 4
+init_weights: 1
+input_resolution: !!python/tuple
+- 240
+- 432
+flow_inChannel: 2
+flow_cnum: 64
+flow_hidden: 256
+kernel_size: !!python/tuple
+- 7
+- 7
+mlp_ratio: 40
+numBlocks: 8
+num_head: 4
+padding: !!python/tuple
+- 3
+- 3
+stride: !!python/tuple
+- 3
+- 3
+sw: 8
+tw: 2
+use_bias: 1
+norm: None
+model: model
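The `!!python/tuple` tags above come from dumping a Python config with PyYAML; they cannot be read back with `yaml.safe_load`, which rejects Python-native tags. A minimal loading sketch (the loader choice is our assumption; only the file path comes from this commit):

import yaml

# !!python/tuple entries such as input_resolution and kernel_size need a
# loader that may construct Python objects; yaml.safe_load raises a
# ConstructorError on them.
with open('FGT_codes/FGT/checkpoint/config.yaml') as f:
    cfg = yaml.unsafe_load(f)

print(cfg['input_resolution'])  # (240, 432)
print(repr(cfg['norm']))        # 'None' -- an unquoted None is a string, not YAML null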
FGT_codes/FGT/checkpoint/fgt.pth.tar
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:41352263b2d14aec73f0dcf75c4bf5155ddb23404aba6f023a0300aadfd7672f
+size 157341393
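fgt.pth.tar is stored with Git LFS, so what the commit actually contains is the three-line pointer above; the 157 MB checkpoint is materialized by `git lfs pull`. A hedged sketch of guarding against loading an unfetched pointer (the helper name is ours):

def is_lfs_pointer(path):
    # Git LFS pointers are tiny text files whose first line names the spec URL.
    with open(path, 'rb') as f:
        return f.read(40).startswith(b'version https://git-lfs.github.com/')

# Hypothetical guard before torch.load, which would otherwise fail with a
# confusing deserialization error on a pointer file:
if is_lfs_pointer('FGT_codes/FGT/checkpoint/fgt.pth.tar'):
    raise RuntimeError('Checkpoint is an unfetched LFS pointer; run `git lfs pull` first.')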
FGT_codes/FGT/config/data_info.yaml
ADDED
@@ -0,0 +1,11 @@
+# dataset general info
+frame_path: youtubevos_frames
+flow_path: youtubevos_flows
+name2len: config/youtubevos_name2len.pkl
+
+flow:
+  flow_height: 240
+  flow_width: 432
+augments: False
+colors: RGB
+ext: .jpg
FGT_codes/FGT/config/davis_name2len.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6607939cc02910f5badaebff46242f299597e93c07d77b6d740a3004f179f50c
+size 1621
FGT_codes/FGT/config/davis_name2len_train.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ad5e89d5486b38f74ac62d08924a4ff7caa445d34df827385457e8516d4763f
+size 1073
FGT_codes/FGT/config/davis_name2len_val.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30b2a23f943f40f2a09e98b474b88b07271e46a1224cb415650432d491cc1896
+size 188
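These `name2len` pickles are plain dictionaries mapping a video directory name to its frame count; `train_dataset.py` below loads them exactly this way in `VideoBasedDataset.__init__` and uses the counts to bound frame sampling. A small inspection sketch:

import pickle

with open('FGT_codes/FGT/config/davis_name2len_val.pkl', 'rb') as f:
    name2len = pickle.load(f)  # {video_name: number_of_frames}

for video, n_frames in list(name2len.items())[:3]:  # peek at a few entries
    print(video, n_frames)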
FGT_codes/FGT/config/train.yaml
ADDED
@@ -0,0 +1,93 @@
+### General settings
+name: FGT_train
+use_tb_logger: true
+outputdir: /myData/ret/experiments
+datadir: /myData
+record_iter: 16
+
+### Calling definition
+model: model
+datasetName_train: train_dataset
+network: network
+
+### datasets
+datasets:
+  train:
+    name: youtubevos
+    type: video
+    mode: train
+    dataInfo_config: ./config/data_info.yaml
+    use_shuffle: True
+    n_workers: 0
+    batch_size: 2
+
+  val:
+    name: youtubevos
+    type: video
+    mode: val
+    use_shuffle: False
+    n_workers: 1
+    batch_size: 1
+    val_config: ./config/valid_config.yaml
+
+### train settings
+train:
+  lr: 0.0001
+  lr_decay: 0.1
+  manual_seed: 10
+  BETA1: 0.9
+  BETA2: 0.999
+  MAX_ITERS: 500000
+  UPDATE_INTERVAL: 300000  # 400000 is also OK
+  WARMUP: ~
+  val_freq: 1  # 1 is for debugging; enlarge to 50 for regular training
+  TEMPORAL_GAN: ~  # without temporal GAN
+
+### logger
+logger:
+  PRINT_FREQ: 16
+  SAVE_CHECKPOINT_FREQ: 4000  # 100 is for debugging
+
+### Data related parameters
+flow2rgb: 1
+flow_direction: for
+num_frames: 5
+sample: random
+max_val: 0.01
+
+### Model related parameters
+res_h: 240
+res_w: 432
+in_channel: 4
+cnum: 64
+flow_inChannel: 2
+flow_cnum: 64
+dist_cnum: 32
+frame_hidden: 512
+flow_hidden: 256
+PASSMASK: 1
+num_blocks: 8
+kernel_size_w: 7
+kernel_size_h: 7
+stride_h: 3
+stride_w: 3
+num_head: 4
+conv_type: vanilla
+norm: None
+use_bias: 1
+ape: 1
+pos_mode: single
+mlp_ratio: 40
+drop: 0
+init_weights: 1
+tw: 2
+sw: 8
+gd: 4
+
+### Loss weights
+L1M: 1
+L1V: 1
+adv: 0.01
+
+### inference parameters
+ref_length: 10
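Two notational details in this config are easy to misread: `~` is YAML's null (so `WARMUP` and `TEMPORAL_GAN` arrive in Python as `None`), while the unquoted `None` under `norm` is just the string 'None'. A quick check:

import yaml

cfg = yaml.safe_load("""
WARMUP: ~
TEMPORAL_GAN: ~
norm: None
use_shuffle: True
""")
print(cfg['WARMUP'] is None)        # True: ~ parses as null
print(cfg['TEMPORAL_GAN'] is None)  # True
print(repr(cfg['norm']))            # 'None' -- a string, not Python None
print(cfg['use_shuffle'])           # True -- a real boolean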
FGT_codes/FGT/config/valid_config.yaml
ADDED
@@ -0,0 +1,8 @@
+flow_height: 240
+flow_width: 432
+data_root: davis_valid_flows
+mask_root: rectMask_96
+frame_root: JPEGImages/480p
+flow_root: davis_test_flows
+batch_size: 1
+name2len: config/davis_name2len_val.pkl
FGT_codes/FGT/config/youtubevos_name2len.pkl
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:60410308d4a0e780a531290d8bddc7f204bc0e8a500eab7c01c563b8efce9753
+size 75501
FGT_codes/FGT/data/__init__.py
ADDED
@@ -0,0 +1,49 @@
+import logging
+import torch
+import torch.utils.data
+from importlib import import_module
+
+
+def create_dataloader(phase, dataset, dataset_opt, opt=None, sampler=None):
+    logger = logging.getLogger('base')
+    if phase == 'train':
+        num_workers = dataset_opt['n_workers'] * opt['world_size']
+        batch_size = dataset_opt['batch_size']
+        if sampler is not None:
+            logger.info('N_workers: {}, batch_size: {} DDP train dataloader has been established'.format(
+                num_workers, batch_size))
+            return torch.utils.data.DataLoader(dataset, batch_size=batch_size,
+                                               num_workers=num_workers, sampler=sampler,
+                                               pin_memory=True)
+        else:
+            logger.info('N_workers: {}, batch_size: {} train dataloader has been established'.format(
+                num_workers, batch_size))
+            return torch.utils.data.DataLoader(dataset, batch_size=batch_size,
+                                               num_workers=num_workers, shuffle=True,
+                                               pin_memory=True)
+
+    else:
+        logger.info(
+            'N_workers: {}, batch_size: {} validate/test dataloader has been established'.format(
+                dataset_opt['n_workers'], dataset_opt['batch_size']))
+        return torch.utils.data.DataLoader(dataset, batch_size=dataset_opt['batch_size'], shuffle=False,
+                                           num_workers=dataset_opt['n_workers'],
+                                           pin_memory=False)
+
+
+def create_dataset(dataset_opt, dataInfo, phase, dataset_name):
+    if phase == 'train':
+        dataset_package = import_module('data.{}'.format(dataset_name))
+        dataset = dataset_package.VideoBasedDataset(dataset_opt, dataInfo)
+
+        mode = dataset_opt['mode']
+        logger = logging.getLogger('base')
+        logger.info(
+            '{} train dataset [{:s} - {:s} - {:s}] is created.'.format(dataset_opt['type'].upper(),
+                                                                       dataset.__class__.__name__,
+                                                                       dataset_opt['name'], mode))
+    else:  # validation and test datasets are not handled here
+        raise ValueError('No dataset initialized for valdataset')
+
+    return dataset
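A hedged sketch of how these two factories compose. `VideoBasedDataset` (defined in `train_dataset.py` below) also reads keys such as `sample`, `num_frames`, `flow2rgb`, `flow_direction` and `input_resolution` from `dataset_opt`, so the training script has to merge those top-level `train.yaml` values into the per-dataset options; the literal dicts here are illustrative only:

from data import create_dataset, create_dataloader

dataset_opt = {
    'name': 'youtubevos', 'type': 'video', 'mode': 'train',
    'use_shuffle': True, 'n_workers': 0, 'batch_size': 2,
    # merged in from the top level of train.yaml:
    'sample': 'random', 'num_frames': 5, 'flow2rgb': 1,
    'flow_direction': 'for', 'input_resolution': (240, 432),
}
data_info = {
    'frame_path': 'youtubevos_frames',
    'flow_path': 'youtubevos_flows',
    'name2len': 'config/youtubevos_name2len.pkl',
}
dataset = create_dataset(dataset_opt, data_info, phase='train',
                         dataset_name='train_dataset')
loader = create_dataloader('train', dataset, dataset_opt,
                           opt={'world_size': 1},  # single-process assumption
                           sampler=None)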
FGT_codes/FGT/data/train_dataset.py
ADDED
@@ -0,0 +1,165 @@
+import random
+import pickle
+
+import logging
+import torch
+import cv2
+import os
+
+from torch.utils.data.dataset import Dataset
+import numpy as np
+import cvbase
+from .util.STTN_mask import create_random_shape_with_random_motion
+import imageio
+from .util.flow_utils import region_fill as rf
+
+logger = logging.getLogger('base')
+
+
+class VideoBasedDataset(Dataset):
+    def __init__(self, opt, dataInfo):
+        self.opt = opt
+        self.sampleMethod = opt['sample']
+        self.dataInfo = dataInfo
+        self.height, self.width = self.opt['input_resolution']
+        self.frame_path = dataInfo['frame_path']
+        self.flow_path = dataInfo['flow_path']  # the path of the optical flows
+        self.train_list = os.listdir(self.frame_path)
+        self.name2length = self.dataInfo['name2len']
+        with open(self.name2length, 'rb') as f:
+            self.name2length = pickle.load(f)
+        self.sequenceLen = self.opt['num_frames']
+        self.flow2rgb = opt['flow2rgb']  # whether to map flow to the RGB domain
+        self.flow_direction = opt['flow_direction']  # in ['for', 'back', 'bi']: forward, backward or bidirectional flows
+
+    def __len__(self):
+        return len(self.train_list)
+
+    def __getitem__(self, idx):
+        try:
+            item = self.load_item(idx)
+        except Exception:
+            print('Loading error: ' + self.train_list[idx])
+            item = self.load_item(0)
+        return item
+
+    def frameSample(self, frameLen, sequenceLen):
+        if self.sampleMethod == 'random':
+            indices = [i for i in range(frameLen)]
+            sampleIndices = random.sample(indices, sequenceLen)
+        elif self.sampleMethod == 'seq':
+            # pick a random window of `sequenceLen` consecutive frames
+            pivot = random.randint(0, frameLen - sequenceLen)
+            sampleIndices = [i for i in range(pivot, pivot + sequenceLen)]
+        else:
+            raise ValueError('Cannot determine the sample method {}'.format(self.sampleMethod))
+        return sampleIndices
+
+    def load_item(self, idx):
+        video = self.train_list[idx]
+        frame_dir = os.path.join(self.frame_path, video)
+        forward_flow_dir = os.path.join(self.flow_path, video, 'forward_flo')
+        backward_flow_dir = os.path.join(self.flow_path, video, 'backward_flo')
+        frameLen = self.name2length[video]
+        flowLen = frameLen - 1
+        assert frameLen > self.sequenceLen, 'Frame length {} is not larger than the sequence length'.format(frameLen)
+        sampledIndices = self.frameSample(frameLen, self.sequenceLen)
+
+        # generate random masks for these sampled frames
+        candidateMasks = create_random_shape_with_random_motion(frameLen, 0.9, 1.1, 1, 10)
+
+        # read the frames and masks
+        frames, masks, forward_flows, backward_flows = [], [], [], []
+        for i in range(len(sampledIndices)):
+            frame = self.read_frame(os.path.join(frame_dir, '{:05d}.jpg'.format(sampledIndices[i])), self.height,
+                                    self.width)
+            mask = self.read_mask(candidateMasks[sampledIndices[i]], self.height, self.width)
+            frames.append(frame)
+            masks.append(mask)
+            if self.flow_direction == 'for':
+                forward_flow = self.read_forward_flow(forward_flow_dir, sampledIndices[i], flowLen)
+                forward_flow = self.diffusion_flow(forward_flow, mask)
+                forward_flows.append(forward_flow)
+            elif self.flow_direction == 'back':
+                backward_flow = self.read_backward_flow(backward_flow_dir, sampledIndices[i])
+                backward_flow = self.diffusion_flow(backward_flow, mask)
+                backward_flows.append(backward_flow)
+            elif self.flow_direction == 'bi':
+                forward_flow = self.read_forward_flow(forward_flow_dir, sampledIndices[i], flowLen)
+                forward_flow = self.diffusion_flow(forward_flow, mask)
+                forward_flows.append(forward_flow)
+                backward_flow = self.read_backward_flow(backward_flow_dir, sampledIndices[i])
+                backward_flow = self.diffusion_flow(backward_flow, mask)
+                backward_flows.append(backward_flow)
+            else:
+                raise ValueError('Unknown flow direction mode: {}'.format(self.flow_direction))
+        inputs = {'frames': frames, 'masks': masks, 'forward_flo': forward_flows, 'backward_flo': backward_flows}
+        inputs = self.to_tensor(inputs)
+        inputs['frames'] = (inputs['frames'] / 255.) * 2 - 1
+        return inputs
+
+    def diffusion_flow(self, flow, mask):
+        # fill the masked flow region by Laplacian diffusion from the valid border
+        flow_filled = np.zeros(flow.shape)
+        flow_filled[:, :, 0] = rf.regionfill(flow[:, :, 0] * (1 - mask), mask)
+        flow_filled[:, :, 1] = rf.regionfill(flow[:, :, 1] * (1 - mask), mask)
+        return flow_filled
+
+    def read_frame(self, path, height, width):
+        frame = imageio.imread(path)
+        frame = cv2.resize(frame, (width, height), interpolation=cv2.INTER_LINEAR)
+        return frame
+
+    def read_mask(self, mask, height, width):
+        mask = np.array(mask)
+        mask = mask / 255.
+        raw_mask = (mask > 0.5).astype(np.uint8)
+        raw_mask = cv2.resize(raw_mask, dsize=(width, height), interpolation=cv2.INTER_NEAREST)
+        return raw_mask
+
+    def read_forward_flow(self, forward_flow_dir, sampledIndex, flowLen):
+        if sampledIndex >= flowLen:  # the last frame has no forward flow; clamp to the previous one
+            sampledIndex = flowLen - 1
+        flow = cvbase.read_flow(os.path.join(forward_flow_dir, '{:05d}.flo'.format(sampledIndex)))
+        height, width = flow.shape[:2]
+        flow = cv2.resize(flow, (self.width, self.height), interpolation=cv2.INTER_LINEAR)
+        flow[:, :, 0] = flow[:, :, 0] / width * self.width
+        flow[:, :, 1] = flow[:, :, 1] / height * self.height
+        return flow
+
+    def read_backward_flow(self, backward_flow_dir, sampledIndex):
+        if sampledIndex > 0:  # the first frame has no backward flow; clamp to index 0
+            sampledIndex -= 1
+        flow = cvbase.read_flow(os.path.join(backward_flow_dir, '{:05d}.flo'.format(sampledIndex)))
+        height, width = flow.shape[:2]
+        flow = cv2.resize(flow, (self.width, self.height), interpolation=cv2.INTER_LINEAR)
+        flow[:, :, 0] = flow[:, :, 0] / width * self.width
+        flow[:, :, 1] = flow[:, :, 1] / height * self.height
+        return flow
+
+    def to_tensor(self, data_list):
+        """
+        Args:
+            data_list: a dict whose values are lists of numpy arrays (or single arrays)
+
+        Returns: the same dict with each value stacked into a tensor
+        """
+        keys = list(data_list.keys())
+        for key in keys:
+            if data_list[key] is None or data_list[key] == []:
+                data_list.pop(key)
+            else:
+                item = data_list[key]
+                if not isinstance(item, list):
+                    item = torch.from_numpy(np.transpose(item, (2, 0, 1))).float()  # [c, h, w]
+                else:
+                    item = np.stack(item, axis=0)
+                    if len(item.shape) == 3:  # [t, h, w]
+                        item = item[:, :, :, np.newaxis]
+                    item = torch.from_numpy(np.transpose(item, (0, 3, 1, 2))).float()  # [t, c, h, w]
+                data_list[key] = item
+        return data_list
FGT_codes/FGT/data/util/MaskModel.py
ADDED
@@ -0,0 +1,123 @@
+import random
+import numpy as np
+
+class RandomMask():
+    def __init__(self, videoLength, dataInfo):
+        self.videoLength = videoLength
+        self.imageHeight, self.imageWidth = dataInfo['image']['image_height'], \
+                                            dataInfo['image']['image_width']
+        self.maskHeight, self.maskWidth = dataInfo['mask']['mask_height'], \
+                                          dataInfo['mask']['mask_width']
+        try:
+            self.maxDeltaHeight, self.maxDeltaWidth = dataInfo['mask']['max_delta_height'], \
+                                                      dataInfo['mask']['max_delta_width']
+        except KeyError:
+            self.maxDeltaHeight, self.maxDeltaWidth = 0, 0
+
+        try:
+            self.verticalMargin, self.horizontalMargin = dataInfo['mask']['vertical_margin'], \
+                                                         dataInfo['mask']['horizontal_margin']
+        except KeyError:
+            self.verticalMargin, self.horizontalMargin = 0, 0
+
+    def __call__(self):
+        from .utils import random_bbox
+        from .utils import bbox2mask
+        masks = []
+        bbox = random_bbox(self.imageHeight, self.imageWidth, self.verticalMargin, self.horizontalMargin,
+                           self.maskHeight, self.maskWidth)
+        if random.uniform(0, 1) > 0.5:
+            mask = bbox2mask(self.imageHeight, self.imageWidth, 0, 0, bbox)
+            for frame in range(self.videoLength):
+                masks.append(mask)
+        else:
+            for frame in range(self.videoLength):
+                delta_h, delta_w = random.randint(-3, 3), random.randint(-3, 3)  # shift the box by at most three pixels in each direction per frame
+                bbox = list(bbox)
+                bbox[0] = min(max(self.verticalMargin, bbox[0] + delta_h), self.imageHeight - self.verticalMargin - bbox[2])
+                bbox[1] = min(max(self.horizontalMargin, bbox[1] + delta_w), self.imageWidth - self.horizontalMargin - bbox[3])
+                mask = bbox2mask(self.imageHeight, self.imageWidth, 0, 0, bbox)
+                masks.append(mask)
+        masks = np.stack(masks, axis=0)
+        if len(masks.shape) == 3:
+            masks = masks[:, :, :, np.newaxis]
+        assert len(masks.shape) == 4, 'Wrong mask dimension {}'.format(len(masks.shape))
+        return masks
+
+
+class MidRandomMask():
+    ### This mask is considered without random motion
+    def __init__(self, videoLength, dataInfo):
+        self.videoLength = videoLength
+        self.imageHeight, self.imageWidth = dataInfo['image']['image_height'], \
+                                            dataInfo['image']['image_width']
+        self.maskHeight, self.maskWidth = dataInfo['mask']['mask_height'], \
+                                          dataInfo['mask']['mask_width']
+
+    def __call__(self):
+        from .utils import mid_bbox_mask
+        mask = mid_bbox_mask(self.imageHeight, self.imageWidth, self.maskHeight, self.maskWidth)
+        masks = []
+        for _ in range(self.videoLength):
+            masks.append(mask)
+        return masks
+
+
+class MatrixMask():
+    ### This mask is considered without random motion
+    def __init__(self, videoLength, dataInfo):
+        self.videoLength = videoLength
+        self.imageHeight, self.imageWidth = dataInfo['image']['image_height'], \
+                                            dataInfo['image']['image_width']
+        self.maskHeight, self.maskWidth = dataInfo['mask']['mask_height'], \
+                                          dataInfo['mask']['mask_width']
+        try:
+            self.row, self.column = dataInfo['mask']['row'], \
+                                    dataInfo['mask']['column']
+        except KeyError:
+            self.row, self.column = 5, 4
+
+    def __call__(self):
+        from .utils import matrix2bbox
+        mask = matrix2bbox(self.imageHeight, self.imageWidth, self.maskHeight,
+                           self.maskWidth, self.row, self.column)
+        masks = []
+        for video in range(self.videoLength):
+            masks.append(mask)
+        return masks
+
+
+class FreeFormMask():
+    def __init__(self, videoLength, dataInfo):
+        self.videoLength = videoLength
+        self.imageHeight, self.imageWidth = dataInfo['image']['image_height'], \
+                                            dataInfo['image']['image_width']
+        self.maxVertex = dataInfo['mask']['max_vertex']
+        self.maxLength = dataInfo['mask']['max_length']
+        self.maxBrushWidth = dataInfo['mask']['max_brush_width']
+        self.maxAngle = dataInfo['mask']['max_angle']
+
+    def __call__(self):
+        from .utils import freeFormMask
+        mask = freeFormMask(self.imageHeight, self.imageWidth,
+                            self.maxVertex, self.maxLength,
+                            self.maxBrushWidth, self.maxAngle)
+        return mask
+
+
+class StationaryMask():
+    def __init__(self, videoLength, dataInfo):
+        self.videoLength = videoLength
+        self.imageHeight, self.imageWidth = dataInfo['image']['image_height'], \
+                                            dataInfo['image']['image_width']
+        # self.maxPointNum = dataInfo['mask']['max_point_num']
+        # self.maxLength = dataInfo['mask']['max_length']
+
+    def __call__(self):
+        from .STTN_mask import create_random_shape_with_random_motion
+        masks = create_random_shape_with_random_motion(self.videoLength, 0.9, 1.1, 1, 10, self.imageHeight, self.imageWidth)
+        masks = np.stack(masks, axis=0)
+        if len(masks.shape) == 3:
+            masks = masks[:, :, :, np.newaxis]
+        assert len(masks.shape) == 4, 'Masks have a wrong shape {}'.format(len(masks.shape))
+        return masks
FGT_codes/FGT/data/util/STTN_mask.py
ADDED
@@ -0,0 +1,244 @@
+import matplotlib.patches as patches
+from matplotlib.path import Path
+import os
+import sys
+import io
+import cv2
+import time
+import math
+import argparse
+import shutil
+import random
+import zipfile
+from glob import glob
+import numpy as np
+import torch.nn.functional as F
+import torchvision.transforms as transforms
+from PIL import Image, ImageOps, ImageDraw, ImageFilter
+
+import torch
+import torchvision
+import torch.nn as nn
+import torch.distributed as dist
+
+import matplotlib
+matplotlib.use('agg')
+from matplotlib import pyplot as plt
+
+
+class GroupRandomHorizontalFlip(object):
+    """Randomly horizontally flips the given PIL.Image with a probability of 0.5
+    """
+
+    def __init__(self, is_flow=False):
+        self.is_flow = is_flow
+
+    def __call__(self, img_group, is_flow=False):
+        v = random.random()
+        if v < 0.5:
+            ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
+            if self.is_flow:
+                for i in range(0, len(ret), 2):
+                    # invert flow pixel values when flipping
+                    ret[i] = ImageOps.invert(ret[i])
+            return ret
+        else:
+            return img_group
+
+
+class Stack(object):
+    def __init__(self, roll=False):
+        self.roll = roll
+
+    def __call__(self, img_group):
+        mode = img_group[0].mode
+        if mode == '1':
+            img_group = [img.convert('L') for img in img_group]
+            mode = 'L'
+        if mode == 'L':
+            return np.stack([np.expand_dims(x, 2) for x in img_group], axis=2)
+        elif mode == 'RGB':
+            if self.roll:
+                return np.stack([np.array(x)[:, :, ::-1] for x in img_group], axis=2)
+            else:
+                return np.stack(img_group, axis=2)
+        else:
+            raise NotImplementedError("Image mode {}".format(mode))
+
+
+class ToTorchFormatTensor(object):
+    """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
+    to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
+
+    def __init__(self, div=True):
+        self.div = div
+
+    def __call__(self, pic):
+        if isinstance(pic, np.ndarray):
+            # numpy img: [L, C, H, W]
+            img = torch.from_numpy(pic).permute(2, 3, 0, 1).contiguous()
+        else:
+            # handle PIL Image
+            img = torch.ByteTensor(
+                torch.ByteStorage.from_buffer(pic.tobytes()))
+            img = img.view(pic.size[1], pic.size[0], len(pic.mode))
+            # put it from HWC to CHW format
+            # yikes, this transpose takes 80% of the loading time/CPU
+            img = img.transpose(0, 1).transpose(0, 2).contiguous()
+        img = img.float().div(255) if self.div else img.float()
+        return img
+
+
+# ##########################################
+# ##########################################
+
+def create_random_shape_with_random_motion(video_length, zoomin, zoomout, rotmin, rotmax, imageHeight=240, imageWidth=432):
+    # get a random shape
+    assert zoomin < 1, "Zoom-in parameter must be smaller than 1"
+    assert zoomout > 1, "Zoom-out parameter must be larger than 1"
+    assert rotmin < rotmax, "Minimum rotation must be smaller than maximum rotation!"
+    height = random.randint(imageHeight//3, imageHeight-1)
+    width = random.randint(imageWidth//3, imageWidth-1)
+    edge_num = random.randint(6, 8)
+    ratio = random.randint(6, 8)/10
+    region = get_random_shape(
+        edge_num=edge_num, ratio=ratio, height=height, width=width)
+    region_width, region_height = region.size
+    # get random position
+    x, y = random.randint(
+        0, imageHeight-region_height), random.randint(0, imageWidth-region_width)
+    velocity = get_random_velocity(max_speed=3)
+    m = Image.fromarray(np.zeros((imageHeight, imageWidth)).astype(np.uint8))
+    m.paste(region, (y, x, y+region.size[0], x+region.size[1]))
+    masks = [m.convert('L')]
+    # return fixed masks
+    if random.uniform(0, 1) > 0.5:
+        return masks*video_length  # directly copy the base mask for all frames
+    # return moving masks
+    for _ in range(video_length-1):
+        x, y, velocity = random_move_control_points(
+            x, y, imageHeight, imageWidth, velocity, region.size, maxLineAcceleration=(3, 0.5), maxInitSpeed=3)
+        m = Image.fromarray(
+            np.zeros((imageHeight, imageWidth)).astype(np.uint8))
+        ### added by kaidong, to simulate zoom-in, zoom-out and rotation
+        extra_transform = random.uniform(0, 1)
+        # zoom in and zoom out
+        if extra_transform > 0.75:
+            resize_coefficient = random.uniform(zoomin, zoomout)
+            region = region.resize((math.ceil(region_width * resize_coefficient), math.ceil(region_height * resize_coefficient)), Image.NEAREST)
+            m.paste(region, (y, x, y + region.size[0], x + region.size[1]))
+            region_width, region_height = region.size
+        # rotation
+        elif extra_transform > 0.5:
+            m.paste(region, (y, x, y + region.size[0], x + region.size[1]))
+            m = m.rotate(random.randint(rotmin, rotmax))
+            # region_width, region_height = region.size
+        ### end
+        else:
+            m.paste(region, (y, x, y+region.size[0], x+region.size[1]))
+        masks.append(m.convert('L'))
+    return masks
+
+
+def get_random_shape(edge_num=9, ratio=0.7, width=432, height=240):
+    '''
+    There is the initial point and 3 points per cubic bezier curve.
+    Thus, the curve will only pass through n points, which will be the sharp edges.
+    The other 2 modify the shape of the bezier curve.
+    edge_num, number of possibly sharp edges
+    points_num, number of points in the Path
+    ratio, (0, 1) magnitude of the perturbation from the unit circle
+    '''
+    points_num = edge_num*3 + 1
+    angles = np.linspace(0, 2*np.pi, points_num)
+    codes = np.full(points_num, Path.CURVE4)
+    codes[0] = Path.MOVETO
+    # Using this instead of Path.CLOSEPOLY avoids an unnecessary straight line
+    verts = np.stack((np.cos(angles), np.sin(angles))).T * \
+        (2*ratio*np.random.random(points_num)+1-ratio)[:, None]
+    verts[-1, :] = verts[0, :]
+    path = Path(verts, codes)
+    # draw paths into images
+    fig = plt.figure()
+    ax = fig.add_subplot(111)
+    patch = patches.PathPatch(path, facecolor='black', lw=2)
+    ax.add_patch(patch)
+    ax.set_xlim(np.min(verts)*1.1, np.max(verts)*1.1)
+    ax.set_ylim(np.min(verts)*1.1, np.max(verts)*1.1)
+    ax.axis('off')  # removes the axis to leave only the shape
+    fig.canvas.draw()
+    # convert plt images into numpy images
+    data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+    data = data.reshape((fig.canvas.get_width_height()[::-1] + (3,)))
+    plt.close(fig)
+    # postprocess
+    data = cv2.resize(data, (width, height))[:, :, 0]
+    data = (1 - np.array(data > 0).astype(np.uint8))*255
+    coordinates = np.where(data > 0)
+    xmin, xmax, ymin, ymax = np.min(coordinates[0]), np.max(
+        coordinates[0]), np.min(coordinates[1]), np.max(coordinates[1])
+    region = Image.fromarray(data).crop((ymin, xmin, ymax, xmax))
+    return region
+
+
+def random_accelerate(velocity, maxAcceleration, dist='uniform'):
+    speed, angle = velocity
+    d_speed, d_angle = maxAcceleration
+    if dist == 'uniform':
+        speed += np.random.uniform(-d_speed, d_speed)
+        angle += np.random.uniform(-d_angle, d_angle)
+    elif dist == 'gaussian':
+        speed += np.random.normal(0, d_speed / 2)
+        angle += np.random.normal(0, d_angle / 2)
+    else:
+        raise NotImplementedError(
+            f'Distribution type {dist} is not supported.')
+    return (speed, angle)
+
+
+def get_random_velocity(max_speed=3, dist='uniform'):
+    if dist == 'uniform':
+        speed = np.random.uniform(0, max_speed)
+    elif dist == 'gaussian':
+        speed = np.abs(np.random.normal(0, max_speed / 2))
+    else:
+        raise NotImplementedError(
+            'Distribution type {} is not supported.'.format(dist))
+    angle = np.random.uniform(0, 2 * np.pi)
+    return (speed, angle)
+
+
+def random_move_control_points(X, Y, imageHeight, imageWidth, lineVelocity, region_size, maxLineAcceleration=(3, 0.5), maxInitSpeed=3):
+    region_width, region_height = region_size
+    speed, angle = lineVelocity
+    X += int(speed * np.cos(angle))
+    Y += int(speed * np.sin(angle))
+    lineVelocity = random_accelerate(
+        lineVelocity, maxLineAcceleration, dist='gaussian')
+    if ((X > imageHeight - region_height) or (X < 0) or (Y > imageWidth - region_width) or (Y < 0)):
+        lineVelocity = get_random_velocity(maxInitSpeed, dist='gaussian')
+    new_X = np.clip(X, 0, imageHeight - region_height)
+    new_Y = np.clip(Y, 0, imageWidth - region_width)
+    return new_X, new_Y, lineVelocity
+
+
+# ##############################################
+# ##############################################
+
+if __name__ == '__main__':
+    import os
+    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
+    trials = 10
+    for _ in range(trials):
+        video_length = 10
+        # The returned masks are either stationary (50%) or moving (50%)
+        masks = create_random_shape_with_random_motion(video_length, zoomin=0.9, zoomout=1.1, rotmin=1, rotmax=10, imageHeight=240, imageWidth=432)
+        i = 0
+
+        for m in masks:
+            cv2.imshow('mask', np.array(m))
+            cv2.waitKey(500)
+            # m.save('mask_{}.png'.format(i))
+            i += 1
FGT_codes/FGT/data/util/__init__.py
ADDED
@@ -0,0 +1,28 @@
+from .STTN_mask import create_random_shape_with_random_motion
+
+import logging
+logger = logging.getLogger('base')
+
+
+def initialize_mask(videoLength, dataInfo):
+    from .MaskModel import RandomMask
+    from .MaskModel import MidRandomMask
+    from .MaskModel import MatrixMask
+    from .MaskModel import FreeFormMask
+    from .MaskModel import StationaryMask
+
+    return {'random': RandomMask(videoLength, dataInfo),
+            'mid': MidRandomMask(videoLength, dataInfo),
+            'matrix': MatrixMask(videoLength, dataInfo),
+            'free': FreeFormMask(videoLength, dataInfo),
+            'stationary': StationaryMask(videoLength, dataInfo)
+            }
+
+
+def create_mask(maskClass, form):
+    if form == 'mix':
+        from random import randint
+        candidates = list(maskClass.keys())
+        candidate_index = randint(0, len(candidates) - 1)
+        return maskClass[candidates[candidate_index]]()
+    return maskClass[form]()
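A usage sketch for the factory above. `initialize_mask` builds all five generators eagerly, so `dataInfo` must carry every key their constructors read (the nested names below are exactly those accessed in `MaskModel.py`; the concrete sizes are illustrative):

from data.util import initialize_mask, create_mask

data_info = {
    'image': {'image_height': 240, 'image_width': 432},
    'mask': {'mask_height': 96, 'mask_width': 96,
             'max_vertex': 10, 'max_length': 40,
             'max_brush_width': 20, 'max_angle': 360},
}
mask_class = initialize_mask(videoLength=5, dataInfo=data_info)
masks = create_mask(mask_class, 'stationary')  # or 'random', 'mid', 'matrix', 'free', 'mix'
print(masks.shape)  # (5, 240, 432, 1) for the stationary generator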
FGT_codes/FGT/data/util/flow_utils/__init__.py
ADDED
(empty file)
FGT_codes/FGT/data/util/flow_utils/flow_reversal.py
ADDED
@@ -0,0 +1,77 @@
+import torch
+
+
+def flow_reversal(flow):
+    """
+    flow: shape [b, c, h, w]
+    return: the backward flow corresponding to the given forward flow
+    The formula is borrowed from Quadratic Video Interpolation, Eq. (4)
+    """
+    b, c, h, w = flow.shape
+    y = flow[:, 0:1, :, :]
+    x = flow[:, 1:2, :, :]  # [b, 1, h, w]
+
+    x = x.repeat(1, c, 1, 1)
+    y = y.repeat(1, c, 1, 1)
+
+    # get the four corners of the splatting square: (x1, y1), (x1, y2), (x2, y1), (x2, y2)
+    x1 = torch.floor(x)
+    x2 = x1 + 1
+    y1 = torch.floor(y)
+    y2 = y1 + 1
+
+    # get gaussian weights
+    w11, w12, w21, w22 = get_gaussian_weights(x, y, x1, x2, y1, y2)
+
+    # calculate the weight maps for each optical flow
+    flow11, o11 = sample_one(flow, x1, y1, w11)
+    flow12, o12 = sample_one(flow, x1, y2, w12)
+    flow21, o21 = sample_one(flow, x2, y1, w21)
+    flow22, o22 = sample_one(flow, x2, y2, w22)
+
+    # fuse all the reversed flows based on equation (4)
+    flow_o = flow11 + flow12 + flow21 + flow22
+    o = o11 + o12 + o21 + o22
+
+    flow_o = -flow_o
+    flow_o[o > 0] = flow_o[o > 0] / o[o > 0]
+
+    return flow_o
+
+
+def get_gaussian_weights(x, y, x1, x2, y1, y2):
+    sigma = 1
+    w11 = torch.exp(-((x - x1) ** 2 + (y - y1) ** 2) / (sigma ** 2))
+    w12 = torch.exp(-((x - x1) ** 2 + (y - y2) ** 2) / (sigma ** 2))
+    w21 = torch.exp(-((x - x2) ** 2 + (y - y1) ** 2) / (sigma ** 2))
+    w22 = torch.exp(-((x - x2) ** 2 + (y - y2) ** 2) / (sigma ** 2))
+    return w11, w12, w21, w22
+
+
+def sample_one(flow, shiftx, shifty, weight):
+    b, c, h, w = flow.shape
+    flat_shiftx = shiftx.view(-1)  # [b * c * h * w]
+    flat_shifty = shifty.view(-1)  # [b * c * h * w]
+    flat_basex = torch.arange(0, h, requires_grad=False).view(-1, 1).long().repeat(b, c, 1, w).view(-1)
+    flat_basey = torch.arange(0, w, requires_grad=False).view(-1, 1).long().repeat(b, c, h, 1).view(-1)
+    flat_weight = weight.reshape(-1)
+    flat_flow = flow.reshape(-1)
+
+    idxn = torch.arange(0, b, requires_grad=False).view(b, 1, 1, 1).long().repeat(1, c, h, w).view(-1)
+    idxc = torch.arange(0, c, requires_grad=False).view(1, c, 1, 1).long().repeat(b, 1, h, w).view(-1)
+    idxx = flat_shiftx.long() + flat_basex  # size [-1]
+    idxy = flat_shifty.long() + flat_basey  # size [-1]
+
+    # record the shifted pixels inside the image boundaries
+    mask = idxx.ge(0) & idxx.lt(h) & idxy.ge(0) & idxy.lt(w)
+
+    # mask off points out of boundaries
+    ids = idxn * c * h * w + idxc * h * w + idxx * w + idxy
+    ids_mask = torch.masked_select(ids, mask).clone()
+
+    # put the value into corresponding regions
+    flow_warp = torch.zeros([b * c * h * w])
+    flow_warp.put_(ids_mask, torch.masked_select(flat_flow * flat_weight, mask), accumulate=True)
+    one_warp = torch.zeros([b * c * h * w])
+    one_warp.put_(ids_mask, torch.masked_select(flat_weight, mask), accumulate=True)
+    return flow_warp.view(b, c, h, w), one_warp.view(b, c, h, w)
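A sanity-check sketch for `flow_reversal`: for a spatially constant translation, the reversed flow should equal the negated forward flow wherever the splatting has coverage (border pixels that receive no contribution stay zero). The function allocates CPU buffers internally, so CPU tensors are assumed:

import torch
from data.util.flow_utils.flow_reversal import flow_reversal

flow = torch.zeros(1, 2, 8, 8)
flow[:, 0] = 2.0  # constant 2-pixel shift in the first flow component
rev = flow_reversal(flow)
inner = rev[:, 0, 2:6, 2:6]  # stay away from the uncovered border
print(torch.allclose(inner, torch.full_like(inner, -2.0), atol=1e-4))  # True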
FGT_codes/FGT/data/util/flow_utils/region_fill.py
ADDED
@@ -0,0 +1,142 @@
+import numpy as np
+import cv2
+from scipy import sparse
+from scipy.sparse.linalg import spsolve
+
+
+# Laplacian filling
+def regionfill(I, mask, factor=1.0):
+    if np.count_nonzero(mask) == 0:
+        return I.copy()
+    resize_mask = cv2.resize(
+        mask.astype(float), (0, 0), fx=factor, fy=factor) > 0
+    resize_I = cv2.resize(I.astype(float), (0, 0), fx=factor, fy=factor)
+    maskPerimeter = findBoundaryPixels(resize_mask)
+    regionfillLaplace(resize_I, resize_mask, maskPerimeter)
+    resize_I = cv2.resize(resize_I, (I.shape[1], I.shape[0]))
+    resize_I[mask == 0] = I[mask == 0]
+    return resize_I
+
+
+def findBoundaryPixels(mask):
+    kernel = cv2.getStructuringElement(cv2.MORPH_CROSS, (3, 3))
+    maskDilated = cv2.dilate(mask.astype(float), kernel)
+    return (maskDilated > 0) & (mask == 0)
+
+
+def regionfillLaplace(I, mask, maskPerimeter):
+    height, width = I.shape
+    rightSide = formRightSide(I, maskPerimeter)
+
+    # Location of mask pixels
+    maskIdx = np.where(mask)
+
+    # Only keep values for pixels that are in the mask
+    rightSide = rightSide[maskIdx]
+
+    # Number the mask pixels in a grid matrix
+    grid = -np.ones((height, width))
+    grid[maskIdx] = range(0, maskIdx[0].size)
+    # Pad (with -1) to avoid "index out of bounds" errors in the for loop
+    grid = padMatrix(grid)
+    gridIdx = np.where(grid >= 0)
+
+    # Form the connectivity matrix D=sparse(i,j,s)
+    # Connect each mask pixel to itself
+    i = np.arange(0, maskIdx[0].size)
+    j = np.arange(0, maskIdx[0].size)
+    # The coefficient is the number of neighbors over which we average
+    numNeighbors = computeNumberOfNeighbors(height, width)
+    s = numNeighbors[maskIdx]
+    # Now connect the N,E,S,W neighbors if they exist
+    for direction in ((-1, 0), (0, 1), (1, 0), (0, -1)):
+        # Possible neighbors in the current direction
+        neighbors = grid[gridIdx[0] + direction[0], gridIdx[1] + direction[1]]
+        # Connect mask points to neighbors with -1's
+        index = (neighbors >= 0)
+        i = np.concatenate((i, grid[gridIdx[0][index], gridIdx[1][index]]))
+        j = np.concatenate((j, neighbors[index]))
+        s = np.concatenate((s, -np.ones(np.count_nonzero(index))))
+
+    D = sparse.coo_matrix((s, (i.astype(int), j.astype(int)))).tocsr()
+    sol = spsolve(D, rightSide)
+    I[maskIdx] = sol
+    return I
+
+
+def formRightSide(I, maskPerimeter):
+    height, width = I.shape
+    perimeterValues = np.zeros((height, width))
+    perimeterValues[maskPerimeter] = I[maskPerimeter]
+    rightSide = np.zeros((height, width))
+
+    rightSide[1:height - 1, 1:width - 1] = (
+        perimeterValues[0:height - 2, 1:width - 1] +
+        perimeterValues[2:height, 1:width - 1] +
+        perimeterValues[1:height - 1, 0:width - 2] +
+        perimeterValues[1:height - 1, 2:width])
+
+    rightSide[1:height - 1, 0] = (
+        perimeterValues[0:height - 2, 0] + perimeterValues[2:height, 0] +
+        perimeterValues[1:height - 1, 1])
+
+    rightSide[1:height - 1, width - 1] = (
+        perimeterValues[0:height - 2, width - 1] +
+        perimeterValues[2:height, width - 1] +
+        perimeterValues[1:height - 1, width - 2])
+
+    rightSide[0, 1:width - 1] = (
+        perimeterValues[1, 1:width - 1] + perimeterValues[0, 0:width - 2] +
+        perimeterValues[0, 2:width])
+
+    rightSide[height - 1, 1:width - 1] = (
+        perimeterValues[height - 2, 1:width - 1] +
+        perimeterValues[height - 1, 0:width - 2] +
+        perimeterValues[height - 1, 2:width])
+
+    rightSide[0, 0] = perimeterValues[0, 1] + perimeterValues[1, 0]
+    rightSide[0, width - 1] = (
+        perimeterValues[0, width - 2] + perimeterValues[1, width - 1])
+    rightSide[height - 1, 0] = (
+        perimeterValues[height - 2, 0] + perimeterValues[height - 1, 1])
+    rightSide[height - 1, width - 1] = (perimeterValues[height - 2, width - 1] +
+                                        perimeterValues[height - 1, width - 2])
+    return rightSide
+
+
+def computeNumberOfNeighbors(height, width):
+    # Initialize
+    numNeighbors = np.zeros((height, width))
+    # Interior pixels have 4 neighbors
+    numNeighbors[1:height - 1, 1:width - 1] = 4
+    # Border pixels have 3 neighbors
+    numNeighbors[1:height - 1, (0, width - 1)] = 3
+    numNeighbors[(0, height - 1), 1:width - 1] = 3
+    # Corner pixels have 2 neighbors
+    numNeighbors[(0, 0, height - 1, height - 1), (0, width - 1, 0,
+                                                  width - 1)] = 2
+    return numNeighbors
+
+
+def padMatrix(grid):
+    height, width = grid.shape
+    gridPadded = -np.ones((height + 2, width + 2))
+    gridPadded[1:height + 1, 1:width + 1] = grid
+    gridPadded = gridPadded.astype(grid.dtype)
+    return gridPadded
+
+
+if __name__ == '__main__':
+    import time
+    x = np.linspace(0, 255, 500)
+    xv, _ = np.meshgrid(x, x)
+    image = ((xv + np.transpose(xv)) / 2.0).astype(int)
+    mask = np.zeros((500, 500))
+    mask[100:259, 100:259] = 1
+    mask = (mask > 0)
+    image[mask] = 0
+    st = time.time()
+    inpaint = regionfill(image, mask, 0.5).astype(np.uint8)
+    print(time.time() - st)
+    cv2.imshow('img', np.concatenate((image.astype(np.uint8), inpaint)))
+    cv2.waitKey()
FGT_codes/FGT/data/util/freeform_masks.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
import shutil
|
4 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) # NOQA
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import argparse
|
8 |
+
from PIL import Image
|
9 |
+
|
10 |
+
from .mask_generators import get_video_masks_by_moving_random_stroke, get_masked_ratio
|
11 |
+
from .util import make_dirs, make_dir_under_root, get_everything_under
|
12 |
+
from .readers import MaskReader
|
13 |
+
|
14 |
+
def parse_args():
|
15 |
+
parser = argparse.ArgumentParser()
|
16 |
+
parser.add_argument(
|
17 |
+
'-od', '--output_dir',
|
18 |
+
type=str,
|
19 |
+
help="Output directory name"
|
20 |
+
)
|
21 |
+
parser.add_argument(
|
22 |
+
'-im',
|
23 |
+
'--image_masks', action='store_true',
|
24 |
+
help="Set this if you want to generate independent masks in one directory."
|
25 |
+
)
|
26 |
+
parser.add_argument(
|
27 |
+
'-vl', '--video_len',
|
28 |
+
type=int,
|
29 |
+
help="Maximum video length (i.e. #mask)"
|
30 |
+
)
|
31 |
+
parser.add_argument(
|
32 |
+
'-ns', '--num_stroke',
|
33 |
+
type=int,
|
34 |
+
help="Number of stroke in one mask"
|
35 |
+
)
|
36 |
+
parser.add_argument(
|
37 |
+
'-nsb', '--num_stroke_bound',
|
38 |
+
type=int,
|
39 |
+
nargs=2,
|
40 |
+
help="Upper/lower bound of number of stroke in one mask"
|
41 |
+
)
|
42 |
+
parser.add_argument(
|
43 |
+
'-n',
|
44 |
+
type=int,
|
45 |
+
help="Number of mask to generate"
|
46 |
+
)
|
47 |
+
parser.add_argument(
|
48 |
+
'-sp',
|
49 |
+
'--stroke_preset',
|
50 |
+
type=str,
|
51 |
+
default='rand_curve',
|
52 |
+
help="Preset of the stroke parameters"
|
53 |
+
)
|
54 |
+
parser.add_argument(
|
55 |
+
'-iw',
|
56 |
+
'--image_width',
|
57 |
+
type=int,
|
58 |
+
default=320
|
59 |
+
)
|
60 |
+
parser.add_argument(
|
61 |
+
'-ih',
|
62 |
+
'--image_height',
|
63 |
+
type=int,
|
64 |
+
default=180
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
'--cluster_by_area',
|
68 |
+
action='store_true'
|
69 |
+
)
|
70 |
+
parser.add_argument(
|
71 |
+
'--leave_boarder_unmasked',
|
72 |
+
type=int,
|
73 |
+
help='Set this to a number, then a copy of the mask where the mask of boarder is erased.'
|
74 |
+
)
|
75 |
+
parser.add_argument(
|
76 |
+
'--redo_without_generation',
|
77 |
+
action='store_true',
|
78 |
+
help='Set this, and the script will skip the generation and redo the left tasks'
|
79 |
+
'(uncluster -> erase boarder -> re-cluster)'
|
80 |
+
)
|
81 |
+
args = parser.parse_args()
|
82 |
+
return args
|
83 |
+
|
84 |
+
|
85 |
+
def get_stroke_preset(stroke_preset):
|
86 |
+
if stroke_preset == 'object_like':
|
87 |
+
return {
|
88 |
+
"nVertexBound": [5, 30],
|
89 |
+
"maxHeadSpeed": 15,
|
90 |
+
"maxHeadAcceleration": (10, 1.5),
|
91 |
+
"brushWidthBound": (20, 50),
|
92 |
+
"nMovePointRatio": 0.5,
|
93 |
+
"maxPiontMove": 10,
|
94 |
+
"maxLineAcceleration": (5, 0.5),
|
95 |
+
"boarderGap": None,
|
96 |
+
"maxInitSpeed": 10,
|
97 |
+
}
|
98 |
+
elif stroke_preset == 'object_like_middle':
|
99 |
+
return {
|
100 |
+
"nVertexBound": [5, 15],
|
101 |
+
"maxHeadSpeed": 8,
|
102 |
+
"maxHeadAcceleration": (4, 1.5),
|
103 |
+
"brushWidthBound": (20, 50),
|
104 |
+
"nMovePointRatio": 0.5,
|
105 |
+
"maxPiontMove": 5,
|
106 |
+
"maxLineAcceleration": (5, 0.5),
|
107 |
+
"boarderGap": None,
|
108 |
+
"maxInitSpeed": 10,
|
109 |
+
}
|
110 |
+
elif stroke_preset == 'object_like_small':
|
111 |
+
return {
|
112 |
+
"nVertexBound": [5, 20],
|
113 |
+
"maxHeadSpeed": 7,
|
114 |
+
"maxHeadAcceleration": (3.5, 1.5),
|
115 |
+
"brushWidthBound": (10, 30),
|
116 |
+
"nMovePointRatio": 0.5,
|
117 |
+
"maxPiontMove": 5,
|
118 |
+
"maxLineAcceleration": (3, 0.5),
|
119 |
+
"boarderGap": None,
|
120 |
+
"maxInitSpeed": 4,
|
121 |
+
}
|
122 |
+
elif stroke_preset == 'rand_curve':
|
123 |
+
return {
|
124 |
+
"nVertexBound": [10, 30],
|
125 |
+
"maxHeadSpeed": 20,
|
126 |
+
"maxHeadAcceleration": (15, 0.5),
|
127 |
+
"brushWidthBound": (3, 10),
|
128 |
+
"nMovePointRatio": 0.5,
|
129 |
+
"maxPiontMove": 3,
|
130 |
+
"maxLineAcceleration": (5, 0.5),
|
131 |
+
"boarderGap": None,
|
132 |
+
"maxInitSpeed": 6
|
133 |
+
}
|
134 |
+
elif stroke_preset == 'rand_curve_small':
|
135 |
+
return {
|
136 |
+
"nVertexBound": [6, 22],
|
137 |
+
"maxHeadSpeed": 12,
|
138 |
+
"maxHeadAcceleration": (8, 0.5),
|
139 |
+
"brushWidthBound": (2.5, 5),
|
140 |
+
"nMovePointRatio": 0.5,
|
141 |
+
"maxPiontMove": 1.5,
|
142 |
+
"maxLineAcceleration": (3, 0.5),
|
143 |
+
"boarderGap": None,
|
144 |
+
"maxInitSpeed": 3
|
145 |
+
}
|
146 |
+
else:
|
147 |
+
raise NotImplementedError(f'The stroke presetting "{stroke_preset}" does not exist.')
|
148 |
+
|
149 |
+
|
150 |
+
def copy_masks_without_boarder(root_dir, args):
|
151 |
+
def erase_mask_boarder(mask, gap):
|
152 |
+
pix = np.asarray(mask).astype('uint8') * 255
|
153 |
+
pix[:gap, :] = 255
|
154 |
+
pix[-gap:, :] = 255
|
155 |
+
pix[:, :gap] = 255
|
156 |
+
pix[:, -gap:] = 255
|
157 |
+
return Image.fromarray(pix).convert('1')
|
158 |
+
|
159 |
+
wo_boarder_dir = root_dir + '_noBoarder'
|
160 |
+
shutil.copytree(root_dir, wo_boarder_dir)
|
161 |
+
|
162 |
+
for i, filename in enumerate(get_everything_under(wo_boarder_dir)):
|
163 |
+
if args.image_masks:
|
164 |
+
mask = Image.open(filename)
|
165 |
+
mask_wo_boarder = erase_mask_boarder(mask, args.leave_boarder_unmasked)
|
166 |
+
mask_wo_boarder.save(filename)
|
167 |
+
else:
|
168 |
+
# filename is a diretory containing multiple mask files
|
169 |
+
for f in get_everything_under(filename, pattern='*.png'):
|
170 |
+
mask = Image.open(f)
|
171 |
+
mask_wo_boarder = erase_mask_boarder(mask, args.leave_boarder_unmasked)
|
172 |
+
mask_wo_boarder.save(f)
|
173 |
+
|
174 |
+
return wo_boarder_dir
|
175 |
+
|
176 |
+
|
177 |
+
def cluster_by_masked_area(root_dir, args):
|
178 |
+
clustered_dir = root_dir + '_clustered'
|
179 |
+
make_dirs(clustered_dir)
|
180 |
+
radius = 5
|
181 |
+
|
182 |
+
# all masks with ratio in x +- radius will be stored in sub-directory x
|
183 |
+
clustered_centors = np.arange(radius, 100, radius * 2)
|
184 |
+
clustered_subdirs = []
|
185 |
+
for c in clustered_centors:
|
186 |
+
# make sub-directories for each ratio range
|
187 |
+
clustered_subdirs.append(make_dir_under_root(clustered_dir, str(c)))
|
188 |
+
|
189 |
+
for i, filename in enumerate(get_everything_under(root_dir)):
|
190 |
+
if args.image_masks:
|
191 |
+
ratio = get_masked_ratio(Image.open(filename))
|
192 |
+
else:
|
193 |
+
# filename is a diretory containing multiple mask files
|
194 |
+
ratio = np.mean([
|
195 |
+
get_masked_ratio(Image.open(f))
|
196 |
+
for f in get_everything_under(filename, pattern='*.png')
|
197 |
+
])
|
198 |
+
|
199 |
+
# find the nearest centor
|
200 |
+
for i, c in enumerate(clustered_centors):
|
201 |
+
if c - radius <= ratio * 100 <= c + radius:
|
202 |
+
shutil.move(filename, clustered_subdirs[i])
|
203 |
+
break
|
204 |
+
|
205 |
+
shutil.rmtree(root_dir)
|
206 |
+
os.rename(clustered_dir, root_dir)
|
207 |
+
|
208 |
+
|
209 |
+
def decide_nStroke(args):
|
210 |
+
if args.num_stroke is not None:
|
211 |
+
return args.num_stroke
|
212 |
+
elif args.num_stroke_bound is not None:
|
213 |
+
return np.random.randint(args.num_stroke_bound[0], args.num_stroke_bound[1])
|
214 |
+
else:
|
215 |
+
raise ValueError('One of "-ns" or "-nsb" is needed')
|
216 |
+
|
217 |
+
|
218 |
+
def main(args):
|
219 |
+
preset = get_stroke_preset(args.stroke_preset)
|
220 |
+
make_dirs(args.output_dir)
|
221 |
+
|
222 |
+
if args.redo_without_generation:
|
223 |
+
assert(len(get_everything_under(args.output_dir)) > 0)
|
224 |
+
# put back clustered masks
|
225 |
+
for clustered_subdir in get_everything_under(args.output_dir):
|
226 |
+
if not os.path.isdir(clustered_subdir):
|
227 |
+
continue
|
228 |
+
for f in get_everything_under(clustered_subdir):
|
229 |
+
shutil.move(f, args.output_dir)
|
230 |
+
os.rmdir(clustered_subdir)
|
231 |
+
|
232 |
+
else:
|
233 |
+
if args.image_masks:
|
234 |
+
for i in range(args.n):
|
235 |
+
nStroke = decide_nStroke(args)
|
236 |
+
mask = get_video_masks_by_moving_random_stroke(
|
237 |
+
video_len=1, imageWidth=args.image_width, imageHeight=args.image_height,
|
238 |
+
nStroke=nStroke, **preset
|
239 |
+
)[0]
|
240 |
+
mask.save(os.path.join(args.output_dir, f'{i:07d}.png'))
|
241 |
+
|
242 |
+
else:
|
243 |
+
for i in range(args.n):
|
244 |
+
mask_dir = make_dir_under_root(args.output_dir, f'{i:05d}')
|
245 |
+
mask_reader = MaskReader(mask_dir, read=False)
|
246 |
+
|
247 |
+
nStroke = decide_nStroke(args)
|
248 |
+
masks = get_video_masks_by_moving_random_stroke(
|
249 |
+
imageWidth=args.image_width, imageHeight=args.image_height,
|
250 |
+
video_len=args.video_len, nStroke=nStroke, **preset)
|
251 |
+
|
252 |
+
mask_reader.set_files(masks)
|
253 |
+
mask_reader.save_files(output_dir=mask_reader.dir_name)
|
254 |
+
|
255 |
+
if args.leave_boarder_unmasked is not None:
|
256 |
+
dir_leave_boarder = copy_masks_without_boarder(args.output_dir, args)
|
257 |
+
if args.cluster_by_area:
|
258 |
+
cluster_by_masked_area(dir_leave_boarder, args)
|
259 |
+
|
260 |
+
if args.cluster_by_area:
|
261 |
+
cluster_by_masked_area(args.output_dir, args)
|
262 |
+
|
263 |
+
|
264 |
+
if __name__ == "__main__":
|
265 |
+
args = parse_args()
|
266 |
+
main(args)
|
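For reference, the bucketing rule that `cluster_by_masked_area` applies above can be isolated into a minimal, self-contained sketch (the helper name `bucket_for` and the sample ratios are illustrative, not part of the repository):

import numpy as np

# Centers 5, 15, ..., 95 tile the ratio axis; a mask whose masked-area
# ratio (in percent) lies within +-5 of a center lands in that sub-directory.
radius = 5
centers = np.arange(radius, 100, radius * 2)

def bucket_for(ratio):
    """Return the sub-directory name chosen for a masked ratio in [0, 1]."""
    for c in centers:
        if c - radius <= ratio * 100 <= c + radius:
            return str(c)  # first match wins on shared boundaries

print(bucket_for(0.12))  # '15'
print(bucket_for(0.10))  # '5' (boundary 10 also fits bucket 15, but 5 comes first)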
FGT_codes/FGT/data/util/mask_generators.py
ADDED
@@ -0,0 +1,217 @@
+import numpy as np
+import random
+from PIL import Image, ImageDraw
+
+
+def get_video_masks_by_moving_random_stroke(
+    video_len, imageWidth=320, imageHeight=180, nStroke=5,
+    nVertexBound=[10, 30], maxHeadSpeed=15, maxHeadAcceleration=(15, 0.5),
+    brushWidthBound=(5, 20), boarderGap=None, nMovePointRatio=0.5, maxPiontMove=10,
+    maxLineAcceleration=5, maxInitSpeed=5
+):
+    '''
+    Get video masks by random strokes which move randomly between each
+    frame, including the whole stroke and its control points
+
+    Parameters
+    ----------
+    imageWidth: Image width
+    imageHeight: Image height
+    nStroke: Number of drawn lines
+    nVertexBound: Lower/upper bound of number of control points for each line
+    maxHeadSpeed: Max head speed when creating control points
+    maxHeadAcceleration: Max acceleration applied to the current head point (
+        a head point and its velocity decide the next point)
+    brushWidthBound (min, max): Bound of width for each stroke
+    boarderGap: The minimum gap between image border and drawn lines
+    nMovePointRatio: The ratio of control points to move for next frames
+    maxPiontMove: The magnitude of movement for control points for next frames
+    maxLineAcceleration: The magnitude of acceleration for the whole line
+
+    Examples
+    ----------
+    object_like_setting = {
+        "nVertexBound": [5, 20],
+        "maxHeadSpeed": 15,
+        "maxHeadAcceleration": (15, 3.14),
+        "brushWidthBound": (30, 50),
+        "nMovePointRatio": 0.5,
+        "maxPiontMove": 10,
+        "maxLineAcceleration": (5, 0.5),
+        "boarderGap": 20,
+        "maxInitSpeed": 10,
+    }
+    rand_curve_setting = {
+        "nVertexBound": [10, 30],
+        "maxHeadSpeed": 20,
+        "maxHeadAcceleration": (15, 0.5),
+        "brushWidthBound": (3, 10),
+        "nMovePointRatio": 0.5,
+        "maxPiontMove": 3,
+        "maxLineAcceleration": (5, 0.5),
+        "boarderGap": 20,
+        "maxInitSpeed": 6
+    }
+    get_video_masks_by_moving_random_stroke(video_len=5, nStroke=3, **object_like_setting)
+    '''
+    assert(video_len >= 1)
+
+    # Initialize a set of control points to draw the first mask
+    mask = Image.new(mode='1', size=(imageWidth, imageHeight), color=1)
+    control_points_set = []
+    for i in range(nStroke):
+        brushWidth = np.random.randint(brushWidthBound[0], brushWidthBound[1])
+        Xs, Ys, velocity = get_random_stroke_control_points(
+            imageWidth=imageWidth, imageHeight=imageHeight,
+            nVertexBound=nVertexBound, maxHeadSpeed=maxHeadSpeed,
+            maxHeadAcceleration=maxHeadAcceleration, boarderGap=boarderGap,
+            maxInitSpeed=maxInitSpeed
+        )
+        control_points_set.append((Xs, Ys, velocity, brushWidth))
+        draw_mask_by_control_points(mask, Xs, Ys, brushWidth, fill=0)
+
+    # Generate the following masks by randomly moving strokes and their control points
+    masks = [mask]
+    for i in range(video_len - 1):
+        mask = Image.new(mode='1', size=(imageWidth, imageHeight), color=1)
+        for j in range(len(control_points_set)):
+            Xs, Ys, velocity, brushWidth = control_points_set[j]
+            new_Xs, new_Ys = random_move_control_points(
+                Xs, Ys, velocity, nMovePointRatio, maxPiontMove,
+                maxLineAcceleration, boarderGap
+            )
+            control_points_set[j] = (new_Xs, new_Ys, velocity, brushWidth)
+        for Xs, Ys, velocity, brushWidth in control_points_set:
+            draw_mask_by_control_points(mask, Xs, Ys, brushWidth, fill=0)
+        masks.append(mask)
+
+    return masks
+
+
+def random_accelerate(velocity, maxAcceleration, dist='uniform'):
+    speed, angle = velocity
+    d_speed, d_angle = maxAcceleration
+
+    if dist == 'uniform':
+        speed += np.random.uniform(-d_speed, d_speed)
+        angle += np.random.uniform(-d_angle, d_angle)
+    elif dist == 'guassian':
+        speed += np.random.normal(0, d_speed / 2)
+        angle += np.random.normal(0, d_angle / 2)
+    else:
+        raise NotImplementedError(f'Distribution type {dist} is not supported.')
+
+    return (speed, angle)
+
+
+def random_move_control_points(Xs, Ys, lineVelocity, nMovePointRatio, maxPiontMove, maxLineAcceleration, boarderGap=15):
+    new_Xs = Xs.copy()
+    new_Ys = Ys.copy()
+
+    # move the whole line and accelerate
+    speed, angle = lineVelocity
+    new_Xs += int(speed * np.cos(angle))
+    new_Ys += int(speed * np.sin(angle))
+    # note: the accelerated velocity is local to this call; the caller keeps the old tuple
+    lineVelocity = random_accelerate(lineVelocity, maxLineAcceleration, dist='guassian')
+
+    # choose points to move
+    chosen = np.arange(len(Xs))
+    np.random.shuffle(chosen)
+    chosen = chosen[:int(len(Xs) * nMovePointRatio)]
+    for i in chosen:
+        new_Xs[i] += np.random.randint(-maxPiontMove, maxPiontMove)
+        new_Ys[i] += np.random.randint(-maxPiontMove, maxPiontMove)
+    return new_Xs, new_Ys
+
+
+def get_random_stroke_control_points(
+    imageWidth, imageHeight,
+    nVertexBound=(10, 30), maxHeadSpeed=10, maxHeadAcceleration=(5, 0.5), boarderGap=20,
+    maxInitSpeed=10
+):
+    '''
+    Implementation of the free-form training mask generating algorithm
+    proposed by Jiahui Yu et al. in "Free-Form Image Inpainting with Gated Convolution"
+    '''
+    startX = np.random.randint(imageWidth)
+    startY = np.random.randint(imageHeight)
+    Xs = [startX]
+    Ys = [startY]
+
+    numVertex = np.random.randint(nVertexBound[0], nVertexBound[1])
+
+    angle = np.random.uniform(0, 2 * np.pi)
+    speed = np.random.uniform(0, maxHeadSpeed)
+
+    for i in range(numVertex):
+        speed, angle = random_accelerate((speed, angle), maxHeadAcceleration)
+        speed = np.clip(speed, 0, maxHeadSpeed)
+
+        nextX = startX + speed * np.sin(angle)
+        nextY = startY + speed * np.cos(angle)
+
+        if boarderGap is not None:
+            nextX = np.clip(nextX, boarderGap, imageWidth - boarderGap)
+            nextY = np.clip(nextY, boarderGap, imageHeight - boarderGap)
+
+        startX, startY = nextX, nextY
+        Xs.append(nextX)
+        Ys.append(nextY)
+
+    velocity = get_random_velocity(maxInitSpeed, dist='guassian')
+
+    return np.array(Xs), np.array(Ys), velocity
+
+
+def get_random_velocity(max_speed, dist='uniform'):
+    if dist == 'uniform':
+        speed = np.random.uniform(0, max_speed)
+    elif dist == 'guassian':
+        speed = np.abs(np.random.normal(0, max_speed / 2))
+    else:
+        raise NotImplementedError(f'Distribution type {dist} is not supported.')
+
+    angle = np.random.uniform(0, 2 * np.pi)
+    return (speed, angle)
+
+
+def draw_mask_by_control_points(mask, Xs, Ys, brushWidth, fill=255):
+    radius = brushWidth // 2 - 1
+    draw = ImageDraw.Draw(mask)
+    for i in range(1, len(Xs)):
+        startX, startY = Xs[i - 1], Ys[i - 1]
+        nextX, nextY = Xs[i], Ys[i]
+        draw.line((startX, startY) + (nextX, nextY), fill=fill, width=brushWidth)
+    for x, y in zip(Xs, Ys):
+        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=fill)
+    return mask
+
+
+# modified from https://github.com/naoto0804/pytorch-inpainting-with-partial-conv/blob/master/generate_data.py
+def get_random_walk_mask(imageWidth=320, imageHeight=180, length=None):
+    action_list = [[0, 1], [0, -1], [1, 0], [-1, 0]]
+    canvas = np.zeros((imageHeight, imageWidth)).astype("i")
+    if length is None:
+        length = imageWidth * imageHeight
+    x = random.randint(0, imageHeight - 1)
+    y = random.randint(0, imageWidth - 1)
+    x_list = []
+    y_list = []
+    for i in range(length):
+        r = random.randint(0, len(action_list) - 1)
+        x = np.clip(x + action_list[r][0], a_min=0, a_max=imageHeight - 1)
+        y = np.clip(y + action_list[r][1], a_min=0, a_max=imageWidth - 1)
+        x_list.append(x)
+        y_list.append(y)
+    canvas[np.array(x_list), np.array(y_list)] = 1
+    return Image.fromarray(canvas * 255).convert('1')
+
+
+def get_masked_ratio(mask):
+    """
+    Calculate the masked ratio.
+    mask: Expected a binary PIL image, where 0 and 1 represent
+    masked (invalid) and valid pixel values.
+    """
+    hist = mask.histogram()
+    return hist[0] / np.prod(mask.size)
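The docstring above already carries two presets; here is a minimal end-to-end sketch of generating and saving a moving-stroke mask sequence (assuming the module is importable as `mask_generators`; the output filenames are illustrative):

from mask_generators import get_video_masks_by_moving_random_stroke

rand_curve_setting = {
    "nVertexBound": [10, 30],
    "maxHeadSpeed": 20,
    "maxHeadAcceleration": (15, 0.5),
    "brushWidthBound": (3, 10),
    "nMovePointRatio": 0.5,
    "maxPiontMove": 3,
    "maxLineAcceleration": (5, 0.5),
    "boarderGap": 20,
    "maxInitSpeed": 6,
}

# One binary PIL image per frame; strokes are drawn with 0 (masked)
# on a white (valid) canvas.
masks = get_video_masks_by_moving_random_stroke(
    video_len=5, nStroke=3, **rand_curve_setting)
for t, m in enumerate(masks):
    m.save(f'mask_{t:04d}.png')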
FGT_codes/FGT/data/util/readers.py
ADDED
@@ -0,0 +1,527 @@
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # NOQA
+import argparse
+from math import ceil
+from glob import glob
+
+import numpy as np
+import cv2
+from PIL import Image, ImageDraw, ImageOps, ImageFont
+
+from utils.logging_config import logger
+from utils.util import make_dirs, bbox_offset
+
+
+DEFAULT_FPS = 6
+MAX_LENGTH = 60
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-fps', '--fps',
+        type=int, default=DEFAULT_FPS,
+        help="Output video FPS"
+    )
+    parser.add_argument(
+        '-v', '--video_dir',
+        type=str,
+        help="Video directory name"
+    )
+    parser.add_argument(
+        '-vs', '--video_dirs',
+        nargs='+',
+        type=str,
+        help="Video directory names"
+    )
+    parser.add_argument(
+        '-v2', '--video_dir2',
+        type=str,
+        help="Video directory name"
+    )
+    parser.add_argument(
+        '-sd', '--segms_dir',
+        type=str,
+        help="Segmentation directory name"
+    )
+    parser.add_argument(
+        '-fgd', '--fg_dir',
+        type=str,
+        help="Foreground directory name"
+    )
+    parser.add_argument(
+        '-fgfd', '--fg_frames_dir',
+        type=str,
+        help="Foreground frames directory name"
+    )
+    parser.add_argument(
+        '-fgsd', '--fg_segms_dir',
+        type=str,
+        help="Foreground segmentations directory name"
+    )
+    parser.add_argument(
+        '-syfd', '--syn_frames_dir',
+        type=str,
+        help="Synthesized frames directory name"
+    )
+    parser.add_argument(
+        '-bgfd', '--bg_frames_dir',
+        type=str,
+        help="Background frames directory name"
+    )
+    parser.add_argument(
+        '-rt', '--reader_type',
+        type=str,
+        help="Type of reader"
+    )
+    parser.add_argument(
+        '-od', '--output_dir',
+        type=str,
+        help="Output directory name"
+    )
+    parser.add_argument(
+        '-o', '--output_filename',
+        type=str, required=True,
+        help="Output filename"
+    )
+    args = parser.parse_args()
+    return args
+
+
+class Reader:
+    def __init__(self, dir_name, read=True, max_length=None, sample_period=1):
+        self.dir_name = dir_name
+        self.count = 0
+        self.max_length = max_length
+        self.filenames = []
+        self.sample_period = sample_period
+        if read:
+            if os.path.exists(dir_name):
+                # self.filenames = read_filenames_from_dir(dir_name, self.__class__.__name__)
+                # ^^^^^ yields None when reading some videos of the FaceForensics data
+                # (related to 'Too many levels of symbolic links'?)
+
+                self.filenames = sorted(glob(os.path.join(dir_name, '*')))
+                self.filenames = [f for f in self.filenames if os.path.isfile(f)]
+                self.filenames = self.filenames[::sample_period][:max_length]
+                self.files = self.read_files(self.filenames)
+            else:
+                self.files = []
+                logger.warning(f"Directory {dir_name} does not exist!")
+        else:
+            self.files = []
+        self.current_index = 0
+
+    def append(self, file_):
+        self.files.append(file_)
+
+    def set_files(self, files):
+        self.files = files
+
+    def read_files(self, filenames):
+        assert type(filenames) == list, f'filenames is not a list; dirname: {self.dir_name}'
+        filenames.sort()
+        frames = []
+        for filename in filenames:
+            file_ = self.read_file(filename)
+            frames.append(file_)
+        return frames
+
+    def save_files(self, output_dir=None):
+        make_dirs(output_dir)
+        logger.info(f"Saving {self.__class__.__name__} files to {output_dir}")
+        for i, file_ in enumerate(self.files):
+            self._save_file(output_dir, i, file_)
+
+    def _save_file(self, output_dir, i, file_):
+        raise NotImplementedError("This is an abstract function")
+
+    def read_file(self, filename):
+        raise NotImplementedError("This is an abstract function")
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        if self.current_index < len(self.files):
+            file_ = self.files[self.current_index]
+            self.current_index += 1
+            return file_
+        else:
+            self.current_index = 0
+            raise StopIteration
+
+    def __getitem__(self, key):
+        return self.files[key]
+
+    def __len__(self):
+        return len(self.files)
+
+
+class FrameReader(Reader):
+    def __init__(
+        self, dir_name, resize=None, read=True, max_length=MAX_LENGTH,
+        scale=1, sample_period=1
+    ):
+        self.resize = resize
+        self.scale = scale
+        self.sample_period = sample_period
+        super().__init__(dir_name, read, max_length, sample_period)
+
+    def read_file(self, filename):
+        origin_frame = Image.open(filename)
+        size = self.resize if self.resize is not None else origin_frame.size
+        origin_frame_resized = origin_frame.resize(
+            (int(size[0] * self.scale), int(size[1] * self.scale))
+        )
+        return origin_frame_resized
+
+    def _save_file(self, output_dir, i, file_):
+        if len(self.filenames) == len(self.files):
+            name = sorted(self.filenames)[i].split('/')[-1]
+        else:
+            name = f"frame_{i:04}.png"
+        filename = os.path.join(
+            output_dir, name
+        )
+        file_.save(filename, "PNG")
+
+    def write_files_to_video(self, output_filename, fps=DEFAULT_FPS, frame_num_when_repeat_list=[1]):
+        logger.info(
+            f"Writing frames to video {output_filename} with FPS={fps}")
+        video_writer = cv2.VideoWriter(
+            output_filename,
+            cv2.VideoWriter_fourcc(*"MJPG"),
+            fps,
+            self.files[0].size
+        )
+        for frame_num_when_repeat in frame_num_when_repeat_list:
+            for frame in self.files:
+                frame = frame.convert("RGB")
+                frame_cv = np.array(frame)
+                frame_cv = cv2.cvtColor(frame_cv, cv2.COLOR_RGB2BGR)
+                for i in range(frame_num_when_repeat):
+                    video_writer.write(frame_cv)
+        video_writer.release()
+
+
+class SynthesizedFrameReader(FrameReader):
+    def __init__(
+        self, bg_frames_dir, fg_frames_dir,
+        fg_segms_dir, segm_bbox_mask_dir, fg_dir, dir_name,
+        bboxes_list_dir,
+        fg_scale=0.7, fg_location=(48, 27), mask_only=False
+    ):
+        self.bg_reader = FrameReader(bg_frames_dir)
+        self.size = self.bg_reader[0].size
+        # TODO: add different location and change scale to var
+        self.fg_reader = ForegroundReader(
+            fg_frames_dir, fg_segms_dir, fg_dir,
+            resize=self.size,
+            scale=fg_scale
+        )
+        self.fg_location = fg_location
+        # self.masks = self.fg_reader.masks
+        # self.bbox_masks = self.fg_reader.bbox_masks
+        super().__init__(dir_name, read=False)
+        self.files = self.synthesize_frames(
+            self.bg_reader, self.fg_reader, mask_only)
+        self.bbox_masks = MaskGenerator(
+            segm_bbox_mask_dir, self.size, self.get_bboxeses()
+        )
+        self.bboxes_list_dir = bboxes_list_dir
+        self.bboxes_list = self.get_bboxeses()
+        self.save_bboxes()
+
+    def save_bboxes(self):
+        make_dirs(self.bboxes_list_dir)
+        logger.info(f"Saving bboxes to {self.bboxes_list_dir}")
+        for i, bboxes in enumerate(self.bboxes_list):
+            save_path = os.path.join(self.bboxes_list_dir, f"bboxes_{i:04}.txt")
+            if len(bboxes) > 0:
+                np.savetxt(save_path, bboxes[0], fmt='%4u')
+
+    def get_bboxeses(self):
+        bboxeses = self.fg_reader.segms.bboxeses
+        new_bboxeses = []
+        for bboxes in bboxeses:
+            new_bboxes = []
+            for bbox in bboxes:
+                offset_bbox = bbox_offset(bbox, self.fg_location)
+                new_bboxes.append(offset_bbox)
+            new_bboxeses.append(new_bboxes)
+        return new_bboxeses
+
+    def synthesize_frames(self, bg_reader, fg_reader, mask_only=False):
+        logger.info(
+            f"Synthesizing {bg_reader.dir_name} and {fg_reader.dir_name}"
+        )
+        synthesized_frames = []
+        for i, bg in enumerate(bg_reader):
+            if i == len(fg_reader):
+                break
+            fg = fg_reader[i]
+            mask = fg_reader.get_mask(i)
+            synthesized_frame = bg.copy()
+            if mask_only:
+                synthesized_frame.paste(mask, self.fg_location, mask)
+            else:
+                synthesized_frame.paste(fg, self.fg_location, mask)
+            synthesized_frames.append(synthesized_frame)
+        return synthesized_frames
+
+
+class WarpedFrameReader(FrameReader):
+    def __init__(self, dir_name, i, ks):
+        self.i = i
+        self.ks = ks
+        super().__init__(dir_name)
+
+    def _save_file(self, output_dir, i, file_):
+        filename = os.path.join(
+            output_dir,
+            f"warped_frame_{self.i:04}_k{self.ks[i]:02}.png"
+        )
+        file_.save(filename)
+
+
+class SegmentationReader(FrameReader):
+    def __init__(
+        self, dir_name,
+        resize=None, scale=1
+    ):
+        super().__init__(
+            dir_name, resize=resize, scale=scale
+        )
+
+    def read_file(self, filename):
+        origin_frame = Image.open(filename)
+        mask = ImageOps.invert(origin_frame.convert("L"))
+        mask = mask.point(lambda x: 0 if x < 255 else 255, '1')
+        size = self.resize if self.resize is not None else origin_frame.size
+        mask_resized = mask.resize(
+            (int(size[0] * self.scale), int(size[1] * self.scale))
+        )
+        return mask_resized
+
+
+class MaskReader(Reader):
+    def __init__(self, dir_name, read=True):
+        super().__init__(dir_name, read=read)
+
+    def read_file(self, filename):
+        mask = Image.open(filename)
+        return mask
+
+    def _save_file(self, output_dir, i, file_):
+        filename = os.path.join(
+            output_dir,
+            f"mask_{i:04}.png"
+        )
+        file_.save(filename)
+
+    def get_bboxes(self, i):
+        # TODO: save bbox instead of looking for one
+        mask = self.files[i]
+        mask = ImageOps.invert(mask.convert("L")).convert("1")
+        mask = np.array(mask)
+        # note: the 3-value return matches the OpenCV 3.x findContours signature
+        image, contours, hier = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+        bboxes = []
+        for c in contours:
+            # get the bounding rect
+            x, y, w, h = cv2.boundingRect(c)
+            bbox = ((x, y), (x + w - 1, y + h - 1))
+            bboxes.append(bbox)
+        return bboxes
+
+    def get_bbox(self, i):
+        # TODO: save bbox instead of looking for one
+        mask = self.files[i]
+        mask = ImageOps.invert(mask.convert("L"))
+        mask = np.array(mask)
+        image, contours, hier = cv2.findContours(
+            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
+        for c in contours:
+            # get the bounding rect
+            x, y, w, h = cv2.boundingRect(c)
+            bbox = ((x, y), (x + w - 1, y + h - 1))
+            return bbox
+
+
+class MaskGenerator(Reader):
+    def __init__(
+        self, mask_output_dir, size, bboxeses, save_masks=True
+    ):
+        self.bboxeses = bboxeses
+        self.size = size
+        super().__init__(mask_output_dir, read=False)
+        self.files = self.generate_masks()
+        if save_masks:
+            make_dirs(mask_output_dir)
+            self.save_files(mask_output_dir)
+
+    def _save_file(self, output_dir, i, file_):
+        filename = os.path.join(
+            output_dir,
+            f"mask_{i:04}.png"
+        )
+        file_.save(filename)
+
+    def get_bboxes(self, i):
+        return self.bboxeses[i]
+
+    def generate_masks(self):
+        masks = []
+        for i in range(len(self.bboxeses)):
+            mask = self.generate_mask(i)
+            masks.append(mask)
+        return masks
+
+    def generate_mask(self, i):
+        bboxes = self.bboxeses[i]
+        mask = Image.new("1", self.size, 1)
+        draw = ImageDraw.Draw(mask)
+        for bbox in bboxes:
+            draw.rectangle(
+                bbox, fill=0
+            )
+        return mask
+
+
+class ForegroundReader(FrameReader):
+    def __init__(
+        self, frames_dir, segms_dir, dir_name,
+        resize=None, scale=1
+    ):
+        self.frames_dir = frames_dir
+        self.segms_dir = segms_dir
+        self.frames = FrameReader(
+            frames_dir,
+            resize=resize, scale=scale
+        )
+        self.segms = SegmentationReader(
+            segms_dir, resize=resize, scale=scale
+        )
+        super().__init__(dir_name, read=False)
+        self.masks = self.segms.masks
+        # self.bbox_masks = self.segms.bbox_masks
+        self.files = self.generate_fg_frames(self.frames, self.segms)
+
+    def get_mask(self, i):
+        return self.masks[i]
+
+    def generate_fg_frames(self, frames, segms):
+        logger.info(
+            f"Generating fg frames from {self.frames_dir} and {self.segms_dir}"
+        )
+        fg_frames = []
+        for i, frame in enumerate(frames):
+            mask = self.masks[i]
+            fg_frame = Image.new("RGB", frame.size, (0, 0, 0))
+            fg_frame.paste(
+                frame, (0, 0),
+                mask
+            )
+            fg_frames.append(fg_frame)
+        return fg_frames
+
+
+class CompareFramesReader(FrameReader):
+    def __init__(self, dir_names, col=2, names=[], mask_dir=None):
+        self.videos = []
+        for dir_name in dir_names:
+            # If a method fails on this video, use None to indicate the situation
+            try:
+                self.videos.append(FrameReader(dir_name))
+            except AssertionError:
+                self.videos.append(None)
+        if mask_dir is not None:
+            self.masks = MaskReader(mask_dir)
+        self.names = names
+        self.files = self.combine_videos(self.videos, col)
+
+    def combine_videos(self, videos, col=2, edge_offset=35, h_start_offset=35):
+        combined_frames = []
+        # In case the first method failed and has None as its video,
+        # find the first successfully read one before taking its size
+        i = 0
+        while videos[i] is None:
+            i += 1
+        w, h = videos[i][0].size
+        length = len(videos[i])
+        video_num = len(videos)
+        row = ceil(video_num / col)
+        for frame_idx in range(length):
+            width = col * w + (col - 1) * edge_offset
+            height = row * h + (row - 1) * edge_offset + h_start_offset
+            combined_frame = Image.new("RGBA", (width, height))
+            draw = ImageDraw.Draw(combined_frame)
+            for i, video in enumerate(videos):
+                # Give the failed method a black output
+                if video is None or frame_idx >= len(video):
+                    failed = True
+                    frame = Image.new("RGBA", (w, h))
+                else:
+                    frame = video[frame_idx].convert("RGBA")
+                    failed = False
+
+                f_x = (i % col) * (w + edge_offset)
+                f_y = (i // col) * (h + edge_offset) + h_start_offset
+                combined_frame.paste(frame, (f_x, f_y))
+
+                # Draw name
+                font = ImageFont.truetype("DejaVuSans.ttf", 12)
+                # font = ImageFont.truetype("DejaVuSans-Bold.ttf", 13)
+                # font = ImageFont.truetype("timesbd.ttf", 14)
+                name = self.names[i] if not failed else f'{self.names[i]} (failed)'
+                draw.text(
+                    (f_x + 10, f_y - 20),
+                    name, (255, 255, 255), font=font
+                )
+
+            combined_frames.append(combined_frame)
+        return combined_frames
+
+
+class BoundingBoxesListReader(Reader):
+    def __init__(
+        self, dir_name, resize=None, read=True, max_length=MAX_LENGTH,
+        scale=1
+    ):
+        self.resize = resize
+        self.scale = scale
+        super().__init__(dir_name, read, max_length)
+
+    def read_file(self, filename):
+        bboxes = np.loadtxt(filename, dtype=int)
+        bboxes = [bboxes.tolist()]
+        return bboxes
+
+
+def save_frames_to_dir(frames, dirname):
+    reader = FrameReader(dirname, read=False)
+    reader.set_files(frames)
+    reader.save_files(dirname)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    if args.reader_type is None:
+        reader = FrameReader(args.video_dir)
+    elif args.reader_type == 'fg':
+        reader = ForegroundReader(
+            args.video_dir, args.segms_dir, args.fg_dir)
+    elif args.reader_type == 'sy':
+        reader = SynthesizedFrameReader(
+            args.bg_frames_dir, args.fg_frames_dir,
+            args.fg_segms_dir, args.fg_dir, args.syn_frames_dir
+        )
+    elif args.reader_type == 'com':
+        reader = CompareFramesReader(
+            args.video_dirs
+        )
+    reader.write_files_to_video(
+        os.path.join(args.output_dir, args.output_filename),
+        fps=args.fps
+    )
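As a quick orientation to the class hierarchy above, a usage sketch of `FrameReader` (the directory and output paths are placeholders): it eagerly loads and resizes every image under a directory, behaves like a sequence, and can dump the frames to an MJPG video.

from readers import FrameReader

reader = FrameReader('some_video_frames/', max_length=60)
print(len(reader), reader[0].size)   # frame count and (W, H) of the first frame
for frame in reader:                 # __iter__/__next__ walk self.files
    pass                             # each item is a PIL.Image
reader.write_files_to_video('preview.avi', fps=6)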
FGT_codes/FGT/data/util/util.py
ADDED
@@ -0,0 +1,259 @@
+import os
+import argparse
+import shutil
+from glob import glob
+
+import numpy as np
+from PIL import Image
+
+from utils.logging_config import logger
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-v', '--video_dir',
+        type=str,
+        help="Video directory name"
+    )
+    parser.add_argument(
+        '-fl', '--flow_dir',
+        type=str,
+        help="Optical flow ground truth directory name"
+    )
+    parser.add_argument(
+        '-od', '--output_dir',
+        type=str,
+        help="Output directory name"
+    )
+    parser.add_argument(
+        '-o', '--output_filename',
+        type=str,
+        help="Output filename"
+    )
+    args = parser.parse_args()
+    return args
+
+
+def make_dirs(dir_name):
+    if not os.path.exists(dir_name):
+        os.makedirs(dir_name)
+        logger.info(f"Directory {dir_name} made")
+
+
+ensure_dir = make_dirs
+
+
+def make_dir_under_root(root_dir, name):
+    full_dir_name = os.path.join(root_dir, name)
+    make_dirs(full_dir_name)
+    return full_dir_name
+
+
+def rm_dirs(dir_name, ignore_errors=False):
+    if os.path.exists(dir_name):
+        shutil.rmtree(dir_name, ignore_errors)
+        logger.info(f"Directory {dir_name} removed")
+
+
+def read_dirnames_under_root(root_dir, skip_list=[]):
+    dirnames = [
+        name for i, name in enumerate(sorted(os.listdir(root_dir)))
+        if (os.path.isdir(os.path.join(root_dir, name))
+            and name not in skip_list
+            and i not in skip_list)
+    ]
+    logger.info(f"Reading directories under {root_dir}, exclude {skip_list}, num: {len(dirnames)}")
+    return dirnames
+
+
+def bbox_offset(bbox, location):
+    x0, y0 = location
+    (x1, y1), (x2, y2) = bbox
+    return ((x1 + x0, y1 + y0), (x2 + x0, y2 + y0))
+
+
+def cover2_bbox(bbox1, bbox2):
+    x1 = min(bbox1[0][0], bbox2[0][0])
+    y1 = min(bbox1[0][1], bbox2[0][1])
+    x2 = max(bbox1[1][0], bbox2[1][0])
+    y2 = max(bbox1[1][1], bbox2[1][1])
+    return ((x1, y1), (x2, y2))
+
+
+def extend_r_bbox(bbox, w, h, r):
+    (x1, y1), (x2, y2) = bbox
+    x1 = max(x1 - r, 0)
+    x2 = min(x2 + r, w)
+    y1 = max(y1 - r, 0)
+    y2 = min(y2 + r, h)
+    return ((x1, y1), (x2, y2))
+
+
+def mean_squared_error(A, B):
+    return np.square(np.subtract(A, B)).mean()
+
+
+def bboxes_to_mask(size, bboxes):
+    mask = Image.new("L", size, 255)
+    mask = np.array(mask)
+    for bbox in bboxes:
+        try:
+            (x1, y1), (x2, y2) = bbox
+        except Exception:
+            (x1, y1, x2, y2) = bbox
+
+        mask[y1:y2, x1:x2] = 0
+    mask = Image.fromarray(mask.astype("uint8"))
+    return mask
+
+
+def get_extended_from_box(img_size, box, patch_size):
+    def _decide_patch_num(box_width, patch_size):
+        num = np.ceil(box_width / patch_size).astype(int)
+        if (num * patch_size - box_width) < (patch_size // 2):
+            num += 1
+        return num
+
+    x1, y1 = box[0]
+    x2, y2 = box[1]
+    new_box = (x1, y1, x2 - x1, y2 - y1)
+    box_x_start, box_y_start, box_x_size, box_y_size = new_box
+
+    patchN_x = _decide_patch_num(box_x_size, patch_size)
+    patchN_y = _decide_patch_num(box_y_size, patch_size)
+
+    extend_x = (patch_size * patchN_x - box_x_size) // 2
+    extend_y = (patch_size * patchN_y - box_y_size) // 2
+    img_x_size = img_size[0]
+    img_y_size = img_size[1]
+
+    x_start = max(0, box_x_start - extend_x)
+    x_end = min(box_x_start - extend_x + patchN_x * patch_size, img_x_size)
+
+    y_start = max(0, box_y_start - extend_y)
+    y_end = min(box_y_start - extend_y + patchN_y * patch_size, img_y_size)
+    x_start, y_start, x_end, y_end = int(x_start), int(y_start), int(x_end), int(y_end)
+    extented_box = ((x_start, y_start), (x_end, y_end))
+    return extented_box
+
+
+# code modified from https://github.com/WonwoongCho/Generative-Inpainting-pytorch/blob/master/util.py
+def spatial_discounting_mask(mask_width, mask_height, discounting_gamma):
+    """Generate spatial discounting mask constant.
+    Spatial discounting mask is first introduced in publication:
+    Generative Image Inpainting with Contextual Attention, Yu et al.
+    Returns:
+        np.array: spatial discounting mask
+    """
+    gamma = discounting_gamma
+    mask_values = np.ones((mask_width, mask_height), dtype=np.float32)
+    for i in range(mask_width):
+        for j in range(mask_height):
+            mask_values[i, j] = max(
+                gamma**min(i, mask_width - i),
+                gamma**min(j, mask_height - j))
+
+    return mask_values
+
+
+def bboxes_to_discounting_loss_mask(img_size, bboxes, discounting_gamma=0.99):
+    mask = np.zeros(img_size, dtype=np.float32) + 0.5
+    for bbox in bboxes:
+        try:
+            (x1, y1), (x2, y2) = bbox
+        except Exception:
+            (x1, y1, x2, y2) = bbox
+        mask_width, mask_height = y2 - y1, x2 - x1
+        mask[y1:y2, x1:x2] = spatial_discounting_mask(mask_width, mask_height, discounting_gamma)
+    return mask
+
+
+def find_proper_window(image_size, bbox_point):
+    '''
+    parameters:
+        image_size(2-tuple): (height, width)
+        bbox_point(2-2-tuple): (first_point, last_point)
+    return values:
+        window left-up point, (2-tuple)
+        window right-bottom point, (2-tuple)
+    '''
+    bbox_height = bbox_point[1][0] - bbox_point[0][0]
+    bbox_width = bbox_point[1][1] - bbox_point[0][1]
+
+    window_size = min(
+        max(bbox_height, bbox_width) * 2,
+        image_size[0], image_size[1]
+    )
+    # Limit min window size due to the requirement of VGG16
+    window_size = max(window_size, 32)
+
+    horizontal_span = window_size - (bbox_point[1][1] - bbox_point[0][1])
+    vertical_span = window_size - (bbox_point[1][0] - bbox_point[0][0])
+
+    top_bound, bottom_bound = bbox_point[0][0] - \
+        vertical_span // 2, bbox_point[1][0] + vertical_span // 2
+    left_bound, right_bound = bbox_point[0][1] - \
+        horizontal_span // 2, bbox_point[1][1] + horizontal_span // 2
+
+    if left_bound < 0:
+        right_bound += 0 - left_bound
+        left_bound += 0 - left_bound
+    elif right_bound > image_size[1]:
+        left_bound -= right_bound - image_size[1]
+        right_bound -= right_bound - image_size[1]
+
+    if top_bound < 0:
+        bottom_bound += 0 - top_bound
+        top_bound += 0 - top_bound
+    elif bottom_bound > image_size[0]:
+        top_bound -= bottom_bound - image_size[0]
+        bottom_bound -= bottom_bound - image_size[0]
+
+    return (top_bound, left_bound), (bottom_bound, right_bound)
+
+
+def drawrect(drawcontext, xy, outline=None, width=0, partial=None):
+    (x1, y1), (x2, y2) = xy
+    if partial is None:
+        points = (x1, y1), (x2, y1), (x2, y2), (x1, y2), (x1, y1)
+        drawcontext.line(points, fill=outline, width=width)
+    else:
+        drawcontext.line([(x1, y1), (x1, y1 + partial)], fill=outline, width=width)
+        drawcontext.line([(x1 + partial, y1), (x1, y1)], fill=outline, width=width)
+
+        drawcontext.line([(x2, y1), (x2, y1 + partial)], fill=outline, width=width)
+        drawcontext.line([(x2, y1), (x2 - partial, y1)], fill=outline, width=width)
+
+        drawcontext.line([(x1, y2), (x1 + partial, y2)], fill=outline, width=width)
+        drawcontext.line([(x1, y2), (x1, y2 - partial)], fill=outline, width=width)
+
+        drawcontext.line([(x2 - partial, y2), (x2, y2)], fill=outline, width=width)
+        drawcontext.line([(x2, y2), (x2, y2 - partial)], fill=outline, width=width)
+
+
+def get_everything_under(root_dir, pattern='*', only_dirs=False, only_files=False):
+    assert not (only_dirs and only_files), 'You will get nothing '\
+        'when "only_dirs" and "only_files" are both set to True'
+    everything = sorted(glob(os.path.join(root_dir, pattern)))
+    if only_dirs:
+        everything = [f for f in everything if os.path.isdir(f)]
+    if only_files:
+        everything = [f for f in everything if os.path.isfile(f)]
+
+    return everything
+
+
+def read_filenames_from_dir(dir_name, reader, max_length=None):
+    logger.debug(
+        f"{reader} reading files from {dir_name}")
+    filenames = []
+    for root, dirs, files in os.walk(dir_name):
+        assert len(dirs) == 0, f"There are directories: {dirs} in {root}"
+        assert len(files) != 0, f"There are no files in {root}"
+        filenames = [os.path.join(root, name) for name in sorted(files)]
+        for name in filenames:
+            logger.debug(name)
+    if max_length is not None:
+        return filenames[:max_length]
+    return filenames
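To make the weighting scheme of `spatial_discounting_mask` concrete, a small sketch (the sizes and gamma below are illustrative): each entry is gamma raised to the distance from the nearest edge of the hole, so reconstruction errors near the hole boundary weigh more than errors deep inside it.

from util import spatial_discounting_mask

m = spatial_discounting_mask(8, 8, discounting_gamma=0.9)
print(m.shape)   # (8, 8)
print(m[0, 0])   # 1.0 on the border: 0.9 ** 0
print(m[4, 4])   # 0.9 ** 4 ~= 0.656 at the center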
FGT_codes/FGT/data/util/utils.py
ADDED
@@ -0,0 +1,158 @@
+import random
+import numpy as np
+import cv2
+
+
+def random_bbox(img_height, img_width, vertical_margin, horizontal_margin, mask_height, mask_width):
+    maxt = img_height - vertical_margin - mask_height
+    maxl = img_width - horizontal_margin - mask_width
+
+    t = random.randint(vertical_margin, maxt)
+    l = random.randint(horizontal_margin, maxl)
+    h = random.randint(mask_height // 2, mask_height)
+    w = random.randint(mask_width // 2, mask_width)
+    return (t, l, h, w)  # generate a random block-shaped box; this box later becomes the mask
+
+
+def mid_bbox_mask(img_height, img_width, mask_height, mask_width):
+    def npmask(bbox, height, width):
+        mask = np.zeros((height, width, 1), np.float32)
+        mask[bbox[0]: bbox[0] + bbox[2], bbox[1]: bbox[1] + bbox[3], :] = 255.
+        return mask
+
+    bbox = (img_height * 3 // 8, img_width * 3 // 8, mask_height, mask_width)
+    mask = npmask(bbox, img_height, img_width)
+
+    return mask
+
+
+def bbox2mask(img_height, img_width, max_delta_height, max_delta_width, bbox):
+    """Generate mask tensor from bbox.
+
+    Args:
+        bbox: configuration tuple, (top, left, height, width)
+        config: Config should have configuration including IMG_SHAPES,
+            MAX_DELTA_HEIGHT, MAX_DELTA_WIDTH.
+
+    Returns:
+        tf.Tensor: output with shape [B, 1, H, W]
+
+    """
+
+    def npmask(bbox, height, width, delta_h, delta_w):
+        mask = np.zeros((height, width, 1), np.float32)
+        h = np.random.randint(delta_h // 2 + 1)  # the +1 avoids a zero upper bound
+        w = np.random.randint(delta_w // 2 + 1)
+        mask[bbox[0] + h: bbox[0] + bbox[2] - h, bbox[1] + w: bbox[1] + bbox[3] - w, :] = 255.  # height_true = height - 2 * h, width_true = width - 2 * w
+        return mask
+
+    mask = npmask(bbox, img_height, img_width,
+                  max_delta_height,
+                  max_delta_width)
+
+    return mask
+
+
+def matrix2bbox(img_height, img_width, mask_height, mask_width, row, column):
+    """Generate masks with a matrix form
+    @param img_height
+    @param img_width
+    @param mask_height
+    @param mask_width
+    @param row: number of blocks in row
+    @param column: number of blocks in column
+    @return mbbox: multiple bboxes in (y, x, h, w) manner
+    """
+    assert img_height - column * mask_height > img_height // 2, "Too many masks across a column"
+    assert img_width - row * mask_width > img_width // 2, "Too many masks across a row"
+
+    interval_height = (img_height - column * mask_height) // (column + 1)
+    interval_width = (img_width - row * mask_width) // (row + 1)
+
+    mbbox = []
+    for i in range(row):
+        for j in range(column):
+            y = interval_height * (j + 1) + j * mask_height
+            x = interval_width * (i + 1) + i * mask_width
+            mbbox.append((y, x, mask_height, mask_width))
+    return mbbox
+
+
+def mbbox2masks(img_height, img_width, mbbox):
+
+    def npmask(mbbox, height, width):
+        mask = np.zeros((height, width, 1), np.float32)
+        for bbox in mbbox:
+            mask[bbox[0]: bbox[0] + bbox[2], bbox[1]: bbox[1] + bbox[3], :] = 255.
+        return mask
+
+    mask = npmask(mbbox, img_height, img_width)
+
+    return mask
+
+
+def draw_line(mask, startX, startY, angle, length, brushWidth):
+    """assume the size of mask is (H, W, 1)
+    """
+    assert len(mask.shape) == 2 or mask.shape[2] == 1, "The channel of mask doesn't fit the opencv format"
+    offsetX = int(np.round(length * np.cos(angle)))
+    offsetY = int(np.round(length * np.sin(angle)))
+    endX = startX + offsetX
+    endY = startY + offsetY
+    if endX > mask.shape[1]:
+        endX = mask.shape[1]
+    if endY > mask.shape[0]:
+        endY = mask.shape[0]
+    mask_processed = cv2.line(mask, (startX, startY), (endX, endY), 255, brushWidth)
+    return mask_processed, endX, endY
+
+
+def draw_circle(mask, circle_x, circle_y, brushWidth):
+    radius = brushWidth // 2
+    assert len(mask.shape) == 2 or mask.shape[2] == 1, "The channel of mask doesn't fit the opencv format"
+    mask_processed = cv2.circle(mask, (circle_x, circle_y), radius, 255)
+    return mask_processed
+
+
+def freeFormMask(img_height, img_width, maxVertex, maxLength, maxBrushWidth, maxAngle):
+    mask = np.zeros((img_height, img_width))
+    numVertex = random.randint(1, maxVertex)
+    startX = random.randint(10, img_width)
+    startY = random.randint(10, img_height)
+    brushWidth = random.randint(10, maxBrushWidth)
+    for i in range(numVertex):
+        angle = random.uniform(0, maxAngle)
+        if i % 2 == 0:
+            angle = 2 * np.pi - angle
+        length = random.randint(10, maxLength)
+        mask, endX, endY = draw_line(mask, startX, startY, angle, length, brushWidth)
+        startX = startX + int(length * np.sin(angle))
+        startY = startY + int(length * np.cos(angle))
+        mask = draw_circle(mask, endX, endY, brushWidth)
+
+    if random.random() < 0.5:
+        mask = np.fliplr(mask)
+    if random.random() < 0.5:
+        mask = np.flipud(mask)
+
+    if len(mask.shape) == 2:
+        mask = mask[:, :, np.newaxis]
+
+    return mask
+
+
+if __name__ == "__main__":
+    # for stationary mask generation
+    # stationary_mask_generator(240, 480, 50, 120)
+
+    # for free-form mask generation
+    # mask = freeFormMask(240, 480, 30, 50, 20, np.pi)
+    # cv2.imwrite('mask.png', mask)
+
+    # for matrix mask generation
+    # img_height, img_width = 240, 480
+    # masks = matrix2bbox(240, 480, 20, 20, 5, 4)
+    # matrixMask = mbbox2masks(img_height, img_width, masks)
+    # cv2.imwrite('matrixMask.png', matrixMask)
+    pass
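A short sketch of the block-mask pipeline these helpers form (the frame size and mask bounds are illustrative): `random_bbox` samples a `(top, left, height, width)` box, and `bbox2mask` rasterizes it while shrinking each side by a random delta.

import cv2
from utils import random_bbox, bbox2mask

img_h, img_w = 240, 432
bbox = random_bbox(img_h, img_w, vertical_margin=0, horizontal_margin=0,
                   mask_height=100, mask_width=180)
mask = bbox2mask(img_h, img_w, max_delta_height=32, max_delta_width=32, bbox=bbox)
print(mask.shape)  # (240, 432, 1), values in {0.0, 255.0}
cv2.imwrite('bbox_mask.png', mask)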
FGT_codes/FGT/flowCheckPoint/config.yaml
ADDED
@@ -0,0 +1,11 @@
+PASSMASK: 1
+cnum: 48
+conv_type: vanilla
+flow_interval: 1
+in_channel: 3
+init_weights: 1
+num_flows: 1
+resBlocks: 1
+use_bias: 1
+use_residual: 1
+model: lafc_single
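The keys above are consumed as a plain dictionary; a minimal loading sketch (path relative to `FGT_codes/FGT`, assuming PyYAML is installed):

import yaml

with open('flowCheckPoint/config.yaml') as f:
    cfg = yaml.safe_load(f)
print(cfg['model'], cfg['cnum'])  # lafc_single 48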
FGT_codes/FGT/flowCheckPoint/lafc_single.pth.tar
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0fa168e8b711852c458594cddf4262afdb81e096253197a802a29b4dec9d6d12
+size 11547053
FGT_codes/FGT/inputs.py
ADDED
@@ -0,0 +1,83 @@
1 |
+
import argparse
|
2 |
+
|
3 |
+
|
4 |
+
def args_parser():
|
5 |
+
parser = argparse.ArgumentParser(description="General top layer trainer")
|
6 |
+
parser.add_argument("--opt", type=str, default="config/train.yaml", help="Path to optional configuration file")
|
7 |
+
parser.add_argument('--model', type=str, default='model',
|
8 |
+
help='Model block name, in the `model` directory')
|
9 |
+
parser.add_argument('--name', type=str, default='FGT_train', help='Experiment name')
|
10 |
+
parser.add_argument('--outputdir', type=str, default='/myData/ret/experiments', help='Output dir to save results')
|
11 |
+
parser.add_argument('--datadir', type=str, default='/myData/', metavar='PATH')
|
12 |
+
parser.add_argument('--datasetName_train', type=str, default='train_dataset_frames_diffusedFlows',
|
13 |
+
help='The file name of the train dataset, in `data` directory')
|
14 |
+
parser.add_argument('--network', type=str, default='network',
|
15 |
+
help='The network file which defines the training process, in the `network` directory')
|
16 |
+
parser.add_argument('--finetune', type=int, default=0, help='Whether to fine tune trained models')
|
17 |
+
# parser.add_argument('--checkPoint', type=str, default='', help='checkpoint path for continue training')
|
18 |
+
parser.add_argument('--gen_state', type=str, default='', help='Checkpoint of the generator')
|
19 |
+
parser.add_argument('--dis_state', type=str, default='', help='Checkpoint of the discriminator')
|
20 |
+
parser.add_argument('--opt_state', type=str, default='', help='Checkpoint of the options')
|
21 |
+
parser.add_argument('--record_iter', type=int, default=16, help='How many iters to print an item of log')
|
22 |
+
parser.add_argument('--flow_checkPoint', type=str, default='flowCheckPoint/',
|
23 |
+
help='The path for flow model filling')
|
24 |
+
parser.add_argument('--dataMode', type=str, default='resize', choices=['resize', 'crop'])
|
25 |
+
|
26 |
+
# data related parameters
|
27 |
+
parser.add_argument('--flow2rgb', type=int, default=1, help='Whether to transform flows from raw data to rgb')
|
28 |
+
parser.add_argument('--flow_direction', type=str, default='for', choices=['for', 'back', 'bi'],
|
29 |
+
help='Which GT flow should be chosen for guidance')
|
30 |
+
    parser.add_argument('--num_frames', type=int, default=5, help='How many frames are chosen for frame completion')
    parser.add_argument('--sample', type=str, default='random', choices=['random', 'seq'],
                        help='Choose the sample method for training in each iteration')
    parser.add_argument('--max_val', type=float, default=0.01, help='The maximal value to quantize the optical flows')

    # model related parameters
    parser.add_argument('--res_h', type=int, default=240, help='The height of the frame resolution')
    parser.add_argument('--res_w', type=int, default=432, help='The width of the frame resolution')
    parser.add_argument('--in_channel', type=int, default=4, help='The input channel of the frame branch')
    parser.add_argument('--cnum', type=int, default=64, help='The initial channel number of the frame branch')
    parser.add_argument('--flow_inChannel', type=int, default=2, help='The input channel of the flow branch')
    parser.add_argument('--flow_cnum', type=int, default=64, help='The initial channel dimension of the flow branch')
    parser.add_argument('--dist_cnum', type=int, default=32, help='The initial channel number in the discriminator')
    parser.add_argument('--frame_hidden', type=int, default=512,
                        help='The channel / patch dimension in the frame branch')
    parser.add_argument('--flow_hidden', type=int, default=256, help='The channel / patch dimension in the flow branch')
    parser.add_argument('--PASSMASK', type=int, default=1,
                        help='1 -> concat the mask with the corrupted optical flows to fill the flow')
    parser.add_argument('--numBlocks', type=int, default=8, help='How many transformer blocks to stack')
    parser.add_argument('--kernel_size_w', type=int, default=7, help='The width of the kernel for extracting patches')
    parser.add_argument('--kernel_size_h', type=int, default=7, help='The height of the kernel for extracting patches')
    parser.add_argument('--stride_h', type=int, default=3, help='The height of the stride')
    parser.add_argument('--stride_w', type=int, default=3, help='The width of the stride')
    parser.add_argument('--pad_h', type=int, default=3, help='The height of the padding')
    parser.add_argument('--pad_w', type=int, default=3, help='The width of the padding')
    parser.add_argument('--num_head', type=int, default=4, help='The head number for the multihead attention')
    parser.add_argument('--conv_type', type=str, choices=['vanilla', 'gated', 'partial'], default='vanilla',
                        help='Which kind of conv to use')
    parser.add_argument('--norm', type=str, default='None', choices=['None', 'BN', 'SN', 'IN'],
                        help='The normalization method for the conv blocks')
    parser.add_argument('--use_bias', type=int, default=1, help='If 1, use bias in the convolution blocks')
    parser.add_argument('--ape', type=int, default=1, help='If ape = 1, use absolute positional embedding')
    parser.add_argument('--pos_mode', type=str, default='single', choices=['single', 'dual'],
                        help='If pos_mode = dual, add positional embedding to flow patches')
    parser.add_argument('--mlp_ratio', type=int, default=40, help='The mlp dilation rate for the feed forward layers')
    parser.add_argument('--drop', type=int, default=0, help='The dropout rate, 0 by default')
    parser.add_argument('--init_weights', type=int, default=1, help='If 1, initialize the network, 1 by default')

    # loss related parameters
    parser.add_argument('--L1M', type=float, default=1, help='The weight of L1 loss in the masked area')
    parser.add_argument('--L1V', type=float, default=1, help='The weight of L1 loss in the valid area')
    parser.add_argument('--adv', type=float, default=0.01, help='The weight of adversarial loss')

    # spatial and temporal related parameters
    parser.add_argument('--tw', type=int, default=2, help='The number of temporal groups in the temporal transformer')
    parser.add_argument('--sw', type=int, default=8,
                        help='The spatial window size in the spatial transformer')
    parser.add_argument('--gd', type=int, default=4, help='Global downsample rate for the spatial transformer')

    parser.add_argument('--ref_length', type=int, default=10, help='The sample interval during inference')
    parser.add_argument('--use_valid', action='store_true')

    args = parser.parse_args()
    return args
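For orientation, a quick sanity check of the patch-token geometry that these defaults imply; this is a hypothetical sketch, not part of the original repo.

# Hypothetical sanity check of the default patch geometry (not in the original repo).
res_h, res_w = 240, 432                         # --res_h / --res_w
feat_h, feat_w = res_h // 4, res_w // 4         # the encoder downsamples by 4 -> (60, 108)
k, s, p = 7, 3, 3                               # --kernel_size_*, --stride_*, --pad_*
tokens_h = (feat_h + 2 * p - k) // s + 1        # 20
tokens_w = (feat_w + 2 * p - k) // s + 1        # 36
print(tokens_h, tokens_w, tokens_h * tokens_w)  # 20 36 720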
FGT_codes/FGT/metrics/__init__.py
ADDED
@@ -0,0 +1,31 @@
import numpy as np
from skimage.metrics import peak_signal_noise_ratio as psnr
from skimage.metrics import structural_similarity as ssim
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


def calculate_metrics(results, gts):
    B, H, W, C = results.shape
    psnr_values, ssim_values, L1errors, L2errors = [], [], [], []
    for i in range(B):
        result = results[i]
        gt = gts[i]
        result_img = result
        gt_img = gt
        # cast to float before subtracting so uint8 inputs do not wrap around
        residual = result.astype(np.float64) - gt.astype(np.float64)
        L1error = np.mean(np.abs(residual))
        L2error = np.sum(residual ** 2) ** 0.5 / (H * W * C)
        psnr_value = psnr(result_img, gt_img)
        ssim_value = ssim(result_img, gt_img, multichannel=True)
        L1errors.append(L1error)
        L2errors.append(L2error)
        psnr_values.append(psnr_value)
        ssim_values.append(ssim_value)
    L1_value = np.mean(L1errors)
    L2_value = np.mean(L2errors)
    psnr_value = np.mean(psnr_values)
    ssim_value = np.mean(ssim_values)

    return {'l1': L1_value, 'l2': L2_value, 'psnr': psnr_value, 'ssim': ssim_value}
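A minimal usage sketch (not part of the repo); uint8 inputs in [0, 255] keep skimage's default data_range valid.

# Hypothetical usage (illustrative values).
import numpy as np
fake_results = np.random.randint(0, 256, (2, 240, 432, 3), dtype=np.uint8)
fake_gts = np.random.randint(0, 256, (2, 240, 432, 3), dtype=np.uint8)
print(calculate_metrics(fake_results, fake_gts))  # {'l1': ..., 'l2': ..., 'psnr': ..., 'ssim': ...}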
FGT_codes/FGT/metrics/psnr.py
ADDED
@@ -0,0 +1,10 @@
import numpy
import math


def psnr(img1, img2):
    mse = numpy.mean((img1 - img2) ** 2)
    if mse == 0:
        return 100
    PIXEL_MAX = 255.0
    return 20 * math.log10(PIXEL_MAX / math.sqrt(mse))
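Quick check (not part of the repo): identical inputs hit the mse == 0 branch and return the 100 dB sentinel.

print(psnr(numpy.zeros((4, 4)), numpy.zeros((4, 4))))  # 100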
FGT_codes/FGT/metrics/ssim.py
ADDED
@@ -0,0 +1,46 @@
import cv2
import numpy as np


def calculate_ssim(img1, img2):
    C1 = (0.01 * 255) ** 2
    C2 = (0.03 * 255) ** 2

    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())

    mu1 = cv2.filter2D(img1, -1, window)[5:-5, 5:-5]  # valid
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1 ** 2
    mu2_sq = mu2 ** 2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = cv2.filter2D(img1 ** 2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2 ** 2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img1 * img2, -1, window)[5:-5, 5:-5] - mu1_mu2

    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) *
                                                            (sigma1_sq + sigma2_sq + C2))
    return ssim_map.mean()


def ssim(img1, img2):
    '''calculate SSIM
    produces the same outputs as MATLAB's implementation
    img1, img2: [0, 255]
    '''
    if not img1.shape == img2.shape:
        raise ValueError('Input images must have the same dimensions.')
    if img1.ndim == 2:
        return calculate_ssim(img1, img2)
    elif img1.ndim == 3:
        if img1.shape[2] == 3:
            ssims = []
            for i in range(3):
                ssims.append(calculate_ssim(img1[:, :, i], img2[:, :, i]))
            return np.array(ssims).mean()
        elif img1.shape[2] == 1:
            return calculate_ssim(np.squeeze(img1), np.squeeze(img2))
    else:
        raise ValueError('Wrong input image dimensions.')
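Hedged usage sketch (not part of the repo): inputs are [0, 255] images at least 11x11 pixels (the Gaussian window size); identical inputs give exactly 1.0.

img_a = np.random.randint(0, 256, (64, 64, 3)).astype(np.float64)
print(ssim(img_a, img_a))  # 1.0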
FGT_codes/FGT/models/BaseNetwork.py
ADDED
@@ -0,0 +1,46 @@
from .utils.network_blocks_2d import *


class BaseNetwork(nn.Module):
    def __init__(self, conv_type):
        super(BaseNetwork, self).__init__()
        self.conv_type = conv_type
        if conv_type == 'gated':
            self.ConvBlock = GatedConv
            self.DeconvBlock = GatedDeconv
        if conv_type == 'partial':
            self.ConvBlock = PartialConv
            self.DeconvBlock = PartialDeconv
        if conv_type == 'vanilla':
            self.ConvBlock = VanillaConv
            self.DeconvBlock = VanillaDeconv
        self.ConvBlock2d = self.ConvBlock
        self.DeconvBlock2d = self.DeconvBlock

    def init_weights(self, init_type='normal', gain=0.02):
        '''
        initialize network's weights
        init_type: normal | xavier | kaiming | orthogonal
        https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/9451e70673400885567d08a9e97ade2524c700d0/models/networks.py#L39
        '''

        def init_func(m):
            classname = m.__class__.__name__
            if hasattr(m, 'weight') and (classname.find('Conv') != -1 or classname.find('Linear') != -1):
                if init_type == 'normal':
                    nn.init.normal_(m.weight.data, 0.0, gain)
                elif init_type == 'xavier':
                    nn.init.xavier_normal_(m.weight.data, gain=gain)
                elif init_type == 'kaiming':
                    nn.init.kaiming_normal_(m.weight.data, a=0, mode='fan_in')
                elif init_type == 'orthogonal':
                    nn.init.orthogonal_(m.weight.data, gain=gain)

                if hasattr(m, 'bias') and m.bias is not None:
                    nn.init.constant_(m.bias.data, 0.0)

            elif classname.find('BatchNorm2d') != -1:
                nn.init.normal_(m.weight.data, 1.0, gain)
                nn.init.constant_(m.bias.data, 0.0)

        self.apply(init_func)
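A hypothetical subclass sketch (not part of the repo) showing how conv_type selects the block classes; it assumes the wrappers in models/utils/network_blocks_2d.py accept these keyword arguments, as the networks below use them.

class TinyHead(BaseNetwork):
    def __init__(self, conv_type='vanilla'):
        super(TinyHead, self).__init__(conv_type)
        # ConvBlock2d resolves to VanillaConv / GatedConv / PartialConv depending on conv_type
        self.conv = self.ConvBlock2d(3, 16, kernel_size=3, stride=1, padding=1, norm=None)
        self.init_weights(init_type='xavier', gain=0.02)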
FGT_codes/FGT/models/__init__.py
ADDED
File without changes
|
FGT_codes/FGT/models/__pycache__/BaseNetwork.cpython-39.pyc
ADDED
Binary file (1.97 kB)
FGT_codes/FGT/models/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (163 Bytes)
FGT_codes/FGT/models/__pycache__/model.cpython-39.pyc
ADDED
Binary file (10.3 kB)
FGT_codes/FGT/models/lafc_single.py
ADDED
@@ -0,0 +1,114 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
import functools
from .BaseNetwork import BaseNetwork
from models.utils.reconstructionLayers import make_layer, ResidualBlock_noBN


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.net = P3DNet(config['num_flows'], config['cnum'], config['in_channel'], config['PASSMASK'],
                          config['use_residual'],
                          config['resBlocks'], config['use_bias'], config['conv_type'], config['init_weights'])

    def forward(self, flows, masks, edges=None):
        ret = self.net(flows, masks, edges)
        return ret


class P3DNet(BaseNetwork):
    def __init__(self, num_flows, num_feats, in_channels, passmask, use_residual, res_blocks,
                 use_bias, conv_type, init_weights):
        super().__init__(conv_type)
        self.passmask = passmask
        self.encoder2 = nn.Sequential(
            nn.ReplicationPad2d(2),
            self.ConvBlock2d(in_channels, num_feats, kernel_size=5, stride=1, padding=0, bias=use_bias, norm=None),
            self.ConvBlock2d(num_feats, num_feats * 2, kernel_size=3, stride=2, padding=1, bias=use_bias, norm=None)
        )
        self.encoder4 = nn.Sequential(
            self.ConvBlock2d(num_feats * 2, num_feats * 2, kernel_size=3, stride=1, padding=1, bias=use_bias,
                             norm=None),
            self.ConvBlock2d(num_feats * 2, num_feats * 4, kernel_size=3, stride=2, padding=1, bias=use_bias, norm=None)
        )
        residualBlock = functools.partial(ResidualBlock_noBN, nf=num_feats * 4)
        self.res_blocks = make_layer(residualBlock, res_blocks)
        self.resNums = res_blocks
        # dilated convolutions to enlarge the receptive field
        self.middle = nn.Sequential(
            self.ConvBlock2d(num_feats * 4, num_feats * 4, kernel_size=3, stride=1, padding=8, bias=use_bias,
                             dilation=8, norm=None),
            self.ConvBlock2d(num_feats * 4, num_feats * 4, kernel_size=3, stride=1, padding=4, bias=use_bias,
                             dilation=4, norm=None),
            self.ConvBlock2d(num_feats * 4, num_feats * 4, kernel_size=3, stride=1, padding=2, bias=use_bias,
                             dilation=2, norm=None),
            self.ConvBlock2d(num_feats * 4, num_feats * 4, kernel_size=3, stride=1, padding=1, bias=use_bias,
                             dilation=1, norm=None),
        )
        self.decoder2 = nn.Sequential(
            self.DeconvBlock2d(num_feats * 8, num_feats * 2, kernel_size=3, stride=1, padding=1, bias=use_bias,
                               norm=None),
            self.ConvBlock2d(num_feats * 2, num_feats * 2, kernel_size=3, stride=1, padding=1, bias=use_bias,
                             norm=None),
            self.ConvBlock2d(num_feats * 2, num_feats * 2, kernel_size=3, stride=1, padding=1, bias=use_bias,
                             norm=None)
        )
        self.decoder = nn.Sequential(
            self.DeconvBlock2d(num_feats * 4, num_feats, kernel_size=3, stride=1, padding=1, bias=use_bias,
                               norm=None),
            self.ConvBlock2d(num_feats, num_feats // 2, kernel_size=3, stride=1, padding=1, bias=use_bias,
                             norm=None),
            self.ConvBlock2d(num_feats // 2, 2, kernel_size=3, stride=1, padding=1, bias=use_bias,
                             norm=None)
        )
        self.edgeDetector = EdgeDetection(conv_type)
        if init_weights:
            self.init_weights()

    def forward(self, flows, masks, edges=None):
        if self.passmask:
            inputs = torch.cat((flows, masks), dim=1)
        else:
            inputs = flows
        if edges is not None:
            inputs = torch.cat((inputs, edges), dim=1)
        e2 = self.encoder2(inputs)
        e4 = self.encoder4(e2)
        if self.resNums > 0:
            e4_res = self.res_blocks(e4)
        else:
            e4_res = e4
        c_e4_filled = self.middle(e4_res)
        c_e4 = torch.cat((c_e4_filled, e4), dim=1)
        c_e2Post = self.decoder2(c_e4)
        c_e2 = torch.cat((c_e2Post, e2), dim=1)
        output = self.decoder(c_e2)
        edge = self.edgeDetector(output)
        return output, edge


class EdgeDetection(BaseNetwork):
    def __init__(self, conv_type, in_channels=2, out_channels=1, mid_channels=16):
        super(EdgeDetection, self).__init__(conv_type)
        self.projection = self.ConvBlock2d(in_channels=in_channels, out_channels=mid_channels, kernel_size=3, stride=1,
                                           padding=1, norm=None)
        self.mid_layer_1 = self.ConvBlock2d(in_channels=mid_channels, out_channels=mid_channels, kernel_size=3,
                                            stride=1, padding=1, norm=None)
        self.mid_layer_2 = self.ConvBlock2d(in_channels=mid_channels, out_channels=mid_channels, kernel_size=3,
                                            stride=1, padding=1, activation=None, norm=None)
        self.l_relu = nn.LeakyReLU()
        self.out_layer = self.ConvBlock2d(in_channels=mid_channels, out_channels=out_channels, kernel_size=1,
                                          activation=None, norm=None)

    def forward(self, flow):
        flow = self.projection(flow)
        edge = self.mid_layer_1(flow)
        edge = self.mid_layer_2(edge)
        edge = self.l_relu(flow + edge)
        edge = self.out_layer(edge)
        edge = torch.sigmoid(edge)
        return edge
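Hedged usage sketch (not part of the repo): the config keys are read straight from the constructor above, the values are illustrative, and the output shapes assume the deconv blocks upsample by 2, as the encoder/decoder symmetry suggests.

cfg = {'num_flows': 1, 'cnum': 48, 'in_channel': 3, 'PASSMASK': 1, 'use_residual': 1,
       'resBlocks': 1, 'use_bias': 1, 'conv_type': 'vanilla', 'init_weights': 1}
net = Model(cfg)
flows = torch.randn(1, 2, 240, 432)       # corrupted flow
masks = torch.ones(1, 1, 240, 432)        # 1 = missing region
completed_flow, edge = net(flows, masks)  # expected: [1, 2, 240, 432] and [1, 1, 240, 432]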
FGT_codes/FGT/models/model.py
ADDED
@@ -0,0 +1,284 @@
from models.BaseNetwork import BaseNetwork
from models.transformer_base.ffn_base import FusionFeedForward
from models.transformer_base.attention_flow import SWMHSA_depthGlobalWindowConcatLN_qkFlow_reweightFlow
from models.transformer_base.attention_base import TMHSA

import torch
import torch.nn as nn
from functools import reduce
import torch.nn.functional as F


class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.net = FGT(config['tw'], config['sw'], config['gd'], config['input_resolution'], config['in_channel'],
                       config['cnum'], config['flow_inChannel'], config['flow_cnum'], config['frame_hidden'],
                       config['flow_hidden'], config['PASSMASK'],
                       config['numBlocks'], config['kernel_size'], config['stride'], config['padding'],
                       config['num_head'], config['conv_type'], config['norm'],
                       config['use_bias'], config['ape'],
                       config['mlp_ratio'], config['drop'], config['init_weights'])

    def forward(self, frames, flows, masks):
        ret = self.net(frames, flows, masks)
        return ret


class Encoder(nn.Module):
    def __init__(self, in_channels):
        super(Encoder, self).__init__()
        self.group = [1, 2, 4, 8, 1]
        self.layers = nn.ModuleList([
            nn.Conv2d(in_channels, 64, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1, groups=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(640, 512, kernel_size=3, stride=1, padding=1, groups=2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(768, 384, kernel_size=3, stride=1, padding=1, groups=4),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(640, 256, kernel_size=3, stride=1, padding=1, groups=8),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv2d(512, 128, kernel_size=3, stride=1, padding=1, groups=1),
            nn.LeakyReLU(0.2, inplace=True)
        ])

    def forward(self, x):
        bt, c, h, w = x.size()
        h, w = h // 4, w // 4
        out = x
        for i, layer in enumerate(self.layers):
            if i == 8:
                x0 = out
            if i > 8 and i % 2 == 0:
                g = self.group[(i - 8) // 2]
                x = x0.view(bt, g, -1, h, w)
                o = out.view(bt, g, -1, h, w)
                out = torch.cat([x, o], 2).view(bt, -1, h, w)
            out = layer(out)
        return out


class AddPosEmb(nn.Module):
    def __init__(self, h, w, in_channels, out_channels):
        super(AddPosEmb, self).__init__()
        self.proj = nn.Conv2d(in_channels, out_channels, 3, 1, 1, bias=True, groups=out_channels)
        self.h, self.w = h, w

    def forward(self, x, h=0, w=0):
        B, N, C = x.shape
        if h == 0 and w == 0:
            assert N == self.h * self.w, 'Wrong input size'
        else:
            assert N == h * w, 'Wrong input size during inference'
        feat_token = x
        if h == 0 and w == 0:
            cnn_feat = feat_token.transpose(1, 2).view(B, C, self.h, self.w)
        else:
            cnn_feat = feat_token.transpose(1, 2).view(B, C, h, w)
        x = self.proj(cnn_feat) + cnn_feat
        x = x.flatten(2).transpose(1, 2)
        return x


class Vec2Patch(nn.Module):
    def __init__(self, channel, hidden, output_size, kernel_size, stride, padding):
        super(Vec2Patch, self).__init__()
        self.relu = nn.LeakyReLU(0.2, inplace=True)
        c_out = reduce((lambda x, y: x * y), kernel_size) * channel
        self.embedding = nn.Linear(hidden, c_out)
        self.restore = nn.Fold(output_size=output_size, kernel_size=kernel_size, stride=stride, padding=padding)
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

    def forward(self, x, output_h=0, output_w=0):
        feat = self.embedding(x)
        feat = feat.permute(0, 2, 1)
        if output_h != 0 or output_w != 0:
            feat = F.fold(feat, output_size=(output_h, output_w), kernel_size=self.kernel_size, stride=self.stride,
                          padding=self.padding)
        else:
            feat = self.restore(feat)
        return feat


class TemporalTransformer(nn.Module):
    def __init__(self, token_size, frame_hidden, num_heads, t_groupSize, mlp_ratio, dropout, n_vecs,
                 t2t_params):
        super(TemporalTransformer, self).__init__()
        self.attention = TMHSA(token_size=token_size, group_size=t_groupSize, d_model=frame_hidden, head=num_heads,
                               p=dropout)
        self.ffn = FusionFeedForward(frame_hidden, mlp_ratio, n_vecs, t2t_params, p=dropout)
        self.norm1 = nn.LayerNorm(frame_hidden)
        self.norm2 = nn.LayerNorm(frame_hidden)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, t, h, w, output_size):
        token_size = h * w
        s = self.norm1(x)
        x = x + self.dropout(self.attention(s, t, h, w))
        y = self.norm2(x)
        x = x + self.ffn(y, token_size, output_size[0], output_size[1])
        return x


class SpatialTransformer(nn.Module):
    def __init__(self, token_size, frame_hidden, flow_hidden, num_heads, s_windowSize, g_downSize, mlp_ratio,
                 dropout, n_vecs, t2t_params):
        super(SpatialTransformer, self).__init__()
        self.attention = SWMHSA_depthGlobalWindowConcatLN_qkFlow_reweightFlow(token_size=token_size,
                                                                              window_size=s_windowSize,
                                                                              kernel_size=g_downSize,
                                                                              d_model=frame_hidden,
                                                                              flow_dModel=flow_hidden,
                                                                              head=num_heads, p=dropout)
        self.ffn = FusionFeedForward(frame_hidden, mlp_ratio, n_vecs, t2t_params, p=dropout)
        self.norm = nn.LayerNorm(frame_hidden)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, f, t, h, w, output_size):
        token_size = h * w
        x = x + self.dropout(self.attention(x, f, t, h, w))
        y = self.norm(x)
        x = x + self.ffn(y, token_size, output_size[0], output_size[1])
        return x


class TransformerBlock(nn.Module):
    def __init__(self, token_size, frame_hidden, flow_hidden, num_heads, t_groupSize, s_windowSize, g_downSize,
                 mlp_ratio, dropout, n_vecs, t2t_params):
        super(TransformerBlock, self).__init__()
        self.t_transformer = TemporalTransformer(token_size=token_size, frame_hidden=frame_hidden, num_heads=num_heads,
                                                 t_groupSize=t_groupSize, mlp_ratio=mlp_ratio,
                                                 dropout=dropout, n_vecs=n_vecs,
                                                 t2t_params=t2t_params)  # temporal multi-head self attention
        self.s_transformer = SpatialTransformer(token_size=token_size, frame_hidden=frame_hidden,
                                                flow_hidden=flow_hidden, num_heads=num_heads, s_windowSize=s_windowSize,
                                                g_downSize=g_downSize, mlp_ratio=mlp_ratio,
                                                dropout=dropout, n_vecs=n_vecs, t2t_params=t2t_params)

    def forward(self, inputs):
        x, f, t = inputs['x'], inputs['f'], inputs['t']
        h, w = inputs['h'], inputs['w']
        output_size = inputs['output_size']
        x = self.t_transformer(x, t, h, w, output_size)
        x = self.s_transformer(x, f, t, h, w, output_size)
        return {'x': x, 'f': f, 't': t, 'h': h, 'w': w, 'output_size': output_size}


class Decoder(BaseNetwork):
    def __init__(self, conv_type, in_channels, out_channels, use_bias, norm=None):
        super(Decoder, self).__init__(conv_type)
        self.layer1 = self.DeconvBlock(in_channels, in_channels, kernel_size=3, padding=1, norm=norm,
                                       bias=use_bias)
        self.layer2 = self.ConvBlock(in_channels, in_channels // 2, kernel_size=3, stride=1, padding=1, norm=norm,
                                     bias=use_bias)
        self.layer3 = self.DeconvBlock(in_channels // 2, in_channels // 2, kernel_size=3, padding=1, norm=norm,
                                       bias=use_bias)
        self.final = self.ConvBlock(in_channels // 2, out_channels, kernel_size=3, stride=1, padding=1, norm=norm,
                                    bias=use_bias, activation=None)

    def forward(self, features):
        feat1 = self.layer1(features)
        feat2 = self.layer2(feat1)
        feat3 = self.layer3(feat2)
        output = self.final(feat3)
        return output


class FGT(BaseNetwork):
    def __init__(self, t_groupSize, s_windowSize, g_downSize, input_resolution, in_channels, cnum, flow_inChannel,
                 flow_cnum, frame_hidden, flow_hidden, passmask, numBlocks, kernel_size, stride, padding, num_heads,
                 conv_type, norm, use_bias, ape, mlp_ratio=4, drop=0, init_weights=True):
        super(FGT, self).__init__(conv_type)
        self.in_channels = in_channels
        self.passmask = passmask
        self.ape = ape
        # the 'endoder' spelling is kept as released so the pretrained state-dict keys still match
        self.frame_endoder = Encoder(in_channels)
        self.flow_encoder = nn.Sequential(
            nn.ReplicationPad2d(2),
            self.ConvBlock(flow_inChannel, flow_cnum, kernel_size=5, stride=1, padding=0, bias=use_bias, norm=norm),
            self.ConvBlock(flow_cnum, flow_cnum * 2, kernel_size=3, stride=2, padding=1, bias=use_bias, norm=norm),
            self.ConvBlock(flow_cnum * 2, flow_cnum * 2, kernel_size=3, stride=1, padding=1, bias=use_bias, norm=norm),
            self.ConvBlock(flow_cnum * 2, flow_cnum * 2, kernel_size=3, stride=2, padding=1, bias=use_bias, norm=norm)
        )
        # patch to vector operation
        self.patch2vec = nn.Conv2d(cnum * 2, frame_hidden, kernel_size=kernel_size, stride=stride, padding=padding)
        self.f_patch2vec = nn.Conv2d(flow_cnum * 2, flow_hidden, kernel_size=kernel_size, stride=stride,
                                     padding=padding)
        # initialize transformer blocks for frame completion
        n_vecs = 1
        token_size = []
        output_shape = (input_resolution[0] // 4, input_resolution[1] // 4)
        for i, d in enumerate(kernel_size):
            token_nums = int((output_shape[i] + 2 * padding[i] - kernel_size[i]) / stride[i] + 1)
            n_vecs *= token_nums
            token_size.append(token_nums)
        # add positional embedding to the encoded features
        if self.ape:
            self.add_pos_emb = AddPosEmb(token_size[0], token_size[1], frame_hidden, frame_hidden)
        self.token_size = token_size
        # initialize transformer blocks
        blocks = []
        t2t_params = {'kernel_size': kernel_size, 'stride': stride, 'padding': padding, 'output_size': output_shape}
        for i in range(numBlocks // 2 - 1):
            layer = TransformerBlock(token_size, frame_hidden, flow_hidden, num_heads, t_groupSize, s_windowSize,
                                     g_downSize, mlp_ratio, drop, n_vecs, t2t_params)
            blocks.append(layer)
        self.first_t_transformer = TemporalTransformer(token_size, frame_hidden, num_heads, t_groupSize, mlp_ratio,
                                                       drop, n_vecs, t2t_params)
        self.first_s_transformer = SpatialTransformer(token_size, frame_hidden, flow_hidden, num_heads, s_windowSize,
                                                      g_downSize, mlp_ratio, drop, n_vecs, t2t_params)
        self.transformer = nn.Sequential(*blocks)
        # vector to patch operation
        self.vec2patch = Vec2Patch(cnum * 2, frame_hidden, output_shape, kernel_size, stride, padding)
        # decoder
        self.decoder = Decoder(conv_type, cnum * 2, 3, use_bias, norm)

        if init_weights:
            self.init_weights()

    def forward(self, masked_frames, flows, masks):
        b, t, c, h, w = masked_frames.shape
        cf = flows.shape[2]
        output_shape = (h // 4, w // 4)
        if self.passmask:
            inputs = torch.cat((masked_frames, masks), dim=2)
        else:
            inputs = masked_frames
        inputs = inputs.view(b * t, self.in_channels, h, w)
        flows = flows.view(b * t, cf, h, w)
        enc_feats = self.frame_endoder(inputs)
        flow_feats = self.flow_encoder(flows)
        trans_feat = self.patch2vec(enc_feats)
        flow_patches = self.f_patch2vec(flow_feats)
        _, c, h, w = trans_feat.shape
        cf = flow_patches.shape[1]
        if h != self.token_size[0] or w != self.token_size[1]:
            new_h, new_w = h, w
        else:
            new_h, new_w = 0, 0
            output_shape = (0, 0)
        trans_feat = trans_feat.view(b * t, c, -1).permute(0, 2, 1)
        flow_patches = flow_patches.view(b * t, cf, -1).permute(0, 2, 1)
        trans_feat = self.first_t_transformer(trans_feat, t, new_h, new_w, output_shape)
        trans_feat = self.add_pos_emb(trans_feat, new_h, new_w)
        trans_feat = self.first_s_transformer(trans_feat, flow_patches, t, new_h, new_w, output_shape)
        inputs_trans_feat = {'x': trans_feat, 'f': flow_patches, 't': t, 'h': new_h, 'w': new_w,
                             'output_size': output_shape}
        trans_feat = self.transformer(inputs_trans_feat)['x']
        trans_feat = self.vec2patch(trans_feat, output_shape[0], output_shape[1])
        enc_feats = enc_feats + trans_feat

        output = self.decoder(enc_feats)
        output = torch.tanh(output)
        return output
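Hedged wiring sketch (not part of the repo): the CLI in inputs.py exposes scalar kernel/stride/pad values, which the training code presumably packs into (h, w) pairs before building Model; every value below is illustrative, and norm=None stands in for however the conv blocks interpret the CLI's 'None' string.

cfg = {'tw': 2, 'sw': 8, 'gd': 4, 'input_resolution': (240, 432), 'in_channel': 4,
       'cnum': 64, 'flow_inChannel': 2, 'flow_cnum': 64, 'frame_hidden': 512,
       'flow_hidden': 256, 'PASSMASK': 1, 'numBlocks': 8, 'kernel_size': (7, 7),
       'stride': (3, 3), 'padding': (3, 3), 'num_head': 4, 'conv_type': 'vanilla',
       'norm': None, 'use_bias': 1, 'ape': 1, 'mlp_ratio': 40, 'drop': 0, 'init_weights': 1}
model = Model(cfg)
frames = torch.randn(1, 5, 3, 240, 432)  # masked frames in [-1, 1]
flows = torch.randn(1, 5, 2, 240, 432)   # completed optical flows
masks = torch.ones(1, 5, 1, 240, 432)
out = model(frames, flows, masks)        # [5, 3, 240, 432] after tanh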
FGT_codes/FGT/models/temporal_patch_gan.py
ADDED
@@ -0,0 +1,76 @@
# temporal patch GAN to maintain the temporal consistency of the flows
import torch
import torch.nn as nn
from .BaseNetwork import BaseNetwork


class Discriminator(BaseNetwork):
    def __init__(self, in_channels, conv_type, dist_cnum, use_sigmoid=False, use_spectral_norm=True,
                 init_weights=True):
        """

        Args:
            in_channels: The input channels of the discriminator
            use_sigmoid: Whether to use sigmoid for the base network (true for the nsgan)
            use_spectral_norm: The usage of the spectral norm: should always be true for the stability of the GAN
            init_weights: should always be True
        """
        super(Discriminator, self).__init__(conv_type)
        self.use_sigmoid = use_sigmoid
        nf = dist_cnum

        self.conv = nn.Sequential(
            spectral_norm(
                nn.Conv3d(in_channels=in_channels, out_channels=nf * 1, kernel_size=(3, 5, 5), stride=(1, 2, 2),
                          padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(
                nn.Conv3d(in_channels=nf * 1, out_channels=nf * 2, kernel_size=(3, 5, 5), stride=(1, 2, 2),
                          padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(
                nn.Conv3d(in_channels=nf * 2, out_channels=nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
                          padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(
                nn.Conv3d(in_channels=nf * 4, out_channels=nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
                          padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
            nn.LeakyReLU(0.2, inplace=True),
            spectral_norm(
                nn.Conv3d(in_channels=nf * 4, out_channels=nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
                          padding=(1, 2, 2), bias=not use_spectral_norm), use_spectral_norm),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv3d(in_channels=nf * 4, out_channels=nf * 4, kernel_size=(3, 5, 5), stride=(1, 2, 2),
                      padding=(1, 2, 2))
        )

        if init_weights:
            self.init_weights()

    def forward(self, xs, t):
        """

        Args:
            xs: Input features with shape [bt, c, h, w]
            t: The temporal length used to fold the batch back into [b, c, t, h, w] clips

        Returns: The discriminative map from the GAN

        """
        bt, c, h, w = xs.shape
        b = bt // t
        xs = xs.view(b, t, c, h, w).permute(0, 2, 1, 3, 4).contiguous()
        feat = self.conv(xs)
        if self.use_sigmoid:
            feat = torch.sigmoid(feat)
        out = torch.transpose(feat, 1, 2)  # [b, t, c, h, w]
        return out


def spectral_norm(module, mode=True):
    if mode:
        return nn.utils.spectral_norm(module)
    return module
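Hedged usage sketch (not part of the repo); the output shape follows directly from the six (1, 2, 2)-strided 3D convolutions above.

disc = Discriminator(in_channels=3, conv_type='vanilla', dist_cnum=32)
frames = torch.randn(10, 3, 240, 432)  # [b*t, c, h, w] with t = 5
score_map = disc(frames, t=5)          # [2, 5, 128, 4, 7] patch-level scores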
FGT_codes/FGT/models/transformer_base/__init__.py
ADDED
File without changes
|
FGT_codes/FGT/models/transformer_base/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (180 Bytes)
FGT_codes/FGT/models/transformer_base/__pycache__/attention_base.cpython-39.pyc
ADDED
Binary file (4.1 kB)
FGT_codes/FGT/models/transformer_base/__pycache__/attention_flow.cpython-39.pyc
ADDED
Binary file (5.51 kB)
FGT_codes/FGT/models/transformer_base/__pycache__/ffn_base.cpython-39.pyc
ADDED
Binary file (4.11 kB)
FGT_codes/FGT/models/transformer_base/attention_base.py
ADDED
@@ -0,0 +1,106 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention'
    """

    def __init__(self, p=0.1):
        super(Attention, self).__init__()
        self.dropout = nn.Dropout(p=p)

    def forward(self, query, key, value):
        scores = torch.matmul(query, key.transpose(-2, -1)
                              ) / math.sqrt(query.size(-1))
        p_attn = F.softmax(scores, dim=-1)
        p_attn = self.dropout(p_attn)
        p_val = torch.matmul(p_attn, value)
        return p_val, p_attn


class TMHSA(nn.Module):
    def __init__(self, token_size, group_size, d_model, head, p=0.1):
        super(TMHSA, self).__init__()
        self.h, self.w = token_size
        self.group_size = group_size  # the number of groups the token grid is divided into along each axis
        self.wh, self.ww = math.ceil(self.h / self.group_size), math.ceil(self.w / self.group_size)
        self.pad_r = (self.ww - self.w % self.ww) % self.ww
        self.pad_b = (self.wh - self.h % self.wh) % self.wh
        self.new_h, self.new_w = self.h + self.pad_b, self.w + self.pad_r  # pad only on the right and bottom sides; leaving the other sides unpadded is easier to implement
        self.window_h, self.window_w = self.new_h // self.group_size, self.new_w // self.group_size  # here "group" denotes the window size while "window" denotes the group size (opposite to the spatial transformer's definitions)
        self.d_model = d_model
        self.p = p
        self.query_embedding = nn.Linear(d_model, d_model)
        self.key_embedding = nn.Linear(d_model, d_model)
        self.value_embedding = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention(p=p)
        self.head = head

    def inference(self, x, t, h, w):
        # calculate the attention related parameters
        wh, ww = math.ceil(h / self.group_size), math.ceil(w / self.group_size)
        pad_r = (ww - w % ww) % ww
        pad_b = (wh - h % wh) % wh
        new_h, new_w = h + pad_b, w + pad_r
        window_h, window_w = new_h // self.group_size, new_w // self.group_size
        bt, n, c = x.shape
        b = bt // t
        c_h = c // self.head
        x = x.view(bt, h, w, c)
        if pad_r > 0 or pad_b > 0:
            x = F.pad(x,
                      (0, 0, 0, pad_r, 0, pad_b))  # channel, channel, left, right, top, bottom -> [bt, new_h, new_w, c]
        query = self.query_embedding(x)
        key = self.key_embedding(x)
        value = self.value_embedding(x)
        query = query.view(b, t, self.group_size, window_h, self.group_size, window_w, self.head, c_h)
        query = query.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, self.group_size * self.group_size, self.head, -1, c_h)
        key = key.view(b, t, self.group_size, window_h, self.group_size, window_w, self.head, c_h)
        key = key.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, self.group_size * self.group_size, self.head, -1, c_h)
        value = value.view(b, t, self.group_size, window_h, self.group_size, window_w, self.head, c_h)
        value = value.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, self.group_size * self.group_size, self.head, -1, c_h)
        att, _ = self.attention(query, key, value)
        att = att.view(b, self.group_size, self.group_size, self.head, t, window_h, window_w, c_h)
        att = att.permute(0, 4, 1, 5, 2, 6, 3, 7).contiguous().view(bt, new_h, new_w, c)
        if pad_b > 0 or pad_r > 0:
            att = att[:, :h, :w, :]
        att = att.reshape(bt, n, c)
        output = self.output_linear(att)
        return output

    def forward(self, x, t, h=0, w=0):
        bt, n, c = x.shape
        if h == 0 and w == 0:
            assert n == self.h * self.w, 'Wrong input shape: {} with token: h->{}, w->{}'.format(x.shape, self.h,
                                                                                                 self.w)
        else:
            assert n == h * w, 'Wrong input shape: {} with token: h->{}, w->{}'.format(x.shape, h, w)
            return self.inference(x, t, h, w)
        b = bt // t
        c_h = c // self.head
        x = x.view(bt, self.h, self.w, c)
        if self.pad_r > 0 or self.pad_b > 0:
            x = F.pad(x, (
                0, 0, 0, self.pad_r, 0, self.pad_b))  # channel, channel, left, right, top, bottom -> [bt, new_h, new_w, c]
        query = self.query_embedding(x)
        key = self.key_embedding(x)
        value = self.value_embedding(x)
        query = query.view(b, t, self.group_size, self.window_h, self.group_size, self.window_w, self.head, c_h)
        query = query.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, self.group_size * self.group_size, self.head, -1, c_h)
        key = key.view(b, t, self.group_size, self.window_h, self.group_size, self.window_w, self.head, c_h)
        key = key.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, self.group_size * self.group_size, self.head, -1, c_h)
        value = value.view(b, t, self.group_size, self.window_h, self.group_size, self.window_w, self.head, c_h)
        value = value.permute(0, 2, 4, 6, 1, 3, 5, 7).reshape(b, self.group_size * self.group_size, self.head, -1, c_h)
        att, _ = self.attention(query, key, value)
        att = att.view(b, self.group_size, self.group_size, self.head, t, self.window_h, self.window_w, c_h)
        att = att.permute(0, 4, 1, 5, 2, 6, 3, 7).contiguous().view(bt, self.new_h, self.new_w, c)
        if self.pad_b > 0 or self.pad_r > 0:
            att = att[:, :self.h, :self.w, :]
        att = att.reshape(bt, n, c)
        output = self.output_linear(att)
        return output
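Usage sketch for the temporal attention (not part of the repo): with group_size 2 the grid is split into 2x2 spatial groups, and the tokens inside each group attend jointly across all t frames.

tmhsa = TMHSA(token_size=(20, 36), group_size=2, d_model=512, head=4)
x = torch.randn(5, 20 * 36, 512)  # [b*t, n, c] with b = 1, t = 5
y = tmhsa(x, t=5)                 # [5, 720, 512]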
FGT_codes/FGT/models/transformer_base/attention_flow.py
ADDED
@@ -0,0 +1,171 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F


class Attention(nn.Module):
    """
    Compute 'Scaled Dot Product Attention'
    """

    def __init__(self, p=0.1):
        super(Attention, self).__init__()
        self.dropout = nn.Dropout(p=p)

    def forward(self, query, key, value):
        scores = torch.matmul(query, key.transpose(-2, -1)
                              ) / math.sqrt(query.size(-1))
        p_attn = F.softmax(scores, dim=-1)
        p_attn = self.dropout(p_attn)
        p_val = torch.matmul(p_attn, value)
        return p_val, p_attn


class SWMHSA_depthGlobalWindowConcatLN_qkFlow_reweightFlow(nn.Module):
    def __init__(self, token_size, window_size, kernel_size, d_model, flow_dModel, head, p=0.1):
        super(SWMHSA_depthGlobalWindowConcatLN_qkFlow_reweightFlow, self).__init__()
        self.h, self.w = token_size
        self.head = head
        self.window_size = window_size
        self.d_model = d_model
        self.flow_dModel = flow_dModel
        in_channels = d_model + flow_dModel
        self.query_embedding = nn.Linear(in_channels, d_model)
        self.key_embedding = nn.Linear(in_channels, d_model)
        self.value_embedding = nn.Linear(d_model, d_model)
        self.output_linear = nn.Linear(d_model, d_model)
        self.attention = Attention(p)
        self.pad_l = self.pad_t = 0
        self.pad_r = (self.window_size - self.w % self.window_size) % self.window_size
        self.pad_b = (self.window_size - self.h % self.window_size) % self.window_size
        self.new_h, self.new_w = self.h + self.pad_b, self.w + self.pad_r
        self.group_h, self.group_w = self.new_h // self.window_size, self.new_w // self.window_size
        self.global_extract_v = nn.Conv2d(d_model, d_model, kernel_size=kernel_size, stride=kernel_size, padding=0,
                                          groups=d_model)
        self.global_extract_k = nn.Conv2d(in_channels, in_channels, kernel_size=kernel_size, stride=kernel_size,
                                          padding=0, groups=in_channels)
        self.q_norm = nn.LayerNorm(d_model + flow_dModel)
        self.k_norm = nn.LayerNorm(d_model + flow_dModel)
        self.v_norm = nn.LayerNorm(d_model)
        self.reweightFlow = nn.Sequential(
            nn.Linear(in_channels, flow_dModel),
            nn.Sigmoid()
        )

    def inference(self, x, f, h, w):
        pad_r = (self.window_size - w % self.window_size) % self.window_size
        pad_b = (self.window_size - h % self.window_size) % self.window_size
        new_h, new_w = h + pad_b, w + pad_r
        group_h, group_w = new_h // self.window_size, new_w // self.window_size
        bt, n, c = x.shape
        cf = f.shape[2]
        x = x.view(bt, h, w, c)
        f = f.view(bt, h, w, cf)
        if pad_r > 0 or pad_b > 0:
            x = F.pad(x, (0, 0, self.pad_l, pad_r, self.pad_t, pad_b))
            f = F.pad(f, (0, 0, self.pad_l, pad_r, self.pad_t, pad_b))
        y = x.permute(0, 3, 1, 2)
        xf = torch.cat((x, f), dim=-1)
        flow_weights = self.reweightFlow(xf)
        f = f * flow_weights
        qk = torch.cat((x, f), dim=-1)  # [b, h, w, c]
        qk_c = qk.shape[-1]
        # generate q
        q = qk.reshape(bt, group_h, self.window_size, group_w, self.window_size, qk_c).transpose(2, 3)
        q = q.reshape(bt, group_h * group_w, self.window_size * self.window_size, qk_c)
        # generate k
        ky = qk.permute(0, 3, 1, 2)  # [b, c, h, w]
        k_global = self.global_extract_k(ky)
        k_global = k_global.permute(0, 2, 3, 1).reshape(bt, -1, qk_c).unsqueeze(1).repeat(1, group_h * group_w, 1, 1)
        k = torch.cat((q, k_global), dim=2)
        # norm q and k
        q = self.q_norm(q)
        k = self.k_norm(k)
        # generate v
        global_tokens = self.global_extract_v(y)  # [bt, c, h', w']
        global_tokens = global_tokens.permute(0, 2, 3, 1).reshape(bt, -1, c).unsqueeze(1).repeat(
            1, group_h * group_w, 1, 1)  # [bt, gh * gw, h'*w', c]
        x = x.reshape(bt, group_h, self.window_size, group_w, self.window_size, c).transpose(2, 3)  # [bt, gh, gw, ws, ws, c]
        x = x.reshape(bt, group_h * group_w, self.window_size * self.window_size, c)  # [bt, gh * gw, ws^2, c]
        v = torch.cat((x, global_tokens), dim=2)
        v = self.v_norm(v)
        query = self.query_embedding(q)  # [bt, gh * gw, ws^2, d_model]
        key = self.key_embedding(k)
        value = self.value_embedding(v)
        query = query.reshape(bt, group_h * group_w, self.window_size * self.window_size, self.head,
                              c // self.head).permute(0, 1, 3, 2, 4)
        key = key.reshape(bt, group_h * group_w, -1, self.head,
                          c // self.head).permute(0, 1, 3, 2, 4)
        value = value.reshape(bt, group_h * group_w, -1, self.head,
                              c // self.head).permute(0, 1, 3, 2, 4)
        attn, _ = self.attention(query, key, value)
        x = attn.transpose(2, 3).reshape(bt, group_h, group_w, self.window_size, self.window_size, c)
        x = x.transpose(2, 3).reshape(bt, group_h * self.window_size, group_w * self.window_size, c)
        if pad_r > 0 or pad_b > 0:
            x = x[:, :h, :w, :].contiguous()
        x = x.reshape(bt, n, c)
        output = self.output_linear(x)
        return output

    def forward(self, x, f, t, h=0, w=0):
        if h != 0 or w != 0:
            return self.inference(x, f, h, w)
        bt, n, c = x.shape
        cf = f.shape[2]
        x = x.view(bt, self.h, self.w, c)
        f = f.view(bt, self.h, self.w, cf)
        if self.pad_r > 0 or self.pad_b > 0:
            x = F.pad(x, (0, 0, self.pad_l, self.pad_r, self.pad_t, self.pad_b))
            f = F.pad(f, (0, 0, self.pad_l, self.pad_r, self.pad_t, self.pad_b))  # [bt, new_h, new_w, cf]
        y = x.permute(0, 3, 1, 2)
        xf = torch.cat((x, f), dim=-1)
        weights = self.reweightFlow(xf)
        f = f * weights
        qk = torch.cat((x, f), dim=-1)  # [b, h, w, c]
        qk_c = qk.shape[-1]
        # generate q
        q = qk.reshape(bt, self.group_h, self.window_size, self.group_w, self.window_size, qk_c).transpose(2, 3)
        q = q.reshape(bt, self.group_h * self.group_w, self.window_size * self.window_size, qk_c)
        # generate k
        ky = qk.permute(0, 3, 1, 2)  # [b, c, h, w]
        k_global = self.global_extract_k(ky)  # [b, qk_c, h', w']
        k_global = k_global.permute(0, 2, 3, 1).reshape(bt, -1, qk_c).unsqueeze(1).repeat(
            1, self.group_h * self.group_w, 1, 1)
        k = torch.cat((q, k_global), dim=2)
        # norm q and k
        q = self.q_norm(q)
        k = self.k_norm(k)
        # generate v
        global_tokens = self.global_extract_v(y)  # [bt, c, h', w']
        global_tokens = global_tokens.permute(0, 2, 3, 1).reshape(bt, -1, c).unsqueeze(1).repeat(
            1, self.group_h * self.group_w, 1, 1)  # [bt, gh * gw, h'*w', c]
        x = x.reshape(bt, self.group_h, self.window_size, self.group_w, self.window_size, c).transpose(2, 3)  # [bt, gh, gw, ws, ws, c]
        x = x.reshape(bt, self.group_h * self.group_w, self.window_size * self.window_size, c)  # [bt, gh * gw, ws^2, c]
        v = torch.cat((x, global_tokens), dim=2)
        v = self.v_norm(v)
        query = self.query_embedding(q)  # [bt, gh * gw, ws^2, d_model]
        key = self.key_embedding(k)
        value = self.value_embedding(v)
        query = query.reshape(bt, self.group_h * self.group_w, self.window_size * self.window_size, self.head,
                              c // self.head).permute(0, 1, 3, 2, 4)
        key = key.reshape(bt, self.group_h * self.group_w, -1, self.head,
                          c // self.head).permute(0, 1, 3, 2, 4)
        value = value.reshape(bt, self.group_h * self.group_w, -1, self.head,
                              c // self.head).permute(0, 1, 3, 2, 4)
        attn, _ = self.attention(query, key, value)
        x = attn.transpose(2, 3).reshape(bt, self.group_h, self.group_w, self.window_size, self.window_size, c)
        x = x.transpose(2, 3).reshape(bt, self.group_h * self.window_size, self.group_w * self.window_size, c)
        if self.pad_r > 0 or self.pad_b > 0:
            x = x[:, :self.h, :self.w, :].contiguous()
        x = x.reshape(bt, n, c)
        output = self.output_linear(x)
        return output
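Usage sketch for the flow-guided window attention (not part of the repo); token_size matches the 20x36 grid derived earlier, and kernel_size here is the global-token downsample rate (--gd).

swmhsa = SWMHSA_depthGlobalWindowConcatLN_qkFlow_reweightFlow(
    token_size=(20, 36), window_size=8, kernel_size=4, d_model=512, flow_dModel=256, head=4)
x = torch.randn(5, 720, 512)  # frame tokens
f = torch.randn(5, 720, 256)  # flow tokens
y = swmhsa(x, f, t=5)         # [5, 720, 512]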
FGT_codes/FGT/models/transformer_base/ffn_base.py
ADDED
@@ -0,0 +1,114 @@
import torch
from functools import reduce
import torch.nn as nn
import torch.nn.functional as F
from functools import partial


class FeedForward(nn.Module):
    def __init__(self, frame_hidden, mlp_ratio, n_vecs, t2t_params, p):
        """

        Args:
            frame_hidden: hidden size of frame features
            mlp_ratio: mlp ratio in the middle layer of the transformers
            n_vecs: number of vectors in the transformer
            t2t_params: dictionary -> {'kernel_size': kernel_size, 'stride': stride, 'padding': padding, 'output_size': output_shape}
            p: dropout rate, 0 by default
        """
        super(FeedForward, self).__init__()
        self.conv = nn.Sequential(
            nn.Linear(frame_hidden, frame_hidden * mlp_ratio),
            nn.ReLU(inplace=True),
            nn.Dropout(p),
            nn.Linear(frame_hidden * mlp_ratio, frame_hidden),
            nn.Dropout(p)
        )

    def forward(self, x, n_vecs=0, output_h=0, output_w=0):
        x = self.conv(x)
        return x


class FusionFeedForward(nn.Module):
    def __init__(self, frame_hidden, mlp_ratio, n_vecs, t2t_params, p):
        super(FusionFeedForward, self).__init__()
        self.kernel_shape = reduce((lambda x, y: x * y), t2t_params['kernel_size'])
        self.t2t_params = t2t_params
        hidden_size = self.kernel_shape * mlp_ratio
        self.conv1 = nn.Linear(frame_hidden, hidden_size)
        self.conv2 = nn.Sequential(
            nn.ReLU(inplace=True),
            nn.Dropout(p),
            nn.Linear(hidden_size, frame_hidden),
            nn.Dropout(p)
        )
        assert t2t_params is not None and n_vecs is not None
        tp = t2t_params.copy()
        self.fold = nn.Fold(**tp)
        del tp['output_size']
        self.unfold = nn.Unfold(**tp)
        self.n_vecs = n_vecs

    def forward(self, x, n_vecs=0, output_h=0, output_w=0):
        x = self.conv1(x)
        b, n, c = x.size()
        if n_vecs != 0:
            # dynamic resolution during inference: fold/unfold with the given output size
            # instead of the cached modules built for the training resolution
            normalizer = x.new_ones(b, n, self.kernel_shape).view(-1, n_vecs, self.kernel_shape).permute(0, 2, 1)
            folded = F.fold(x.view(-1, n_vecs, c).permute(0, 2, 1), output_size=(output_h, output_w),
                            kernel_size=self.t2t_params['kernel_size'], stride=self.t2t_params['stride'],
                            padding=self.t2t_params['padding'])
            norm_map = F.fold(normalizer, output_size=(output_h, output_w),
                              kernel_size=self.t2t_params['kernel_size'], stride=self.t2t_params['stride'],
                              padding=self.t2t_params['padding'])
            x = self.unfold(folded / norm_map).permute(0, 2, 1).contiguous().view(b, n, c)
        else:
            normalizer = x.new_ones(b, n, self.kernel_shape).view(-1, self.n_vecs, self.kernel_shape).permute(0, 2, 1)
            x = self.unfold(self.fold(x.view(-1, self.n_vecs, c).permute(0, 2, 1)) / self.fold(normalizer)
                            ).permute(0, 2, 1).contiguous().view(b, n, c)
        x = self.conv2(x)
        return x


class ResidualBlock_noBN(nn.Module):
    """Residual block w/o BN
    ---Conv-ReLU-Conv-+-
     |________________|
    """

    def __init__(self, nf=64):
        super(ResidualBlock_noBN, self).__init__()
        self.conv1 = nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1, bias=True)
        self.conv2 = nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1, bias=True)
        self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True)

    def forward(self, x):
        """

        Args:
            x: features with shape [b, c, h, w]

        Returns: processed features with shape [b, c, h, w]

        """
        identity = x
        out = self.lrelu(self.conv1(x))
        out = self.conv2(out)
        out = identity + out
        # Remove ReLU at the end of the residual block
        # http://torch.ch/blog/2016/02/04/resnets.html
        return out


def make_layer(block, n_layers):
    layers = []
    for _ in range(n_layers):
        layers.append(block())
    return nn.Sequential(*layers)
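Usage sketch (not part of the repo): FusionFeedForward folds the tokens back onto the 60x108 feature plane, divides by the overlap count, and unfolds again, so overlapping patches are averaged rather than summed.

t2t = {'kernel_size': (7, 7), 'stride': (3, 3), 'padding': (3, 3), 'output_size': (60, 108)}
ffn = FusionFeedForward(frame_hidden=512, mlp_ratio=40, n_vecs=720, t2t_params=t2t, p=0.)
x = torch.randn(5, 720, 512)
y = ffn(x)  # [5, 720, 512]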
FGT_codes/FGT/models/utils/RAFT/utils/__init__.py
ADDED
File without changes
|
FGT_codes/FGT/models/utils/RAFT/utils/utils.py
ADDED
@@ -0,0 +1,82 @@
import torch
import torch.nn.functional as F
import numpy as np
from scipy import interpolate


class InputPadder:
    """ Pads images such that dimensions are divisible by 8 """

    def __init__(self, dims, mode='sintel'):
        self.ht, self.wd = dims[-2:]
        pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8
        pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8
        if mode == 'sintel':
            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, pad_ht - pad_ht // 2]
        else:
            self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht]

    def pad(self, *inputs):
        return [F.pad(x, self._pad, mode='replicate') for x in inputs]

    def unpad(self, x):
        ht, wd = x.shape[-2:]
        c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]]
        return x[..., c[0]:c[1], c[2]:c[3]]


def forward_interpolate(flow):
    flow = flow.detach().cpu().numpy()
    dx, dy = flow[0], flow[1]

    ht, wd = dx.shape
    x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht))

    x1 = x0 + dx
    y1 = y0 + dy

    x1 = x1.reshape(-1)
    y1 = y1.reshape(-1)
    dx = dx.reshape(-1)
    dy = dy.reshape(-1)

    valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht)
    x1 = x1[valid]
    y1 = y1[valid]
    dx = dx[valid]
    dy = dy[valid]

    flow_x = interpolate.griddata(
        (x1, y1), dx, (x0, y0), method='nearest', fill_value=0)

    flow_y = interpolate.griddata(
        (x1, y1), dy, (x0, y0), method='nearest', fill_value=0)

    flow = np.stack([flow_x, flow_y], axis=0)
    return torch.from_numpy(flow).float()


def bilinear_sampler(img, coords, mode='bilinear', mask=False):
    """ Wrapper for grid_sample, uses pixel coordinates """
    H, W = img.shape[-2:]
    xgrid, ygrid = coords.split([1, 1], dim=-1)
    xgrid = 2 * xgrid / (W - 1) - 1
    ygrid = 2 * ygrid / (H - 1) - 1

    grid = torch.cat([xgrid, ygrid], dim=-1)
    img = F.grid_sample(img, grid, align_corners=True)

    if mask:
        mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1)
        return img, mask.float()

    return img


def coords_grid(batch, ht, wd):
    coords = torch.meshgrid(torch.arange(ht), torch.arange(wd))
    coords = torch.stack(coords[::-1], dim=0).float()
    return coords[None].repeat(batch, 1, 1, 1)


def upflow8(flow, mode='bilinear'):
    new_size = (8 * flow.shape[2], 8 * flow.shape[3])
    return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True)
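Usage sketch for InputPadder (not part of the repo): a 100x200 input gets 2 rows of padding on top and bottom in 'sintel' mode, and unpad restores the original extent.

padder = InputPadder((1, 3, 100, 200))
img = torch.randn(1, 3, 100, 200)
padded, = padder.pad(img)        # [1, 3, 104, 200]
restored = padder.unpad(padded)  # [1, 3, 100, 200]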
FGT_codes/FGT/models/utils/__init__.py
ADDED
File without changes
|
FGT_codes/FGT/models/utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (169 Bytes)
FGT_codes/FGT/models/utils/__pycache__/network_blocks_2d.cpython-39.pyc
ADDED
Binary file (5.41 kB)