akhaliq committed
Commit 0870534
1 Parent(s): 1c296ac
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Intelligent Systems Lab Org
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
additional_utils/encoding_models.py ADDED
@@ -0,0 +1,164 @@
1
+ ###########################################################################
2
+ # Referred to: https://github.com/zhanghang1989/PyTorch-Encoding
3
+ ###########################################################################
4
+ import math
5
+ import numpy as np
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.nn.parallel.data_parallel import DataParallel
11
+ from torch.nn.parallel.scatter_gather import scatter
12
+ import threading
13
+ import torch
14
+ from torch.cuda._utils import _get_device_index
15
+ from torch.cuda.amp import autocast
16
+ from torch._utils import ExceptionWrapper
17
+
18
+ up_kwargs = {'mode': 'bilinear', 'align_corners': True}
19
+
20
+ __all__ = ['MultiEvalModule']
21
+
22
+ class MultiEvalModule(DataParallel):
23
+ """Multi-size Segmentation Eavluator"""
24
+ def __init__(self, module, nclass, device_ids=None, flip=True,
25
+ scales=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75]):
26
+ super(MultiEvalModule, self).__init__(module, device_ids)
27
+ self.nclass = nclass
28
+ self.base_size = module.base_size
29
+ self.crop_size = module.crop_size
30
+ self.scales = scales
31
+ self.flip = flip
32
+ print('MultiEvalModule: base_size {}, crop_size {}'. \
33
+ format(self.base_size, self.crop_size))
34
+
35
+ def parallel_forward(self, inputs, **kwargs):
36
+ """Multi-GPU Mult-size Evaluation
37
+
38
+ Args:
39
+ inputs: list of Tensors
40
+ """
41
+ inputs = [(input.unsqueeze(0).cuda(device),)
42
+ for input, device in zip(inputs, self.device_ids)]
43
+ replicas = self.replicate(self, self.device_ids[:len(inputs)])
44
+ kwargs = scatter(kwargs, self.device_ids[:len(inputs)], dim=0) if kwargs else []
45
+ if len(inputs) < len(kwargs):
46
+ inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
47
+ elif len(kwargs) < len(inputs):
48
+ kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
49
+ outputs = self.parallel_apply(replicas, inputs, kwargs)
50
+ #for out in outputs:
51
+ # print('out.size()', out.size())
52
+ return outputs
53
+
54
+ def forward(self, image):
55
+ """Mult-size Evaluation"""
56
+ # only single image is supported for evaluation
57
+ batch, _, h, w = image.size()
58
+ assert(batch == 1)
59
+ stride_rate = 2.0/3.0
60
+ crop_size = self.crop_size
61
+ stride = int(crop_size * stride_rate)
62
+ with torch.cuda.device_of(image):
63
+ scores = image.new().resize_(batch,self.nclass,h,w).zero_().cuda()
64
+
65
+ for scale in self.scales:
66
+ long_size = int(math.ceil(self.base_size * scale))
67
+ if h > w:
68
+ height = long_size
69
+ width = int(1.0 * w * long_size / h + 0.5)
70
+ short_size = width
71
+ else:
72
+ width = long_size
73
+ height = int(1.0 * h * long_size / w + 0.5)
74
+ short_size = height
75
+ """
76
+ short_size = int(math.ceil(self.base_size * scale))
77
+ if h > w:
78
+ width = short_size
79
+ height = int(1.0 * h * short_size / w)
80
+ long_size = height
81
+ else:
82
+ height = short_size
83
+ width = int(1.0 * w * short_size / h)
84
+ long_size = width
85
+ """
86
+ # resize image to current size
87
+ cur_img = resize_image(image, height, width, **self.module._up_kwargs)
88
+ if long_size <= crop_size:
89
+ pad_img = pad_image(cur_img, self.module.mean,
90
+ self.module.std, crop_size)
91
+ outputs = module_inference(self.module, pad_img, self.flip)
92
+ outputs = crop_image(outputs, 0, height, 0, width)
93
+ else:
94
+ if short_size < crop_size:
95
+ # pad if needed
96
+ pad_img = pad_image(cur_img, self.module.mean,
97
+ self.module.std, crop_size)
98
+ else:
99
+ pad_img = cur_img
100
+ _,_,ph,pw = pad_img.size()
101
+ assert(ph >= height and pw >= width)
102
+ # grid forward and normalize
103
+ h_grids = int(math.ceil(1.0 * (ph-crop_size)/stride)) + 1
104
+ w_grids = int(math.ceil(1.0 * (pw-crop_size)/stride)) + 1
105
+ with torch.cuda.device_of(image):
106
+ outputs = image.new().resize_(batch,self.nclass,ph,pw).zero_().cuda()
107
+ count_norm = image.new().resize_(batch,1,ph,pw).zero_().cuda()
108
+ # grid evaluation
109
+ for idh in range(h_grids):
110
+ for idw in range(w_grids):
111
+ h0 = idh * stride
112
+ w0 = idw * stride
113
+ h1 = min(h0 + crop_size, ph)
114
+ w1 = min(w0 + crop_size, pw)
115
+ crop_img = crop_image(pad_img, h0, h1, w0, w1)
116
+ # pad if needed
117
+ pad_crop_img = pad_image(crop_img, self.module.mean,
118
+ self.module.std, crop_size)
119
+ output = module_inference(self.module, pad_crop_img, self.flip)
120
+ outputs[:,:,h0:h1,w0:w1] += crop_image(output,
121
+ 0, h1-h0, 0, w1-w0)
122
+ count_norm[:,:,h0:h1,w0:w1] += 1
123
+ assert((count_norm==0).sum()==0)
124
+ outputs = outputs / count_norm
125
+ outputs = outputs[:,:,:height,:width]
126
+
127
+ score = resize_image(outputs, h, w, **self.module._up_kwargs)
128
+ scores += score
129
+
130
+ return scores
131
+
132
+
133
+ def module_inference(module, image, flip=True):
134
+ output = module.evaluate(image)
135
+ if flip:
136
+ fimg = flip_image(image)
137
+ foutput = module.evaluate(fimg)
138
+ output += flip_image(foutput)
139
+ return output
140
+
141
+ def resize_image(img, h, w, **up_kwargs):
142
+ return F.interpolate(img, (h, w), **up_kwargs)
143
+
144
+ def pad_image(img, mean, std, crop_size):
145
+ b,c,h,w = img.size()
146
+ assert(c==3)
147
+ padh = crop_size - h if h < crop_size else 0
148
+ padw = crop_size - w if w < crop_size else 0
149
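+ # a zero (black) pixel maps to -mean/std after (x - mean) / std normalization, so pad with that value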
+ pad_values = -np.array(mean) / np.array(std)
150
+ img_pad = img.new().resize_(b,c,h+padh,w+padw)
151
+ for i in range(c):
152
+ # note that PyTorch pad params are given in reverse order (last dimension first)
153
+ img_pad[:,i,:,:] = F.pad(img[:,i,:,:], (0, padw, 0, padh), value=pad_values[i])
154
+ assert(img_pad.size(2)>=crop_size and img_pad.size(3)>=crop_size)
155
+ return img_pad
156
+
157
+ def crop_image(img, h0, h1, w0, w1):
158
+ return img[:,:,h0:h1,w0:w1]
159
+
160
+ def flip_image(img):
161
+ assert(img.dim()==4)
162
+ with torch.cuda.device_of(img):
163
+ idx = torch.arange(img.size(3)-1, -1, -1).type_as(img).long()
164
+ return img.index_select(3, idx)
additional_utils/models.py ADDED
@@ -0,0 +1,250 @@
1
+ ###########################################################################
2
+ # Referred to: https://github.com/zhanghang1989/PyTorch-Encoding
3
+ ###########################################################################
4
+ import math
5
+ import numpy as np
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ from torch.nn.parallel.data_parallel import DataParallel
11
+ from torch.nn.parallel.scatter_gather import scatter
12
+ import threading
13
+ import torch
14
+ from torch.cuda._utils import _get_device_index
15
+ from torch.cuda.amp import autocast
16
+ from torch._utils import ExceptionWrapper
17
+
18
+ up_kwargs = {'mode': 'bilinear', 'align_corners': True}
19
+
20
+ __all__ = ['LSeg_MultiEvalModule']
21
+
22
+
23
+ class LSeg_MultiEvalModule(DataParallel):
24
+ """Multi-size Segmentation Eavluator"""
25
+ def __init__(self, module, device_ids=None, flip=True,
26
+ scales=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75]):
27
+ super(LSeg_MultiEvalModule, self).__init__(module, device_ids)
28
+ self.base_size = module.base_size
29
+ self.crop_size = module.crop_size
30
+ self.scales = scales
31
+ self.flip = flip
32
+ print('MultiEvalModule: base_size {}, crop_size {}'. \
33
+ format(self.base_size, self.crop_size))
34
+
35
+ def parallel_forward(self, inputs, label_set='', **kwargs):
36
+ """Multi-GPU Mult-size Evaluation
37
+
38
+ Args:
39
+ inputs: list of Tensors
40
+ """
41
+ if len(label_set) < 10:
42
+ print('** MultiEvalModule parallel_forward phase: {} **'.format(label_set))
43
+ self.nclass = len(label_set)
44
+ inputs = [(input.unsqueeze(0).cuda(device),)
45
+ for input, device in zip(inputs, self.device_ids)]
46
+ replicas = self.replicate(self, self.device_ids[:len(inputs)])
47
+ kwargs = scatter(kwargs, self.device_ids[:len(inputs)], dim=0) if kwargs else []
48
+ if len(inputs) < len(kwargs):
49
+ inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
50
+ elif len(kwargs) < len(inputs):
51
+ kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
52
+ outputs = parallel_apply(replicas, inputs, label_set, kwargs)
53
+ return outputs
54
+
55
+ def forward(self, image, label_set=''):
56
+ """Mult-size Evaluation"""
57
+ # only single image is supported for evaluation
58
+ if len(label_set) < 10:
59
+ print('** MultiEvalModule forward phase: {} **'.format(label_set))
60
+ batch, _, h, w = image.size()
61
+ assert(batch == 1)
62
+ self.nclass = len(label_set)
63
+ stride_rate = 2.0/3.0
64
+ crop_size = self.crop_size
65
+ stride = int(crop_size * stride_rate)
66
+ with torch.cuda.device_of(image):
67
+ scores = image.new().resize_(batch,self.nclass,h,w).zero_().cuda()
68
+
69
+ for scale in self.scales:
70
+ long_size = int(math.ceil(self.base_size * scale))
71
+ if h > w:
72
+ height = long_size
73
+ width = int(1.0 * w * long_size / h + 0.5)
74
+ short_size = width
75
+ else:
76
+ width = long_size
77
+ height = int(1.0 * h * long_size / w + 0.5)
78
+ short_size = height
79
+ """
80
+ short_size = int(math.ceil(self.base_size * scale))
81
+ if h > w:
82
+ width = short_size
83
+ height = int(1.0 * h * short_size / w)
84
+ long_size = height
85
+ else:
86
+ height = short_size
87
+ width = int(1.0 * w * short_size / h)
88
+ long_size = width
89
+ """
90
+ # resize image to current size
91
+ cur_img = resize_image(image, height, width, **self.module._up_kwargs)
92
+ if long_size <= crop_size:
93
+ pad_img = pad_image(cur_img, self.module.mean,
94
+ self.module.std, crop_size)
95
+ outputs = module_inference(self.module, pad_img, label_set, self.flip)
96
+ outputs = crop_image(outputs, 0, height, 0, width)
97
+ else:
98
+ if short_size < crop_size:
99
+ # pad if needed
100
+ pad_img = pad_image(cur_img, self.module.mean,
101
+ self.module.std, crop_size)
102
+ else:
103
+ pad_img = cur_img
104
+ _,_,ph,pw = pad_img.shape #.size()
105
+ assert(ph >= height and pw >= width)
106
+ # grid forward and normalize
107
+ h_grids = int(math.ceil(1.0 * (ph-crop_size)/stride)) + 1
108
+ w_grids = int(math.ceil(1.0 * (pw-crop_size)/stride)) + 1
109
+ with torch.cuda.device_of(image):
110
+ outputs = image.new().resize_(batch,self.nclass,ph,pw).zero_().cuda()
111
+ count_norm = image.new().resize_(batch,1,ph,pw).zero_().cuda()
112
+ # grid evaluation
113
+ for idh in range(h_grids):
114
+ for idw in range(w_grids):
115
+ h0 = idh * stride
116
+ w0 = idw * stride
117
+ h1 = min(h0 + crop_size, ph)
118
+ w1 = min(w0 + crop_size, pw)
119
+ crop_img = crop_image(pad_img, h0, h1, w0, w1)
120
+ # pad if needed
121
+ pad_crop_img = pad_image(crop_img, self.module.mean,
122
+ self.module.std, crop_size)
123
+ output = module_inference(self.module, pad_crop_img, label_set, self.flip)
124
+ outputs[:,:,h0:h1,w0:w1] += crop_image(output,
125
+ 0, h1-h0, 0, w1-w0)
126
+ count_norm[:,:,h0:h1,w0:w1] += 1
127
+ assert((count_norm==0).sum()==0)
128
+ outputs = outputs / count_norm
129
+ outputs = outputs[:,:,:height,:width]
130
+ score = resize_image(outputs, h, w, **self.module._up_kwargs)
131
+ scores += score
132
+ return scores
133
+
134
+ def module_inference(module, image, label_set, flip=True):
135
+ output = module.evaluate_random(image, label_set)
136
+ if flip:
137
+ fimg = flip_image(image)
138
+ foutput = module.evaluate_random(fimg, label_set)
139
+ output += flip_image(foutput)
140
+ return output
141
+
142
+ def resize_image(img, h, w, **up_kwargs):
143
+ return F.interpolate(img, (h, w), **up_kwargs)
144
+
145
+ def pad_image(img, mean, std, crop_size):
146
+ b,c,h,w = img.shape #.size()
147
+ assert(c==3)
148
+ padh = crop_size - h if h < crop_size else 0
149
+ padw = crop_size - w if w < crop_size else 0
150
+ pad_values = -np.array(mean) / np.array(std)
151
+ img_pad = img.new().resize_(b,c,h+padh,w+padw)
152
+ for i in range(c):
153
+ # note that PyTorch pad params are given in reverse order (last dimension first)
154
+ img_pad[:,i,:,:] = F.pad(img[:,i,:,:], (0, padw, 0, padh), value=pad_values[i])
155
+ assert(img_pad.size(2)>=crop_size and img_pad.size(3)>=crop_size)
156
+ return img_pad
157
+
158
+ def crop_image(img, h0, h1, w0, w1):
159
+ return img[:,:,h0:h1,w0:w1]
160
+
161
+ def flip_image(img):
162
+ assert(img.dim()==4)
163
+ with torch.cuda.device_of(img):
164
+ idx = torch.arange(img.size(3)-1, -1, -1).type_as(img).long()
165
+ return img.index_select(3, idx)
166
+
167
+
168
+ def get_a_var(obj):
169
+ if isinstance(obj, torch.Tensor):
170
+ return obj
171
+
172
+ if isinstance(obj, list) or isinstance(obj, tuple):
173
+ for result in map(get_a_var, obj):
174
+ if isinstance(result, torch.Tensor):
175
+ return result
176
+ if isinstance(obj, dict):
177
+ for result in map(get_a_var, obj.items()):
178
+ if isinstance(result, torch.Tensor):
179
+ return result
180
+ return None
181
+
182
+
183
+ def parallel_apply(modules, inputs, label_set, kwargs_tup=None, devices=None):
184
+ r"""Applies each `module` in :attr:`modules` in parallel on arguments
185
+ contained in :attr:`inputs` (positional) and :attr:`kwargs_tup` (keyword)
186
+ on each of :attr:`devices`.
187
+
188
+ Args:
189
+ modules (Module): modules to be parallelized
190
+ inputs (tensor): inputs to the modules
191
+ devices (list of int or torch.device): CUDA devices
192
+
193
+ :attr:`modules`, :attr:`inputs`, :attr:`kwargs_tup` (if given), and
194
+ :attr:`devices` (if given) should all have same length. Moreover, each
195
+ element of :attr:`inputs` can either be a single object as the only argument
196
+ to a module, or a collection of positional arguments.
197
+ """
198
+ assert len(modules) == len(inputs)
199
+ if kwargs_tup is not None:
200
+ assert len(modules) == len(kwargs_tup)
201
+ else:
202
+ kwargs_tup = ({},) * len(modules)
203
+ if devices is not None:
204
+ assert len(modules) == len(devices)
205
+ else:
206
+ devices = [None] * len(modules)
207
+ devices = [_get_device_index(x, True) for x in devices]
208
+ lock = threading.Lock()
209
+ results = {}
210
+ grad_enabled, autocast_enabled = torch.is_grad_enabled(), torch.is_autocast_enabled()
211
+
212
+ def _worker(i, module, input, label_set, kwargs, device=None):
213
+ torch.set_grad_enabled(grad_enabled)
214
+ if device is None:
215
+ device = get_a_var(input).get_device()
216
+ try:
217
+ with torch.cuda.device(device), autocast(enabled=autocast_enabled):
218
+ # this also avoids accidental slicing of `input` if it is a Tensor
219
+ if not isinstance(input, (list, tuple)):
220
+ input = (input,)
221
+ output = module(*input, label_set, **kwargs)
222
+ with lock:
223
+ results[i] = output
224
+ except Exception:
225
+ with lock:
226
+ results[i] = ExceptionWrapper(
227
+ where="in replica {} on device {}".format(i, device))
228
+
229
+ if len(modules) > 1:
230
+ threads = [threading.Thread(target=_worker,
231
+ args=(i, module, input, label_set, kwargs, device))
232
+ for i, (module, input, kwargs, device) in
233
+ enumerate(zip(modules, inputs, kwargs_tup, devices))]
234
+
235
+ for thread in threads:
236
+ thread.start()
237
+ for thread in threads:
238
+ thread.join()
239
+ else:
240
+ _worker(0, modules[0], inputs[0], label_set, kwargs_tup[0], devices[0])
241
+
242
+ outputs = []
243
+ for i in range(len(inputs)):
244
+ output = results[i]
245
+ if isinstance(output, ExceptionWrapper):
246
+ output.reraise()
247
+ outputs.append(output)
248
+ return outputs
249
+
250
+
data/__init__.py ADDED
@@ -0,0 +1,24 @@
+ import copy
+
+ import itertools
+ import functools
+ import numpy as np
+ import torch
+ import torch.utils.data
+ import torchvision.transforms as torch_transforms
+ import encoding.datasets as enc_ds
+
+ encoding_datasets = {
+     x: functools.partial(enc_ds.get_dataset, x)
+     for x in ["coco", "ade20k", "pascal_voc", "pascal_aug", "pcontext", "citys"]
+ }
+
+
+ def get_dataset(name, **kwargs):
+     if name in encoding_datasets:
+         return encoding_datasets[name.lower()](**kwargs)
+     assert False, f"dataset {name} not found"
+
+
+ def get_available_datasets():
+     return list(encoding_datasets.keys())
label_files/ade20k_objectInfo150.txt ADDED
@@ -0,0 +1,151 @@
1
+ Idx,Ratio,Train,Val,Stuff,Name
2
+ 1,0.1576,11664,1172,1,wall
3
+ 2,0.1072,6046,612,1,building;edifice
4
+ 3,0.0878,8265,796,1,sky
5
+ 4,0.0621,9336,917,1,floor;flooring
6
+ 5,0.0480,6678,641,0,tree
7
+ 6,0.0450,6604,643,1,ceiling
8
+ 7,0.0398,4023,408,1,road;route
9
+ 8,0.0231,1906,199,0,bed
10
+ 9,0.0198,4688,460,0,windowpane;window
11
+ 10,0.0183,2423,225,1,grass
12
+ 11,0.0181,2874,294,0,cabinet
13
+ 12,0.0166,3068,310,1,sidewalk;pavement
14
+ 13,0.0160,5075,526,0,person;individual;someone;somebody;mortal;soul
15
+ 14,0.0151,1804,190,1,earth;ground
16
+ 15,0.0118,6666,796,0,door;double;door
17
+ 16,0.0110,4269,411,0,table
18
+ 17,0.0109,1691,160,1,mountain;mount
19
+ 18,0.0104,3999,441,0,plant;flora;plant;life
20
+ 19,0.0104,2149,217,0,curtain;drape;drapery;mantle;pall
21
+ 20,0.0103,3261,318,0,chair
22
+ 21,0.0098,3164,306,0,car;auto;automobile;machine;motorcar
23
+ 22,0.0074,709,75,1,water
24
+ 23,0.0067,3296,315,0,painting;picture
25
+ 24,0.0065,1191,106,0,sofa;couch;lounge
26
+ 25,0.0061,1516,162,0,shelf
27
+ 26,0.0060,667,69,1,house
28
+ 27,0.0053,651,57,1,sea
29
+ 28,0.0052,1847,224,0,mirror
30
+ 29,0.0046,1158,128,1,rug;carpet;carpeting
31
+ 30,0.0044,480,44,1,field
32
+ 31,0.0044,1172,98,0,armchair
33
+ 32,0.0044,1292,184,0,seat
34
+ 33,0.0033,1386,138,0,fence;fencing
35
+ 34,0.0031,698,61,0,desk
36
+ 35,0.0030,781,73,0,rock;stone
37
+ 36,0.0027,380,43,0,wardrobe;closet;press
38
+ 37,0.0026,3089,302,0,lamp
39
+ 38,0.0024,404,37,0,bathtub;bathing;tub;bath;tub
40
+ 39,0.0024,804,99,0,railing;rail
41
+ 40,0.0023,1453,153,0,cushion
42
+ 41,0.0023,411,37,0,base;pedestal;stand
43
+ 42,0.0022,1440,162,0,box
44
+ 43,0.0022,800,77,0,column;pillar
45
+ 44,0.0020,2650,298,0,signboard;sign
46
+ 45,0.0019,549,46,0,chest;of;drawers;chest;bureau;dresser
47
+ 46,0.0019,367,36,0,counter
48
+ 47,0.0018,311,30,1,sand
49
+ 48,0.0018,1181,122,0,sink
50
+ 49,0.0018,287,23,1,skyscraper
51
+ 50,0.0018,468,38,0,fireplace;hearth;open;fireplace
52
+ 51,0.0018,402,43,0,refrigerator;icebox
53
+ 52,0.0018,130,12,1,grandstand;covered;stand
54
+ 53,0.0018,561,64,1,path
55
+ 54,0.0017,880,102,0,stairs;steps
56
+ 55,0.0017,86,12,1,runway
57
+ 56,0.0017,172,11,0,case;display;case;showcase;vitrine
58
+ 57,0.0017,198,18,0,pool;table;billiard;table;snooker;table
59
+ 58,0.0017,930,109,0,pillow
60
+ 59,0.0015,139,18,0,screen;door;screen
61
+ 60,0.0015,564,52,1,stairway;staircase
62
+ 61,0.0015,320,26,1,river
63
+ 62,0.0015,261,29,1,bridge;span
64
+ 63,0.0014,275,22,0,bookcase
65
+ 64,0.0014,335,60,0,blind;screen
66
+ 65,0.0014,792,75,0,coffee;table;cocktail;table
67
+ 66,0.0014,395,49,0,toilet;can;commode;crapper;pot;potty;stool;throne
68
+ 67,0.0014,1309,138,0,flower
69
+ 68,0.0013,1112,113,0,book
70
+ 69,0.0013,266,27,1,hill
71
+ 70,0.0013,659,66,0,bench
72
+ 71,0.0012,331,31,0,countertop
73
+ 72,0.0012,531,56,0,stove;kitchen;stove;range;kitchen;range;cooking;stove
74
+ 73,0.0012,369,36,0,palm;palm;tree
75
+ 74,0.0012,144,9,0,kitchen;island
76
+ 75,0.0011,265,29,0,computer;computing;machine;computing;device;data;processor;electronic;computer;information;processing;system
77
+ 76,0.0010,324,33,0,swivel;chair
78
+ 77,0.0009,304,27,0,boat
79
+ 78,0.0009,170,20,0,bar
80
+ 79,0.0009,68,6,0,arcade;machine
81
+ 80,0.0009,65,8,1,hovel;hut;hutch;shack;shanty
82
+ 81,0.0009,248,25,0,bus;autobus;coach;charabanc;double-decker;jitney;motorbus;motorcoach;omnibus;passenger;vehicle
83
+ 82,0.0008,492,49,0,towel
84
+ 83,0.0008,2510,269,0,light;light;source
85
+ 84,0.0008,440,39,0,truck;motortruck
86
+ 85,0.0008,147,18,1,tower
87
+ 86,0.0008,583,56,0,chandelier;pendant;pendent
88
+ 87,0.0007,533,61,0,awning;sunshade;sunblind
89
+ 88,0.0007,1989,239,0,streetlight;street;lamp
90
+ 89,0.0007,71,5,0,booth;cubicle;stall;kiosk
91
+ 90,0.0007,618,53,0,television;television;receiver;television;set;tv;tv;set;idiot;box;boob;tube;telly;goggle;box
92
+ 91,0.0007,135,12,0,airplane;aeroplane;plane
93
+ 92,0.0007,83,5,1,dirt;track
94
+ 93,0.0007,178,17,0,apparel;wearing;apparel;dress;clothes
95
+ 94,0.0006,1003,104,0,pole
96
+ 95,0.0006,182,12,1,land;ground;soil
97
+ 96,0.0006,452,50,0,bannister;banister;balustrade;balusters;handrail
98
+ 97,0.0006,42,6,1,escalator;moving;staircase;moving;stairway
99
+ 98,0.0006,307,31,0,ottoman;pouf;pouffe;puff;hassock
100
+ 99,0.0006,965,114,0,bottle
101
+ 100,0.0006,117,13,0,buffet;counter;sideboard
102
+ 101,0.0006,354,35,0,poster;posting;placard;notice;bill;card
103
+ 102,0.0006,108,9,1,stage
104
+ 103,0.0006,557,55,0,van
105
+ 104,0.0006,52,4,0,ship
106
+ 105,0.0005,99,5,0,fountain
107
+ 106,0.0005,57,4,1,conveyer;belt;conveyor;belt;conveyer;conveyor;transporter
108
+ 107,0.0005,292,31,0,canopy
109
+ 108,0.0005,77,9,0,washer;automatic;washer;washing;machine
110
+ 109,0.0005,340,38,0,plaything;toy
111
+ 110,0.0005,66,3,1,swimming;pool;swimming;bath;natatorium
112
+ 111,0.0005,465,49,0,stool
113
+ 112,0.0005,50,4,0,barrel;cask
114
+ 113,0.0005,622,75,0,basket;handbasket
115
+ 114,0.0005,80,9,1,waterfall;falls
116
+ 115,0.0005,59,3,0,tent;collapsible;shelter
117
+ 116,0.0005,531,72,0,bag
118
+ 117,0.0005,282,30,0,minibike;motorbike
119
+ 118,0.0005,73,7,0,cradle
120
+ 119,0.0005,435,44,0,oven
121
+ 120,0.0005,136,25,0,ball
122
+ 121,0.0005,116,24,0,food;solid;food
123
+ 122,0.0004,266,31,0,step;stair
124
+ 123,0.0004,58,12,0,tank;storage;tank
125
+ 124,0.0004,418,83,0,trade;name;brand;name;brand;marque
126
+ 125,0.0004,319,43,0,microwave;microwave;oven
127
+ 126,0.0004,1193,139,0,pot;flowerpot
128
+ 127,0.0004,97,23,0,animal;animate;being;beast;brute;creature;fauna
129
+ 128,0.0004,347,36,0,bicycle;bike;wheel;cycle
130
+ 129,0.0004,52,5,1,lake
131
+ 130,0.0004,246,22,0,dishwasher;dish;washer;dishwashing;machine
132
+ 131,0.0004,108,13,0,screen;silver;screen;projection;screen
133
+ 132,0.0004,201,30,0,blanket;cover
134
+ 133,0.0004,285,21,0,sculpture
135
+ 134,0.0004,268,27,0,hood;exhaust;hood
136
+ 135,0.0003,1020,108,0,sconce
137
+ 136,0.0003,1282,122,0,vase
138
+ 137,0.0003,528,65,0,traffic;light;traffic;signal;stoplight
139
+ 138,0.0003,453,57,0,tray
140
+ 139,0.0003,671,100,0,ashcan;trash;can;garbage;can;wastebin;ash;bin;ash-bin;ashbin;dustbin;trash;barrel;trash;bin
141
+ 140,0.0003,397,44,0,fan
142
+ 141,0.0003,92,8,1,pier;wharf;wharfage;dock
143
+ 142,0.0003,228,18,0,crt;screen
144
+ 143,0.0003,570,59,0,plate
145
+ 144,0.0003,217,22,0,monitor;monitoring;device
146
+ 145,0.0003,206,19,0,bulletin;board;notice;board
147
+ 146,0.0003,130,14,0,shower
148
+ 147,0.0003,178,28,0,radiator
149
+ 148,0.0002,504,57,0,glass;drinking;glass
150
+ 149,0.0002,775,96,0,clock
151
+ 150,0.0002,421,56,0,flag
lseg_app.py ADDED
@@ -0,0 +1,386 @@
1
+ from collections import namedtuple
2
+ import altair as alt
3
+ import math
4
+ import pandas as pd
5
+ import streamlit as st
6
+ st.set_page_config(layout="wide")
7
+
8
+ from PIL import Image
9
+
10
+ import os
11
+ import torch
12
+
13
+ import os
14
+ import argparse
15
+ import numpy as np
16
+ from tqdm import tqdm
17
+ from collections import OrderedDict
18
+
19
+ import torch
20
+ import torch.nn.functional as F
21
+ from torch.utils import data
22
+ import torchvision.transforms as transform
23
+ from torch.nn.parallel.scatter_gather import gather
24
+
25
+ from additional_utils.models import LSeg_MultiEvalModule
26
+ from modules.lseg_module import LSegModule
27
+
28
+ import cv2
29
+ import math
30
+ import types
31
+ import functools
32
+ import torchvision.transforms as torch_transforms
33
+ import copy
34
+ import itertools
35
+ from PIL import Image
36
+ import matplotlib.pyplot as plt
37
+ import clip
38
+ from encoding.models.sseg import BaseNet
39
+ import matplotlib as mpl
40
+ import matplotlib.colors as mplc
41
+ import matplotlib.figure as mplfigure
42
+ import matplotlib.patches as mpatches
43
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
44
+ from data import get_dataset
45
+ import torchvision.transforms as transforms
46
+
47
+
48
+ def get_new_pallete(num_cls):
49
+ n = num_cls
50
+ pallete = [0]*(n*3)
51
+ for j in range(0,n):
52
+ lab = j
53
+ pallete[j*3+0] = 0
54
+ pallete[j*3+1] = 0
55
+ pallete[j*3+2] = 0
56
+ i = 0
57
+ while (lab > 0):
58
+ pallete[j*3+0] |= (((lab >> 0) & 1) << (7-i))
59
+ pallete[j*3+1] |= (((lab >> 1) & 1) << (7-i))
60
+ pallete[j*3+2] |= (((lab >> 2) & 1) << (7-i))
61
+ i = i + 1
62
+ lab >>= 3
63
+ return pallete
64
+
65
+ def get_new_mask_pallete(npimg, new_palette, out_label_flag=False, labels=None):
66
+ """Get image color pallete for visualizing masks"""
67
+ # put colormap
68
+ out_img = Image.fromarray(npimg.squeeze().astype('uint8'))
69
+ out_img.putpalette(new_palette)
70
+
71
+ if out_label_flag:
72
+ assert labels is not None
73
+ u_index = np.unique(npimg)
74
+ patches = []
75
+ for i, index in enumerate(u_index):
76
+ label = labels[index]
77
+ cur_color = [new_palette[index * 3] / 255.0, new_palette[index * 3 + 1] / 255.0, new_palette[index * 3 + 2] / 255.0]
78
+ red_patch = mpatches.Patch(color=cur_color, label=label)
79
+ patches.append(red_patch)
80
+ return out_img, patches
81
+
82
+ @st.cache(allow_output_mutation=True)
83
+ def load_model():
84
+ class Options:
85
+ def __init__(self):
86
+ parser = argparse.ArgumentParser(description="PyTorch Segmentation")
87
+ # model and dataset
88
+ parser.add_argument(
89
+ "--model", type=str, default="encnet", help="model name (default: encnet)"
90
+ )
91
+ parser.add_argument(
92
+ "--backbone",
93
+ type=str,
94
+ default="clip_vitl16_384",
95
+ help="backbone name (default: resnet50)",
96
+ )
97
+ parser.add_argument(
98
+ "--dataset",
99
+ type=str,
100
+ default="ade20k",
101
+ help="dataset name (default: pascal12)",
102
+ )
103
+ parser.add_argument(
104
+ "--workers", type=int, default=16, metavar="N", help="dataloader threads"
105
+ )
106
+ parser.add_argument(
107
+ "--base-size", type=int, default=520, help="base image size"
108
+ )
109
+ parser.add_argument(
110
+ "--crop-size", type=int, default=480, help="crop image size"
111
+ )
112
+ parser.add_argument(
113
+ "--train-split",
114
+ type=str,
115
+ default="train",
116
+ help="dataset train split (default: train)",
117
+ )
118
+ parser.add_argument(
119
+ "--aux", action="store_true", default=False, help="Auxilary Loss"
120
+ )
121
+ parser.add_argument(
122
+ "--se-loss",
123
+ action="store_true",
124
+ default=False,
125
+ help="Semantic Encoding Loss SE-loss",
126
+ )
127
+ parser.add_argument(
128
+ "--se-weight", type=float, default=0.2, help="SE-loss weight (default: 0.2)"
129
+ )
130
+ parser.add_argument(
131
+ "--batch-size",
132
+ type=int,
133
+ default=16,
134
+ metavar="N",
135
+ help="input batch size for \
136
+ training (default: auto)",
137
+ )
138
+ parser.add_argument(
139
+ "--test-batch-size",
140
+ type=int,
141
+ default=16,
142
+ metavar="N",
143
+ help="input batch size for \
144
+ testing (default: same as batch size)",
145
+ )
146
+ # cuda, seed and logging
147
+ parser.add_argument(
148
+ "--no-cuda",
149
+ action="store_true",
150
+ default=False,
151
+ help="disables CUDA training",
152
+ )
153
+ parser.add_argument(
154
+ "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
155
+ )
156
+ # checking point
157
+ parser.add_argument(
158
+ "--weights", type=str, default='', help="checkpoint to test"
159
+ )
160
+ # evaluation option
161
+ parser.add_argument(
162
+ "--eval", action="store_true", default=False, help="evaluating mIoU"
163
+ )
164
+ parser.add_argument(
165
+ "--export",
166
+ type=str,
167
+ default=None,
168
+ help="put the path to resuming file if needed",
169
+ )
170
+ parser.add_argument(
171
+ "--acc-bn",
172
+ action="store_true",
173
+ default=False,
174
+ help="Re-accumulate BN statistics",
175
+ )
176
+ parser.add_argument(
177
+ "--test-val",
178
+ action="store_true",
179
+ default=False,
180
+ help="generate masks on val set",
181
+ )
182
+ parser.add_argument(
183
+ "--no-val",
184
+ action="store_true",
185
+ default=False,
186
+ help="skip validation during training",
187
+ )
188
+
189
+ parser.add_argument(
190
+ "--module",
191
+ default='lseg',
192
+ help="select model definition",
193
+ )
194
+
195
+ # test option
196
+ parser.add_argument(
197
+ "--data-path", type=str, default='../datasets/', help="path to test image folder"
198
+ )
199
+
200
+ parser.add_argument(
201
+ "--no-scaleinv",
202
+ dest="scale_inv",
203
+ default=True,
204
+ action="store_false",
205
+ help="turn off scaleinv layers",
206
+ )
207
+
208
+ parser.add_argument(
209
+ "--widehead", default=False, action="store_true", help="wider output head"
210
+ )
211
+
212
+ parser.add_argument(
213
+ "--widehead_hr",
214
+ default=False,
215
+ action="store_true",
216
+ help="wider output head",
217
+ )
218
+ parser.add_argument(
219
+ "--ignore_index",
220
+ type=int,
221
+ default=-1,
222
+ help="numeric value of ignore label in gt",
223
+ )
224
+
225
+ parser.add_argument(
226
+ "--label_src",
227
+ type=str,
228
+ default="default",
229
+ help="how to get the labels",
230
+ )
231
+
232
+ parser.add_argument(
233
+ "--arch_option",
234
+ type=int,
235
+ default=0,
236
+ help="which kind of architecture to be used",
237
+ )
238
+
239
+ parser.add_argument(
240
+ "--block_depth",
241
+ type=int,
242
+ default=0,
243
+ help="how many blocks should be used",
244
+ )
245
+
246
+ parser.add_argument(
247
+ "--activation",
248
+ choices=['lrelu', 'tanh'],
249
+ default="lrelu",
250
+ help="use which activation to activate the block",
251
+ )
252
+
253
+ self.parser = parser
254
+
255
+ def parse(self):
256
+ args = self.parser.parse_args(args=[])
257
+ args.cuda = not args.no_cuda and torch.cuda.is_available()
258
+ print(args)
259
+ return args
260
+
261
+ args = Options().parse()
262
+
263
+ torch.manual_seed(args.seed)
264
+ args.test_batch_size = 1
265
+ alpha=0.5
266
+
267
+ args.scale_inv = False
268
+ args.widehead = True
269
+ args.dataset = 'ade20k'
270
+ args.backbone = 'clip_vitl16_384'
271
+ args.weights = 'checkpoints/demo_e200.ckpt'
272
+ args.ignore_index = 255
273
+
274
+ module = LSegModule.load_from_checkpoint(
275
+ checkpoint_path=args.weights,
276
+ data_path=args.data_path,
277
+ dataset=args.dataset,
278
+ backbone=args.backbone,
279
+ aux=args.aux,
280
+ num_features=256,
281
+ aux_weight=0,
282
+ se_loss=False,
283
+ se_weight=0,
284
+ base_lr=0,
285
+ batch_size=1,
286
+ max_epochs=0,
287
+ ignore_index=args.ignore_index,
288
+ dropout=0.0,
289
+ scale_inv=args.scale_inv,
290
+ augment=False,
291
+ no_batchnorm=False,
292
+ widehead=args.widehead,
293
+ widehead_hr=args.widehead_hr,
294
+ map_location="cpu",
295
+ arch_option=0,
296
+ block_depth=0,
297
+ activation='lrelu',
298
+ )
299
+
300
+ input_transform = module.val_transform
301
+
302
+ # dataloader
303
+ loader_kwargs = (
304
+ {"num_workers": args.workers, "pin_memory": True} if args.cuda else {}
305
+ )
306
+
307
+ # model
308
+ if isinstance(module.net, BaseNet):
309
+ model = module.net
310
+ else:
311
+ model = module
312
+
313
+ model = model.eval()
314
+ model = model.cpu()
315
+ scales = (
316
+ [0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25]
317
+ if args.dataset == "citys"
318
+ else [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
319
+ )
320
+
321
+ model.mean = [0.5, 0.5, 0.5]
322
+ model.std = [0.5, 0.5, 0.5]
323
+ evaluator = LSeg_MultiEvalModule(
324
+ model, scales=scales, flip=True
325
+ ).cuda()
326
+ evaluator.eval()
327
+
328
+ transform = transforms.Compose(
329
+ [
330
+ transforms.ToTensor(),
331
+ transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
332
+ transforms.Resize([360,480]),
333
+ ]
334
+ )
335
+
336
+ return evaluator, transform
337
+
338
+ """
339
+ # LSeg Demo
340
+ """
341
+ lseg_model, lseg_transform = load_model()
342
+ uploaded_file = st.file_uploader("Choose an image...")
343
+ input_labels = st.text_input("Input labels", value="dog, grass, other")
344
+ st.write("The labels are", input_labels)
345
+
346
+ if uploaded_file is not None:
347
+ image = Image.open(uploaded_file)
348
+ pimage = lseg_transform(np.array(image)).unsqueeze(0)
349
+
350
+ labels = []
351
+ for label in input_labels.split(","):
352
+ labels.append(label.strip())
353
+
354
+ with torch.no_grad():
355
+ outputs = lseg_model.parallel_forward(pimage, labels)
356
+
357
+ predicts = [
358
+ torch.max(output, 1)[1].cpu().numpy()
359
+ for output in outputs
360
+ ]
361
+
362
+ image = pimage[0].permute(1,2,0)
363
+ image = image * 0.5 + 0.5
364
+ image = Image.fromarray(np.uint8(255*image)).convert("RGBA")
365
+
366
+ pred = predicts[0]
367
+ new_palette = get_new_pallete(len(labels))
368
+ mask, patches = get_new_mask_pallete(pred, new_palette, out_label_flag=True, labels=labels)
369
+ seg = mask.convert("RGBA")
370
+
371
+ fig = plt.figure()
372
+ plt.subplot(121)
373
+ plt.imshow(image)
374
+ plt.axis('off')
375
+
376
+ plt.subplot(122)
377
+ plt.imshow(seg)
378
+ plt.legend(handles=patches, loc='upper right', bbox_to_anchor=(1.3, 1), prop={'size': 5})
379
+ plt.axis('off')
380
+
381
+ plt.tight_layout()
382
+
383
+ #st.image([image,seg], width=700, caption=["Input image", "Segmentation"])
384
+ st.pyplot(fig)
385
+
386
+
modules/lseg_module.py ADDED
@@ -0,0 +1,183 @@
1
+ import re
2
+ import torch
3
+ import torch.nn as nn
4
+ import torchvision.transforms as transforms
5
+ from argparse import ArgumentParser
6
+ import pytorch_lightning as pl
7
+ from .lsegmentation_module import LSegmentationModule
8
+ from .models.lseg_net import LSegNet
9
+ from encoding.models.sseg.base import up_kwargs
10
+
11
+ import os
12
+ import clip
13
+ import numpy as np
14
+
15
+ from scipy import signal
16
+ import glob
17
+
18
+ from PIL import Image
19
+ import matplotlib.pyplot as plt
20
+ import pandas as pd
21
+
22
+
23
+ class LSegModule(LSegmentationModule):
24
+ def __init__(self, data_path, dataset, batch_size, base_lr, max_epochs, **kwargs):
25
+ super(LSegModule, self).__init__(
26
+ data_path, dataset, batch_size, base_lr, max_epochs, **kwargs
27
+ )
28
+
29
+ if dataset == "citys":
30
+ self.base_size = 2048
31
+ self.crop_size = 768
32
+ else:
33
+ self.base_size = 520
34
+ self.crop_size = 480
35
+
36
+ use_pretrained = True
37
+ norm_mean= [0.5, 0.5, 0.5]
38
+ norm_std = [0.5, 0.5, 0.5]
39
+
40
+ print('** Use norm {}, {} as the mean and std **'.format(norm_mean, norm_std))
41
+
42
+ train_transform = [
43
+ transforms.ToTensor(),
44
+ transforms.Normalize(norm_mean, norm_std),
45
+ ]
46
+
47
+ val_transform = [
48
+ transforms.ToTensor(),
49
+ transforms.Normalize(norm_mean, norm_std),
50
+ ]
51
+
52
+ self.train_transform = transforms.Compose(train_transform)
53
+ self.val_transform = transforms.Compose(val_transform)
54
+
55
+ self.trainset = self.get_trainset(
56
+ dataset,
57
+ augment=kwargs["augment"],
58
+ base_size=self.base_size,
59
+ crop_size=self.crop_size,
60
+ )
61
+
62
+ self.valset = self.get_valset(
63
+ dataset,
64
+ augment=kwargs["augment"],
65
+ base_size=self.base_size,
66
+ crop_size=self.crop_size,
67
+ )
68
+
69
+ use_batchnorm = (
70
+ (not kwargs["no_batchnorm"]) if "no_batchnorm" in kwargs else True
71
+ )
72
+ # print(kwargs)
73
+
74
+ labels = self.get_labels('ade20k')
75
+
76
+ self.net = LSegNet(
77
+ labels=labels,
78
+ backbone=kwargs["backbone"],
79
+ features=kwargs["num_features"],
80
+ crop_size=self.crop_size,
81
+ arch_option=kwargs["arch_option"],
82
+ block_depth=kwargs["block_depth"],
83
+ activation=kwargs["activation"],
84
+ )
85
+
86
+ self.net.pretrained.model.patch_embed.img_size = (
87
+ self.crop_size,
88
+ self.crop_size,
89
+ )
90
+
91
+ self._up_kwargs = up_kwargs
92
+ self.mean = norm_mean
93
+ self.std = norm_std
94
+
95
+ self.criterion = self.get_criterion(**kwargs)
96
+
97
+ def get_labels(self, dataset):
98
+ labels = []
99
+ path = 'label_files/{}_objectInfo150.txt'.format(dataset)
100
+ assert os.path.exists(path), '*** Error: {} does not exist !!!'.format(path)
101
+ f = open(path, 'r')
102
+ lines = f.readlines()
103
+ for line in lines:
104
+ label = line.strip().split(',')[-1].split(';')[0]
105
+ labels.append(label)
106
+ f.close()
107
+ if dataset in ['ade20k']:
108
+ labels = labels[1:]
109
+ return labels
110
+
111
+
112
+ @staticmethod
113
+ def add_model_specific_args(parent_parser):
114
+ parser = LSegmentationModule.add_model_specific_args(parent_parser)
115
+ parser = ArgumentParser(parents=[parser])
116
+
117
+ parser.add_argument(
118
+ "--backbone",
119
+ type=str,
120
+ default="clip_vitl16_384",
121
+ help="backbone network",
122
+ )
123
+
124
+ parser.add_argument(
125
+ "--num_features",
126
+ type=int,
127
+ default=256,
128
+ help="number of featurs that go from encoder to decoder",
129
+ )
130
+
131
+ parser.add_argument("--dropout", type=float, default=0.1, help="dropout rate")
132
+
133
+ parser.add_argument(
134
+ "--finetune_weights", type=str, help="load weights to finetune from"
135
+ )
136
+
137
+ parser.add_argument(
138
+ "--no-scaleinv",
139
+ default=True,
140
+ action="store_false",
141
+ help="turn off scaleinv layers",
142
+ )
143
+
144
+ parser.add_argument(
145
+ "--no-batchnorm",
146
+ default=False,
147
+ action="store_true",
148
+ help="turn off batchnorm",
149
+ )
150
+
151
+ parser.add_argument(
152
+ "--widehead", default=False, action="store_true", help="wider output head"
153
+ )
154
+
155
+ parser.add_argument(
156
+ "--widehead_hr",
157
+ default=False,
158
+ action="store_true",
159
+ help="wider output head",
160
+ )
161
+
162
+ parser.add_argument(
163
+ "--arch_option",
164
+ type=int,
165
+ default=0,
166
+ help="which kind of architecture to be used",
167
+ )
168
+
169
+ parser.add_argument(
170
+ "--block_depth",
171
+ type=int,
172
+ default=0,
173
+ help="how many blocks should be used",
174
+ )
175
+
176
+ parser.add_argument(
177
+ "--activation",
178
+ choices=['lrelu', 'tanh'],
179
+ default="lrelu",
180
+ help="use which activation to activate the block",
181
+ )
182
+
183
+ return parser
modules/lsegmentation_module.py ADDED
@@ -0,0 +1,304 @@
1
+ import types
2
+ import time
3
+ import random
4
+ import clip
5
+ import torch
6
+ import torch.nn as nn
7
+ import torchvision.transforms as transforms
8
+
9
+ from argparse import ArgumentParser
10
+
11
+ import pytorch_lightning as pl
12
+
13
+ from data import get_dataset, get_available_datasets
14
+
15
+ from encoding.models import get_segmentation_model
16
+ from encoding.nn import SegmentationLosses
17
+
18
+ from encoding.utils import batch_pix_accuracy, batch_intersection_union
19
+
20
+ # add mixed precision
21
+ import torch.cuda.amp as amp
22
+ import numpy as np
23
+
24
+ from encoding.utils import SegmentationMetric
25
+
26
+ class LSegmentationModule(pl.LightningModule):
27
+ def __init__(self, data_path, dataset, batch_size, base_lr, max_epochs, **kwargs):
28
+ super().__init__()
29
+
30
+ self.data_path = data_path
31
+ self.batch_size = batch_size
32
+ self.base_lr = base_lr / 16 * batch_size
33
+ self.lr = self.base_lr
34
+
35
+ self.epochs = max_epochs
36
+ self.other_kwargs = kwargs
37
+ self.enabled = False  # True enables mixed precision, which complicates training and can lead to NaN errors
38
+ self.scaler = amp.GradScaler(enabled=self.enabled)
39
+
40
+ def forward(self, x):
41
+ return self.net(x)
42
+
43
+ def evaluate(self, x, target=None):
44
+ pred = self.net.forward(x)
45
+ if isinstance(pred, (tuple, list)):
46
+ pred = pred[0]
47
+ if target is None:
48
+ return pred
49
+ correct, labeled = batch_pix_accuracy(pred.data, target.data)
50
+ inter, union = batch_intersection_union(pred.data, target.data, self.nclass)
51
+
52
+ return correct, labeled, inter, union
53
+
54
+ def evaluate_random(self, x, labelset, target=None):
55
+ pred = self.net.forward(x, labelset)
56
+ if isinstance(pred, (tuple, list)):
57
+ pred = pred[0]
58
+ if target is None:
59
+ return pred
60
+ correct, labeled = batch_pix_accuracy(pred.data, target.data)
61
+ inter, union = batch_intersection_union(pred.data, target.data, self.nclass)
62
+
63
+ return correct, labeled, inter, union
64
+
65
+
66
+ def training_step(self, batch, batch_nb):
67
+ img, target = batch
68
+ with amp.autocast(enabled=self.enabled):
69
+ out = self(img)
70
+ multi_loss = isinstance(out, tuple)
71
+ if multi_loss:
72
+ loss = self.criterion(*out, target)
73
+ else:
74
+ loss = self.criterion(out, target)
75
+ loss = self.scaler.scale(loss)
76
+ final_output = out[0] if multi_loss else out
77
+ train_pred, train_gt = self._filter_invalid(final_output, target)
78
+ if train_gt.nelement() != 0:
79
+ self.train_accuracy(train_pred, train_gt)
80
+ self.log("train_loss", loss)
81
+ return loss
82
+
83
+ def training_epoch_end(self, outs):
84
+ self.log("train_acc_epoch", self.train_accuracy.compute())
85
+
86
+ def validation_step(self, batch, batch_nb):
87
+ img, target = batch
88
+ out = self(img)
89
+ multi_loss = isinstance(out, tuple)
90
+ if multi_loss:
91
+ val_loss = self.criterion(*out, target)
92
+ else:
93
+ val_loss = self.criterion(out, target)
94
+ final_output = out[0] if multi_loss else out
95
+ valid_pred, valid_gt = self._filter_invalid(final_output, target)
96
+ self.val_iou.update(target, final_output)
97
+ pixAcc, iou = self.val_iou.get()
98
+ self.log("val_loss_step", val_loss)
99
+ self.log("pix_acc_step", pixAcc)
100
+ self.log(
101
+ "val_acc_step",
102
+ self.val_accuracy(valid_pred, valid_gt),
103
+ )
104
+ self.log("val_iou", iou)
105
+
106
+ def validation_epoch_end(self, outs):
107
+ pixAcc, iou = self.val_iou.get()
108
+ self.log("val_acc_epoch", self.val_accuracy.compute())
109
+ self.log("val_iou_epoch", iou)
110
+ self.log("pix_acc_epoch", pixAcc)
111
+
112
+ self.val_iou.reset()
113
+
114
+ def _filter_invalid(self, pred, target):
115
+ valid = target != self.other_kwargs["ignore_index"]
116
+ _, mx = torch.max(pred, dim=1)
117
+ return mx[valid], target[valid]
118
+
119
+ def configure_optimizers(self):
120
+ params_list = [
121
+ {"params": self.net.pretrained.parameters(), "lr": self.base_lr},
122
+ ]
123
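+ # head layers below (scratch, auxlayer, scale convs) get 10x the backbone learning rate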
+ if hasattr(self.net, "scratch"):
124
+ print("Found output scratch")
125
+ params_list.append(
126
+ {"params": self.net.scratch.parameters(), "lr": self.base_lr * 10}
127
+ )
128
+ if hasattr(self.net, "auxlayer"):
129
+ print("Found auxlayer")
130
+ params_list.append(
131
+ {"params": self.net.auxlayer.parameters(), "lr": self.base_lr * 10}
132
+ )
133
+ if hasattr(self.net, "scale_inv_conv"):
134
+ print(self.net.scale_inv_conv)
135
+ print("Found scaleinv layers")
136
+ params_list.append(
137
+ {
138
+ "params": self.net.scale_inv_conv.parameters(),
139
+ "lr": self.base_lr * 10,
140
+ }
141
+ )
142
+ params_list.append(
143
+ {"params": self.net.scale2_conv.parameters(), "lr": self.base_lr * 10}
144
+ )
145
+ params_list.append(
146
+ {"params": self.net.scale3_conv.parameters(), "lr": self.base_lr * 10}
147
+ )
148
+ params_list.append(
149
+ {"params": self.net.scale4_conv.parameters(), "lr": self.base_lr * 10}
150
+ )
151
+
152
+ if self.other_kwargs["midasproto"]:
153
+ print("Using midas optimization protocol")
154
+
155
+ opt = torch.optim.Adam(
156
+ params_list,
157
+ lr=self.base_lr,
158
+ betas=(0.9, 0.999),
159
+ weight_decay=self.other_kwargs["weight_decay"],
160
+ )
161
+ sch = torch.optim.lr_scheduler.LambdaLR(
162
+ opt, lambda x: pow(1.0 - x / self.epochs, 0.9)
163
+ )
164
+
165
+ else:
166
+ opt = torch.optim.SGD(
167
+ params_list,
168
+ lr=self.base_lr,
169
+ momentum=0.9,
170
+ weight_decay=self.other_kwargs["weight_decay"],
171
+ )
172
+ sch = torch.optim.lr_scheduler.LambdaLR(
173
+ opt, lambda x: pow(1.0 - x / self.epochs, 0.9)
174
+ )
175
+ return [opt], [sch]
176
+
177
+ def train_dataloader(self):
178
+ return torch.utils.data.DataLoader(
179
+ self.trainset,
180
+ batch_size=self.batch_size,
181
+ shuffle=True,
182
+ num_workers=16,
183
+ worker_init_fn=lambda x: random.seed(time.time() + x),
184
+ )
185
+
186
+ def val_dataloader(self):
187
+ return torch.utils.data.DataLoader(
188
+ self.valset,
189
+ batch_size=self.batch_size,
190
+ shuffle=False,
191
+ num_workers=16,
192
+ )
193
+
194
+ def get_trainset(self, dset, augment=False, **kwargs):
195
+ print(kwargs)
196
+ if augment == True:
197
+ mode = "train_x"
198
+ else:
199
+ mode = "train"
200
+
201
+ print(mode)
202
+ dset = get_dataset(
203
+ dset,
204
+ root=self.data_path,
205
+ split="train",
206
+ mode=mode,
207
+ transform=self.train_transform,
208
+ **kwargs
209
+ )
210
+
211
+ self.num_classes = dset.num_class
212
+ self.train_accuracy = pl.metrics.Accuracy()
213
+
214
+ return dset
215
+
216
+ def get_valset(self, dset, augment=False, **kwargs):
217
+ self.val_accuracy = pl.metrics.Accuracy()
218
+ self.val_iou = SegmentationMetric(self.num_classes)
219
+
220
+ if augment == True:
221
+ mode = "val_x"
222
+ else:
223
+ mode = "val"
224
+
225
+ print(mode)
226
+ return get_dataset(
227
+ dset,
228
+ root=self.data_path,
229
+ split="val",
230
+ mode=mode,
231
+ transform=self.val_transform,
232
+ **kwargs
233
+ )
234
+
235
+
236
+ def get_criterion(self, **kwargs):
237
+ return SegmentationLosses(
238
+ se_loss=kwargs["se_loss"],
239
+ aux=kwargs["aux"],
240
+ nclass=self.num_classes,
241
+ se_weight=kwargs["se_weight"],
242
+ aux_weight=kwargs["aux_weight"],
243
+ ignore_index=kwargs["ignore_index"],
244
+ )
245
+
246
+ @staticmethod
247
+ def add_model_specific_args(parent_parser):
248
+ parser = ArgumentParser(parents=[parent_parser], add_help=False)
249
+ parser.add_argument(
250
+ "--data_path", type=str, help="path where dataset is stored"
251
+ )
252
+ parser.add_argument(
253
+ "--dataset",
254
+ choices=get_available_datasets(),
255
+ default="ade20k",
256
+ help="dataset to train on",
257
+ )
258
+ parser.add_argument(
259
+ "--batch_size", type=int, default=16, help="size of the batches"
260
+ )
261
+ parser.add_argument(
262
+ "--base_lr", type=float, default=0.004, help="learning rate"
263
+ )
264
+ parser.add_argument("--momentum", type=float, default=0.9, help="SGD momentum")
265
+ parser.add_argument(
266
+ "--weight_decay", type=float, default=1e-4, help="weight_decay"
267
+ )
268
+ parser.add_argument(
269
+ "--aux", action="store_true", default=False, help="Auxilary Loss"
270
+ )
271
+ parser.add_argument(
272
+ "--aux-weight",
273
+ type=float,
274
+ default=0.2,
275
+ help="Auxilary loss weight (default: 0.2)",
276
+ )
277
+ parser.add_argument(
278
+ "--se-loss",
279
+ action="store_true",
280
+ default=False,
281
+ help="Semantic Encoding Loss SE-loss",
282
+ )
283
+ parser.add_argument(
284
+ "--se-weight", type=float, default=0.2, help="SE-loss weight (default: 0.2)"
285
+ )
286
+
287
+ parser.add_argument(
288
+ "--midasproto", action="store_true", default=False, help="midasprotocol"
289
+ )
290
+
291
+ parser.add_argument(
292
+ "--ignore_index",
293
+ type=int,
294
+ default=-1,
295
+ help="numeric value of ignore label in gt",
296
+ )
297
+ parser.add_argument(
298
+ "--augment",
299
+ action="store_true",
300
+ default=False,
301
+ help="Use extended augmentations",
302
+ )
303
+
304
+ return parser
modules/models/lseg_blocks.py ADDED
@@ -0,0 +1,359 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from .lseg_vit import (
5
+ _make_pretrained_clip_vitl16_384,
6
+ _make_pretrained_clip_vitb32_384,
7
+ _make_pretrained_clipRN50x16_vitl16_384,
8
+ forward_vit,
9
+ )
10
+
11
+
12
+ def _make_encoder(
13
+ backbone,
14
+ features,
15
+ use_pretrained=True,
16
+ groups=1,
17
+ expand=False,
18
+ exportable=True,
19
+ hooks=None,
20
+ use_vit_only=False,
21
+ use_readout="ignore",
22
+ enable_attention_hooks=False,
23
+ ):
24
+ if backbone == "clip_vitl16_384":
25
+ clip_pretrained, pretrained = _make_pretrained_clip_vitl16_384(
26
+ use_pretrained,
27
+ hooks=hooks,
28
+ use_readout=use_readout,
29
+ enable_attention_hooks=enable_attention_hooks,
30
+ )
31
+ scratch = _make_scratch(
32
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
33
+ )
34
+ elif backbone == "clipRN50x16_vitl16_384":
35
+ clip_pretrained, pretrained = _make_pretrained_clipRN50x16_vitl16_384(
36
+ use_pretrained,
37
+ hooks=hooks,
38
+ use_readout=use_readout,
39
+ enable_attention_hooks=enable_attention_hooks,
40
+ )
41
+ scratch = _make_scratch(
42
+ [256, 512, 1024, 1024], features, groups=groups, expand=expand
43
+ )
44
+ elif backbone == "clip_vitb32_384":
45
+ clip_pretrained, pretrained = _make_pretrained_clip_vitb32_384(
46
+ use_pretrained,
47
+ hooks=hooks,
48
+ use_readout=use_readout,
49
+ )
50
+ scratch = _make_scratch(
51
+ [96, 192, 384, 768], features, groups=groups, expand=expand
52
+ )
53
+ else:
54
+ print(f"Backbone '{backbone}' not implemented")
55
+ assert False
56
+
57
+ return clip_pretrained, pretrained, scratch
58
+
59
+
60
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
61
+ scratch = nn.Module()
62
+
63
+ out_shape1 = out_shape
64
+ out_shape2 = out_shape
65
+ out_shape3 = out_shape
66
+ out_shape4 = out_shape
67
+ if expand == True:
68
+ out_shape1 = out_shape
69
+ out_shape2 = out_shape * 2
70
+ out_shape3 = out_shape * 4
71
+ out_shape4 = out_shape * 8
72
+
73
+ scratch.layer1_rn = nn.Conv2d(
74
+ in_shape[0],
75
+ out_shape1,
76
+ kernel_size=3,
77
+ stride=1,
78
+ padding=1,
79
+ bias=False,
80
+ groups=groups,
81
+ )
82
+ scratch.layer2_rn = nn.Conv2d(
83
+ in_shape[1],
84
+ out_shape2,
85
+ kernel_size=3,
86
+ stride=1,
87
+ padding=1,
88
+ bias=False,
89
+ groups=groups,
90
+ )
91
+ scratch.layer3_rn = nn.Conv2d(
92
+ in_shape[2],
93
+ out_shape3,
94
+ kernel_size=3,
95
+ stride=1,
96
+ padding=1,
97
+ bias=False,
98
+ groups=groups,
99
+ )
100
+ scratch.layer4_rn = nn.Conv2d(
101
+ in_shape[3],
102
+ out_shape4,
103
+ kernel_size=3,
104
+ stride=1,
105
+ padding=1,
106
+ bias=False,
107
+ groups=groups,
108
+ )
109
+
110
+ return scratch
111
+
112
+
113
+ class Interpolate(nn.Module):
114
+ """Interpolation module."""
115
+
116
+ def __init__(self, scale_factor, mode, align_corners=False):
117
+ """Init.
118
+
119
+ Args:
120
+ scale_factor (float): scaling
121
+ mode (str): interpolation mode
122
+ """
123
+ super(Interpolate, self).__init__()
124
+
125
+ self.interp = nn.functional.interpolate
126
+ self.scale_factor = scale_factor
127
+ self.mode = mode
128
+ self.align_corners = align_corners
129
+
130
+ def forward(self, x):
131
+ """Forward pass.
132
+
133
+ Args:
134
+ x (tensor): input
135
+
136
+ Returns:
137
+ tensor: interpolated data
138
+ """
139
+
140
+ x = self.interp(
141
+ x,
142
+ scale_factor=self.scale_factor,
143
+ mode=self.mode,
144
+ align_corners=self.align_corners,
145
+ )
146
+
147
+ return x
148
+
149
+
150
+ class ResidualConvUnit(nn.Module):
151
+ """Residual convolution module."""
152
+
153
+ def __init__(self, features):
154
+ """Init.
155
+
156
+ Args:
157
+ features (int): number of features
158
+ """
159
+ super().__init__()
160
+
161
+ self.conv1 = nn.Conv2d(
162
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
163
+ )
164
+
165
+ self.conv2 = nn.Conv2d(
166
+ features, features, kernel_size=3, stride=1, padding=1, bias=True
167
+ )
168
+
169
+ self.relu = nn.ReLU(inplace=True)
170
+
171
+ def forward(self, x):
172
+ """Forward pass.
173
+
174
+ Args:
175
+ x (tensor): input
176
+
177
+ Returns:
178
+ tensor: output
179
+ """
180
+ out = self.relu(x)
181
+ out = self.conv1(out)
182
+ out = self.relu(out)
183
+ out = self.conv2(out)
184
+
185
+ return out + x
186
+
187
+
188
+ class FeatureFusionBlock(nn.Module):
189
+ """Feature fusion block."""
190
+
191
+ def __init__(self, features):
192
+ """Init.
193
+
194
+ Args:
195
+ features (int): number of features
196
+ """
197
+ super(FeatureFusionBlock, self).__init__()
198
+
199
+ self.resConfUnit1 = ResidualConvUnit(features)
200
+ self.resConfUnit2 = ResidualConvUnit(features)
201
+
202
+ def forward(self, *xs):
203
+ """Forward pass.
204
+
205
+ Returns:
206
+ tensor: output
207
+ """
208
+ output = xs[0]
209
+
210
+ if len(xs) == 2:
211
+ output += self.resConfUnit1(xs[1])
212
+
213
+ output = self.resConfUnit2(output)
214
+
215
+ output = nn.functional.interpolate(
216
+ output, scale_factor=2, mode="bilinear", align_corners=True
217
+ )
218
+
219
+ return output
220
+
221
+
222
+ class ResidualConvUnit_custom(nn.Module):
223
+ """Residual convolution module."""
224
+
225
+ def __init__(self, features, activation, bn):
226
+ """Init.
227
+
228
+ Args:
229
+ features (int): number of features
230
+ """
231
+ super().__init__()
232
+
233
+ self.bn = bn
234
+
235
+ self.groups = 1
236
+
237
+ self.conv1 = nn.Conv2d(
238
+ features,
239
+ features,
240
+ kernel_size=3,
241
+ stride=1,
242
+ padding=1,
243
+ bias=not self.bn,
244
+ groups=self.groups,
245
+ )
246
+
247
+ self.conv2 = nn.Conv2d(
248
+ features,
249
+ features,
250
+ kernel_size=3,
251
+ stride=1,
252
+ padding=1,
253
+ bias=not self.bn,
254
+ groups=self.groups,
255
+ )
256
+
257
+ if self.bn == True:
258
+ self.bn1 = nn.BatchNorm2d(features)
259
+ self.bn2 = nn.BatchNorm2d(features)
260
+
261
+ self.activation = activation
262
+
263
+ self.skip_add = nn.quantized.FloatFunctional()
264
+
265
+ def forward(self, x):
266
+ """Forward pass.
267
+
268
+ Args:
269
+ x (tensor): input
270
+
271
+ Returns:
272
+ tensor: output
273
+ """
274
+
275
+ out = self.activation(x)
276
+ out = self.conv1(out)
277
+ if self.bn == True:
278
+ out = self.bn1(out)
279
+
280
+ out = self.activation(out)
281
+ out = self.conv2(out)
282
+ if self.bn == True:
283
+ out = self.bn2(out)
284
+
285
+ if self.groups > 1:
286
+ out = self.conv_merge(out)
287
+
288
+ return self.skip_add.add(out, x)
289
+
290
+ # return out + x
291
+
292
+
293
+ class FeatureFusionBlock_custom(nn.Module):
294
+ """Feature fusion block."""
295
+
296
+ def __init__(
297
+ self,
298
+ features,
299
+ activation,
300
+ deconv=False,
301
+ bn=False,
302
+ expand=False,
303
+ align_corners=True,
304
+ ):
305
+ """Init.
306
+
307
+ Args:
308
+ features (int): number of features
309
+ """
310
+ super(FeatureFusionBlock_custom, self).__init__()
311
+
312
+ self.deconv = deconv
313
+ self.align_corners = align_corners
314
+
315
+ self.groups = 1
316
+
317
+ self.expand = expand
318
+ out_features = features
319
+ if self.expand == True:
320
+ out_features = features // 2
321
+
322
+ self.out_conv = nn.Conv2d(
323
+ features,
324
+ out_features,
325
+ kernel_size=1,
326
+ stride=1,
327
+ padding=0,
328
+ bias=True,
329
+ groups=1,
330
+ )
331
+
332
+ self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
333
+ self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
334
+
335
+ self.skip_add = nn.quantized.FloatFunctional()
336
+
337
+ def forward(self, *xs):
338
+ """Forward pass.
339
+
340
+ Returns:
341
+ tensor: output
342
+ """
343
+ output = xs[0]
344
+
345
+ if len(xs) == 2:
346
+ res = self.resConfUnit1(xs[1])
347
+ output = self.skip_add.add(output, res)
348
+ # output += res
349
+
350
+ output = self.resConfUnit2(output)
351
+
352
+ output = nn.functional.interpolate(
353
+ output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
354
+ )
355
+
356
+ output = self.out_conv(output)
357
+
358
+ return output
359
+
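For orientation, here is a brief usage sketch of the fusion block defined above. It is not part of the commit; it assumes the file is importable as modules.models.lseg_blocks (the import in lseg_net.py below points there) and uses illustrative tensor sizes.

import torch
import torch.nn as nn
from modules.models.lseg_blocks import FeatureFusionBlock_custom

# 256-channel fusion block without batch norm, matching _make_fusion_block in lseg_net.py below
block = FeatureFusionBlock_custom(256, nn.ReLU(False), bn=False, align_corners=True)

coarse = torch.randn(1, 256, 15, 15)  # e.g. the deepest reassembled feature map
skip = torch.randn(1, 256, 30, 30)    # hypothetical finer-scale skip feature

out = block(block(coarse), skip)      # each call upsamples by 2x; the second call also fuses the skip
print(out.shape)                      # torch.Size([1, 256, 60, 60])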
modules/models/lseg_net.py ADDED
@@ -0,0 +1,231 @@
1
+ import math
2
+ import types
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from .lseg_blocks import FeatureFusionBlock, Interpolate, _make_encoder, FeatureFusionBlock_custom, forward_vit
9
+ import clip
10
+ import numpy as np
11
+ import pandas as pd
12
+ import os
13
+
14
+ class depthwise_clipseg_conv(nn.Module):
15
+ def __init__(self):
16
+ super(depthwise_clipseg_conv, self).__init__()
17
+ self.depthwise = nn.Conv2d(1, 1, kernel_size=3, padding=1)
18
+
19
+ def depthwise_clipseg(self, x, channels):
20
+ x = torch.cat([self.depthwise(x[:, i].unsqueeze(1)) for i in range(channels)], dim=1)
21
+ return x
22
+
23
+ def forward(self, x):
24
+ channels = x.shape[1]
25
+ out = self.depthwise_clipseg(x, channels)
26
+ return out
27
+
28
+
29
+ class depthwise_conv(nn.Module):
30
+ def __init__(self, kernel_size=3, stride=1, padding=1):
31
+ super(depthwise_conv, self).__init__()
32
+ self.depthwise = nn.Conv2d(1, 1, kernel_size=kernel_size, stride=stride, padding=padding)
33
+
34
+ def forward(self, x):
35
+ # support for 4D tensor with NCHW
36
+ C, H, W = x.shape[1:]
37
+ x = x.reshape(-1, 1, H, W)
38
+ x = self.depthwise(x)
39
+ x = x.view(-1, C, H, W)
40
+ return x
41
+
42
+
43
+ class depthwise_block(nn.Module):
44
+ def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
45
+ super(depthwise_block, self).__init__()
46
+ self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
47
+ if activation == 'relu':
48
+ self.activation = nn.ReLU()
49
+ elif activation == 'lrelu':
50
+ self.activation = nn.LeakyReLU()
51
+ elif activation == 'tanh':
52
+ self.activation = nn.Tanh()
53
+
54
+ def forward(self, x, act=True):
55
+ x = self.depthwise(x)
56
+ if act:
57
+ x = self.activation(x)
58
+ return x
59
+
60
+
61
+ class bottleneck_block(nn.Module):
62
+ def __init__(self, kernel_size=3, stride=1, padding=1, activation='relu'):
63
+ super(bottleneck_block, self).__init__()
64
+ self.depthwise = depthwise_conv(kernel_size=3, stride=1, padding=1)
65
+ if activation == 'relu':
66
+ self.activation = nn.ReLU()
67
+ elif activation == 'lrelu':
68
+ self.activation = nn.LeakyReLU()
69
+ elif activation == 'tanh':
70
+ self.activation = nn.Tanh()
71
+
72
+
73
+ def forward(self, x, act=True):
74
+ sum_layer = x.max(dim=1, keepdim=True)[0]
75
+ x = self.depthwise(x)
76
+ x = x + sum_layer
77
+ if act:
78
+ x = self.activation(x)
79
+ return x
80
+
81
+ class BaseModel(torch.nn.Module):
82
+ def load(self, path):
83
+ """Load model from file.
84
+ Args:
85
+ path (str): file path
86
+ """
87
+ parameters = torch.load(path, map_location=torch.device("cpu"))
88
+
89
+ if "optimizer" in parameters:
90
+ parameters = parameters["model"]
91
+
92
+ self.load_state_dict(parameters)
93
+
94
+ def _make_fusion_block(features, use_bn):
95
+ return FeatureFusionBlock_custom(
96
+ features,
97
+ activation=nn.ReLU(False),
98
+ deconv=False,
99
+ bn=use_bn,
100
+ expand=False,
101
+ align_corners=True,
102
+ )
103
+
104
+ class LSeg(BaseModel):
105
+ def __init__(
106
+ self,
107
+ head,
108
+ features=256,
109
+ backbone="clip_vitl16_384",
110
+ readout="project",
111
+ channels_last=False,
112
+ use_bn=False,
113
+ **kwargs,
114
+ ):
115
+ super(LSeg, self).__init__()
116
+
117
+ self.channels_last = channels_last
118
+
119
+ hooks = {
120
+ "clip_vitl16_384": [5, 11, 17, 23],
121
+ "clipRN50x16_vitl16_384": [5, 11, 17, 23],
122
+ "clip_vitb32_384": [2, 5, 8, 11],
123
+ }
124
+
125
+ # Instantiate backbone and reassemble blocks
126
+ self.clip_pretrained, self.pretrained, self.scratch = _make_encoder(
127
+ backbone,
128
+ features,
129
+ groups=1,
130
+ expand=False,
131
+ exportable=False,
132
+ hooks=hooks[backbone],
133
+ use_readout=readout,
134
+ )
135
+
136
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
137
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
138
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
139
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
140
+
141
+ self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)).exp()
142
+ if backbone in ["clipRN50x16_vitl16_384"]:
143
+ self.out_c = 768
144
+ else:
145
+ self.out_c = 512
146
+ self.scratch.head1 = nn.Conv2d(features, self.out_c, kernel_size=1)
147
+
148
+ self.arch_option = kwargs["arch_option"]
149
+ if self.arch_option == 1:
150
+ self.scratch.head_block = bottleneck_block(activation=kwargs["activation"])
151
+ self.block_depth = kwargs['block_depth']
152
+ elif self.arch_option == 2:
153
+ self.scratch.head_block = depthwise_block(activation=kwargs["activation"])
154
+ self.block_depth = kwargs['block_depth']
155
+
156
+ self.scratch.output_conv = head
157
+
158
+ self.text = clip.tokenize(self.labels)
159
+
160
+ def forward(self, x, labelset=''):
161
+ if labelset == '':
162
+ text = self.text
163
+ else:
164
+ text = clip.tokenize(labelset)
165
+
166
+ if self.channels_last == True:
167
+ x.contiguous(memory_format=torch.channels_last)
168
+
169
+ layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
170
+
171
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
172
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
173
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
174
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
175
+
176
+ path_4 = self.scratch.refinenet4(layer_4_rn)
177
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
178
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
179
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
180
+
181
+ text = text.to(x.device)
182
+ self.logit_scale = self.logit_scale.to(x.device)
183
+ text_features = self.clip_pretrained.encode_text(text)
184
+
185
+ image_features = self.scratch.head1(path_1)
186
+
187
+ imshape = image_features.shape
188
+ image_features = image_features.permute(0,2,3,1).reshape(-1, self.out_c)
189
+
190
+ # normalized features
191
+ image_features = image_features / image_features.norm(dim=-1, keepdim=True)
192
+ text_features = text_features / text_features.norm(dim=-1, keepdim=True)
193
+
194
+ logits_per_image = self.logit_scale * image_features.half() @ text_features.t()
195
+
196
+ out = logits_per_image.float().view(imshape[0], imshape[2], imshape[3], -1).permute(0,3,1,2)
197
+
198
+ if self.arch_option in [1, 2]:
199
+ for _ in range(self.block_depth - 1):
200
+ out = self.scratch.head_block(out)
201
+ out = self.scratch.head_block(out, False)
202
+
203
+ out = self.scratch.output_conv(out)
204
+
205
+ return out
206
+
207
+
208
+ class LSegNet(LSeg):
209
+ """Network for semantic segmentation."""
210
+ def __init__(self, labels, path=None, scale_factor=0.5, crop_size=480, **kwargs):
211
+
212
+ features = kwargs["features"] if "features" in kwargs else 256
213
+ kwargs["use_bn"] = True
214
+
215
+ self.crop_size = crop_size
216
+ self.scale_factor = scale_factor
217
+ self.labels = labels
218
+
219
+ head = nn.Sequential(
220
+ Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
221
+ )
222
+
223
+ super().__init__(head, **kwargs)
224
+
225
+ if path is not None:
226
+ self.load(path)
227
+
228
+
229
+
230
+
231
+
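A minimal, hedged sketch of constructing the network above. It is not part of the commit and assumes a CUDA device (the encoder loads CLIP onto 'cuda'), the clip and timm packages, and an example label list; pretrained weights are fetched on first use.

import torch
from modules.models.lseg_net import LSegNet

net = LSegNet(
    labels=["other", "chair", "table"],  # illustrative label set, not from the commit
    backbone="clip_vitb32_384",
    features=256,
    arch_option=0,
    block_depth=0,
    activation="lrelu",
).cuda().eval()

with torch.no_grad():
    out = net(torch.randn(1, 3, 480, 480, device="cuda"))
print(out.shape)  # one channel per label at the input resolution, e.g. (1, 3, 480, 480)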
modules/models/lseg_vit.py ADDED
@@ -0,0 +1,535 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import timm
4
+ import types
5
+ import math
6
+ import torch.nn.functional as F
7
+ import clip
8
+
9
+ activations = {}
10
+
11
+
12
+ def get_activation(name):
13
+ def hook(model, input, output):
14
+ activations[name] = output
15
+
16
+ return hook
17
+
18
+
19
+ attention = {}
20
+
21
+
22
+ def get_attention(name):
23
+ def hook(module, input, output):
24
+ x = input[0]
25
+ B, N, C = x.shape
26
+ qkv = (
27
+ module.qkv(x)
28
+ .reshape(B, N, 3, module.num_heads, C // module.num_heads)
29
+ .permute(2, 0, 3, 1, 4)
30
+ )
31
+ q, k, v = (
32
+ qkv[0],
33
+ qkv[1],
34
+ qkv[2],
35
+ ) # make torchscript happy (cannot use tensor as tuple)
36
+
37
+ attn = (q @ k.transpose(-2, -1)) * module.scale
38
+
39
+ attn = attn.softmax(dim=-1) # [:,:,1,1:]
40
+ attention[name] = attn
41
+
42
+ return hook
43
+
44
+
45
+ def get_mean_attention_map(attn, token, shape):
46
+ attn = attn[:, :, token, 1:]
47
+ attn = attn.unflatten(2, torch.Size([shape[2] // 16, shape[3] // 16])).float()
48
+ attn = torch.nn.functional.interpolate(
49
+ attn, size=shape[2:], mode="bicubic", align_corners=False
50
+ ).squeeze(0)
51
+
52
+ all_attn = torch.mean(attn, 0)
53
+
54
+ return all_attn
55
+
56
+
57
+ class Slice(nn.Module):
58
+ def __init__(self, start_index=1):
59
+ super(Slice, self).__init__()
60
+ self.start_index = start_index
61
+
62
+ def forward(self, x):
63
+ return x[:, self.start_index :]
64
+
65
+
66
+ class AddReadout(nn.Module):
67
+ def __init__(self, start_index=1):
68
+ super(AddReadout, self).__init__()
69
+ self.start_index = start_index
70
+
71
+ def forward(self, x):
72
+ if self.start_index == 2:
73
+ readout = (x[:, 0] + x[:, 1]) / 2
74
+ else:
75
+ readout = x[:, 0]
76
+ return x[:, self.start_index :] + readout.unsqueeze(1)
77
+
78
+
79
+ class ProjectReadout(nn.Module):
80
+ def __init__(self, in_features, start_index=1):
81
+ super(ProjectReadout, self).__init__()
82
+ self.start_index = start_index
83
+
84
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
85
+
86
+ def forward(self, x):
87
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
88
+ features = torch.cat((x[:, self.start_index :], readout), -1)
89
+
90
+ return self.project(features)
91
+
92
+
93
+ class Transpose(nn.Module):
94
+ def __init__(self, dim0, dim1):
95
+ super(Transpose, self).__init__()
96
+ self.dim0 = dim0
97
+ self.dim1 = dim1
98
+
99
+ def forward(self, x):
100
+ x = x.transpose(self.dim0, self.dim1)
101
+ return x
102
+
103
+
104
+ def forward_vit(pretrained, x):
105
+ b, c, h, w = x.shape
106
+
107
+ # encoder
108
+ glob = pretrained.model.forward_flex(x)
109
+
110
+ layer_1 = pretrained.activations["1"]
111
+ layer_2 = pretrained.activations["2"]
112
+ layer_3 = pretrained.activations["3"]
113
+ layer_4 = pretrained.activations["4"]
114
+
115
+ layer_1 = pretrained.act_postprocess1[0:2](layer_1)
116
+ layer_2 = pretrained.act_postprocess2[0:2](layer_2)
117
+ layer_3 = pretrained.act_postprocess3[0:2](layer_3)
118
+ layer_4 = pretrained.act_postprocess4[0:2](layer_4)
119
+
120
+ unflatten = nn.Sequential(
121
+ nn.Unflatten(
122
+ 2,
123
+ torch.Size(
124
+ [
125
+ h // pretrained.model.patch_size[1],
126
+ w // pretrained.model.patch_size[0],
127
+ ]
128
+ ),
129
+ )
130
+ )
131
+
132
+ if layer_1.ndim == 3:
133
+ layer_1 = unflatten(layer_1)
134
+ if layer_2.ndim == 3:
135
+ layer_2 = unflatten(layer_2)
136
+ if layer_3.ndim == 3:
137
+ layer_3 = unflatten(layer_3)
138
+ if layer_4.ndim == 3:
139
+ layer_4 = unflatten(layer_4)
140
+
141
+ layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
142
+ layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
143
+ layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
144
+ layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
145
+
146
+ return layer_1, layer_2, layer_3, layer_4
147
+
148
+
149
+ def _resize_pos_embed(self, posemb, gs_h, gs_w):
150
+ posemb_tok, posemb_grid = (
151
+ posemb[:, : self.start_index],
152
+ posemb[0, self.start_index :],
153
+ )
154
+
155
+ gs_old = int(math.sqrt(len(posemb_grid)))
156
+
157
+ posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
158
+ posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
159
+ posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
160
+
161
+ posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
162
+
163
+ return posemb
164
+
165
+
166
+ def forward_flex(self, x):
167
+ b, c, h, w = x.shape
168
+
169
+ pos_embed = self._resize_pos_embed(
170
+ self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
171
+ )
172
+
173
+ B = x.shape[0]
174
+
175
+ if hasattr(self.patch_embed, "backbone"):
176
+ x = self.patch_embed.backbone(x)
177
+ if isinstance(x, (list, tuple)):
178
+ x = x[-1] # last feature if backbone outputs list/tuple of features
179
+ x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
180
+
181
+ if getattr(self, "dist_token", None) is not None:
182
+ cls_tokens = self.cls_token.expand(
183
+ B, -1, -1
184
+ ) # stole cls_tokens impl from Phil Wang, thanks
185
+ dist_token = self.dist_token.expand(B, -1, -1)
186
+ x = torch.cat((cls_tokens, dist_token, x), dim=1)
187
+ else:
188
+ cls_tokens = self.cls_token.expand(
189
+ B, -1, -1
190
+ ) # stole cls_tokens impl from Phil Wang, thanks
191
+ x = torch.cat((cls_tokens, x), dim=1)
192
+
193
+ x = x + pos_embed
194
+ x = self.pos_drop(x)
195
+
196
+ for blk in self.blocks:
197
+ x = blk(x)
198
+
199
+ x = self.norm(x)
200
+
201
+ return x
202
+
203
+
204
+ def get_readout_oper(vit_features, features, use_readout, start_index=1):
205
+ if use_readout == "ignore":
206
+ readout_oper = [Slice(start_index)] * len(features)
207
+ elif use_readout == "add":
208
+ readout_oper = [AddReadout(start_index)] * len(features)
209
+ elif use_readout == "project":
210
+ readout_oper = [
211
+ ProjectReadout(vit_features, start_index) for out_feat in features
212
+ ]
213
+ else:
214
+ assert (
215
+ False
216
+ ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
217
+
218
+ return readout_oper
219
+
220
+
221
+ def _make_pretrained_clip_vitl16_384(
222
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
223
+ ):
224
+ clip_pretrained, _ = clip.load("ViT-B/32", device='cuda', jit=False)
225
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
226
+
227
+ hooks = [5, 11, 17, 23] if hooks == None else hooks
228
+
229
+ pretrained = _make_vit_b16_backbone(
230
+ model,
231
+ features=[256, 512, 1024, 1024],
232
+ hooks=hooks,
233
+ vit_features=1024,
234
+ use_readout=use_readout,
235
+ enable_attention_hooks=enable_attention_hooks,
236
+ )
237
+ return clip_pretrained, pretrained
238
+
239
+
240
+ def _make_pretrained_clipRN50x16_vitl16_384(
241
+ pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False
242
+ ):
243
+ clip_pretrained, _ = clip.load("RN50x16", device='cuda', jit=False)
244
+ model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
245
+
246
+ hooks = [5, 11, 17, 23] if hooks == None else hooks
247
+
248
+ pretrained = _make_vit_b16_backbone(
249
+ model,
250
+ features=[256, 512, 1024, 1024],
251
+ hooks=hooks,
252
+ vit_features=1024,
253
+ use_readout=use_readout,
254
+ enable_attention_hooks=enable_attention_hooks,
255
+ )
256
+ return clip_pretrained, pretrained
257
+
258
+
259
+ def _make_pretrained_clip_vitb32_384(pretrained, use_readout="ignore", hooks=None, enable_attention_hooks=False):
260
+ clip_pretrained, _ = clip.load("ViT-B/32", device='cuda', jit=False)
261
+ model = timm.create_model("vit_base_patch32_384", pretrained=pretrained)
262
+
263
+ hooks = [2, 5, 8, 11] if hooks == None else hooks
264
+
265
+ pretrained = _make_vit_b32_backbone(
266
+ model,
267
+ features=[96, 192, 384, 768],
268
+ hooks=hooks,
269
+ use_readout=use_readout,
270
+ enable_attention_hooks=False,
271
+ )
272
+ return clip_pretrained, pretrained
273
+
274
+
275
+ def _make_vit_b32_backbone(
276
+ model,
277
+ features=[96, 192, 384, 768],
278
+ size=[384, 384],
279
+ hooks=[2, 5, 8, 11],
280
+ vit_features=768,
281
+ use_readout="ignore",
282
+ start_index=1,
283
+ enable_attention_hooks=False,
284
+ ):
285
+ pretrained = nn.Module()
286
+
287
+ pretrained.model = model
288
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
289
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
290
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
291
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
292
+
293
+ pretrained.activations = activations
294
+
295
+ pretrained.model.patch_size = [32, 32]
296
+ pretrained.model.start_index = start_index
297
+
298
+ if enable_attention_hooks:
299
+ pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
300
+ get_attention("attn_1")
301
+ )
302
+ pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
303
+ get_attention("attn_2")
304
+ )
305
+ pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
306
+ get_attention("attn_3")
307
+ )
308
+ pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
309
+ get_attention("attn_4")
310
+ )
311
+ pretrained.attention = attention
312
+
313
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
314
+
315
+ pretrained.act_postprocess1 = nn.Sequential(
316
+ readout_oper[0],
317
+ Transpose(1, 2),
318
+ nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])),
319
+ nn.Conv2d(
320
+ in_channels=vit_features,
321
+ out_channels=features[0],
322
+ kernel_size=1,
323
+ stride=1,
324
+ padding=0,
325
+ ),
326
+ nn.ConvTranspose2d(
327
+ in_channels=features[0],
328
+ out_channels=features[0],
329
+ kernel_size=8,
330
+ stride=8,
331
+ padding=0,
332
+ bias=True,
333
+ dilation=1,
334
+ groups=1,
335
+ ),
336
+ )
337
+
338
+ pretrained.act_postprocess2 = nn.Sequential(
339
+ readout_oper[1],
340
+ Transpose(1, 2),
341
+ nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])),
342
+ nn.Conv2d(
343
+ in_channels=vit_features,
344
+ out_channels=features[1],
345
+ kernel_size=1,
346
+ stride=1,
347
+ padding=0,
348
+ ),
349
+ nn.ConvTranspose2d(
350
+ in_channels=features[1],
351
+ out_channels=features[1],
352
+ kernel_size=4,
353
+ stride=4,
354
+ padding=0,
355
+ bias=True,
356
+ dilation=1,
357
+ groups=1,
358
+ ),
359
+ )
360
+
361
+ pretrained.act_postprocess3 = nn.Sequential(
362
+ readout_oper[2],
363
+ Transpose(1, 2),
364
+ nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])),
365
+ nn.Conv2d(
366
+ in_channels=vit_features,
367
+ out_channels=features[2],
368
+ kernel_size=1,
369
+ stride=1,
370
+ padding=0,
371
+ ),
372
+ nn.ConvTranspose2d(
373
+ in_channels=features[2],
374
+ out_channels=features[2],
375
+ kernel_size=2,
376
+ stride=2,
377
+ padding=0,
378
+ # output_padding=output_padding,
379
+ bias=True,
380
+ dilation=1,
381
+ groups=1,
382
+ ),
383
+ )
384
+
385
+ pretrained.act_postprocess4 = nn.Sequential(
386
+ readout_oper[3],
387
+ Transpose(1, 2),
388
+ nn.Unflatten(2, torch.Size([size[0] // pretrained.model.patch_size[1], size[1] // pretrained.model.patch_size[0]])),
389
+ nn.Conv2d(
390
+ in_channels=vit_features,
391
+ out_channels=features[3],
392
+ kernel_size=1,
393
+ stride=1,
394
+ padding=0,
395
+ ),
396
+ )
397
+
398
+ # We inject this function into the VisionTransformer instances so that
399
+ # we can use it with interpolated position embeddings without modifying the library source.
400
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
401
+ pretrained.model._resize_pos_embed = types.MethodType(
402
+ _resize_pos_embed, pretrained.model
403
+ )
404
+
405
+ return pretrained
406
+
407
+
408
+ def _make_vit_b16_backbone(
409
+ model,
410
+ features=[96, 192, 384, 768],
411
+ size=[384, 384],
412
+ hooks=[2, 5, 8, 11],
413
+ vit_features=768,
414
+ use_readout="ignore",
415
+ start_index=1,
416
+ enable_attention_hooks=False,
417
+ ):
418
+ pretrained = nn.Module()
419
+
420
+ pretrained.model = model
421
+ pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
422
+ pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
423
+ pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
424
+ pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
425
+
426
+ pretrained.activations = activations
427
+
428
+ if enable_attention_hooks:
429
+ pretrained.model.blocks[hooks[0]].attn.register_forward_hook(
430
+ get_attention("attn_1")
431
+ )
432
+ pretrained.model.blocks[hooks[1]].attn.register_forward_hook(
433
+ get_attention("attn_2")
434
+ )
435
+ pretrained.model.blocks[hooks[2]].attn.register_forward_hook(
436
+ get_attention("attn_3")
437
+ )
438
+ pretrained.model.blocks[hooks[3]].attn.register_forward_hook(
439
+ get_attention("attn_4")
440
+ )
441
+ pretrained.attention = attention
442
+
443
+ readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
444
+
445
+ # 32, 48, 136, 384
446
+ pretrained.act_postprocess1 = nn.Sequential(
447
+ readout_oper[0],
448
+ Transpose(1, 2),
449
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
450
+ nn.Conv2d(
451
+ in_channels=vit_features,
452
+ out_channels=features[0],
453
+ kernel_size=1,
454
+ stride=1,
455
+ padding=0,
456
+ ),
457
+ nn.ConvTranspose2d(
458
+ in_channels=features[0],
459
+ out_channels=features[0],
460
+ kernel_size=4,
461
+ stride=4,
462
+ padding=0,
463
+ bias=True,
464
+ dilation=1,
465
+ groups=1,
466
+ ),
467
+ )
468
+
469
+ pretrained.act_postprocess2 = nn.Sequential(
470
+ readout_oper[1],
471
+ Transpose(1, 2),
472
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
473
+ nn.Conv2d(
474
+ in_channels=vit_features,
475
+ out_channels=features[1],
476
+ kernel_size=1,
477
+ stride=1,
478
+ padding=0,
479
+ ),
480
+ nn.ConvTranspose2d(
481
+ in_channels=features[1],
482
+ out_channels=features[1],
483
+ kernel_size=2,
484
+ stride=2,
485
+ padding=0,
486
+ bias=True,
487
+ dilation=1,
488
+ groups=1,
489
+ ),
490
+ )
491
+
492
+ pretrained.act_postprocess3 = nn.Sequential(
493
+ readout_oper[2],
494
+ Transpose(1, 2),
495
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
496
+ nn.Conv2d(
497
+ in_channels=vit_features,
498
+ out_channels=features[2],
499
+ kernel_size=1,
500
+ stride=1,
501
+ padding=0,
502
+ ),
503
+ )
504
+
505
+ pretrained.act_postprocess4 = nn.Sequential(
506
+ readout_oper[3],
507
+ Transpose(1, 2),
508
+ nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
509
+ nn.Conv2d(
510
+ in_channels=vit_features,
511
+ out_channels=features[3],
512
+ kernel_size=1,
513
+ stride=1,
514
+ padding=0,
515
+ ),
516
+ nn.Conv2d(
517
+ in_channels=features[3],
518
+ out_channels=features[3],
519
+ kernel_size=3,
520
+ stride=2,
521
+ padding=1,
522
+ ),
523
+ )
524
+
525
+ pretrained.model.start_index = start_index
526
+ pretrained.model.patch_size = [16, 16]
527
+
528
+ # We inject this function into the VisionTransformer instances so that
529
+ # we can use it with interpolated position embeddings without modifying the library source.
530
+ pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
531
+ pretrained.model._resize_pos_embed = types.MethodType(
532
+ _resize_pos_embed, pretrained.model
533
+ )
534
+
535
+ return pretrained
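An illustrative sketch, not from the commit, of driving the hook-based backbone above: build the ViT-B/32 variant and inspect the four reassembled feature maps. It assumes CUDA is available (clip.load above is pinned to 'cuda') and that timm/CLIP weights can be downloaded.

import torch
from modules.models.lseg_vit import _make_pretrained_clip_vitb32_384, forward_vit

clip_pretrained, pretrained = _make_pretrained_clip_vitb32_384(
    pretrained=True, use_readout="project", hooks=[2, 5, 8, 11]
)
pretrained = pretrained.cuda().eval()

with torch.no_grad():
    layers = forward_vit(pretrained, torch.randn(1, 3, 384, 384, device="cuda"))
for name, feat in zip(["layer_1", "layer_2", "layer_3", "layer_4"], layers):
    print(name, tuple(feat.shape))  # channel widths 96/192/384/768 at strides 4/8/16/32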
prepare_ade20k.py ADDED
@@ -0,0 +1,45 @@
1
+ # +
2
+ # revised from https://github.com/zhanghang1989/PyTorch-Encoding/blob/331ecdd5306104614cb414b16fbcd9d1a8d40e1e/scripts/prepare_ade20k.py
3
+
4
+ """Prepare ADE20K dataset"""
5
+ import os
6
+ import shutil
7
+ import argparse
8
+ import zipfile
9
+ from encoding.utils import download, mkdir
10
+ # -
11
+
12
+ _TARGET_DIR = os.path.expanduser('../datasets/')
13
+
14
+ def parse_args():
15
+ parser = argparse.ArgumentParser(
16
+ description='Initialize ADE20K dataset.',
17
+ epilog='Example: python prepare_ade20k.py',
18
+ formatter_class=argparse.ArgumentDefaultsHelpFormatter)
19
+ parser.add_argument('--download-dir', default=None, help='dataset directory on disk')
20
+ args = parser.parse_args()
21
+ return args
22
+
23
+ def download_ade(path, overwrite=False):
24
+ _AUG_DOWNLOAD_URLS = [
25
+ ('http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip', '219e1696abb36c8ba3a3afe7fb2f4b4606a897c7'),
26
+ ('http://data.csail.mit.edu/places/ADEchallenge/release_test.zip', 'e05747892219d10e9243933371a497e905a4860c'),]
27
+ download_dir = path
28
+ mkdir(download_dir)
29
+ for url, checksum in _AUG_DOWNLOAD_URLS:
30
+ filename = download(url, path=download_dir, overwrite=overwrite, sha1_hash=checksum)
31
+ # extract
32
+ with zipfile.ZipFile(filename,"r") as zip_ref:
33
+ zip_ref.extractall(path=path)
34
+
35
+
36
+ if __name__ == '__main__':
37
+ args = parse_args()
38
+ mkdir(os.path.expanduser('../datasets/'))
39
+ if args.download_dir is not None:
40
+ if os.path.isdir(_TARGET_DIR):
41
+ os.remove(_TARGET_DIR)
42
+ # make symlink
43
+ os.symlink(args.download_dir, _TARGET_DIR)
44
+ else:
45
+ download_ade(_TARGET_DIR, overwrite=False)
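For reference, a standard-library sketch (not part of the commit) of the checksum-then-extract step that encoding.utils.download is given the sha1_hash values for above; the archive path is an assumption derived from _TARGET_DIR.

import hashlib
import zipfile

def sha1_of(path, chunk=1 << 20):
    # stream the file so large archives do not need to fit in memory
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk), b""):
            h.update(block)
    return h.hexdigest()

archive = "../datasets/ADEChallengeData2016.zip"  # assumed download location
expected = "219e1696abb36c8ba3a3afe7fb2f4b4606a897c7"
if sha1_of(archive) == expected:
    with zipfile.ZipFile(archive, "r") as zip_ref:
        zip_ref.extractall(path="../datasets/")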
test_lseg.py ADDED
@@ -0,0 +1,436 @@
1
+ import os
2
+ import argparse
3
+ import numpy as np
4
+ from tqdm import tqdm
5
+ from collections import OrderedDict
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch.utils import data
9
+ import torchvision.transforms as transform
10
+ from torch.nn.parallel.scatter_gather import gather
11
+ import encoding.utils as utils
12
+ from encoding.nn import SegmentationLosses, SyncBatchNorm
13
+ from encoding.parallel import DataParallelModel, DataParallelCriterion
14
+ from encoding.datasets import test_batchify_fn
15
+ from encoding.models.sseg import BaseNet
16
+ from modules.lseg_module import LSegModule
17
+ from utils import Resize
18
+ import cv2
19
+ import math
20
+ import types
21
+ import functools
22
+ import torchvision.transforms as torch_transforms
23
+ import copy
24
+ import itertools
25
+ from PIL import Image
26
+ import matplotlib.pyplot as plt
27
+ import clip
28
+ import matplotlib as mpl
29
+ import matplotlib.colors as mplc
30
+ import matplotlib.figure as mplfigure
31
+ import matplotlib.patches as mpatches
32
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
33
+ from data import get_dataset
34
+ from additional_utils.encoding_models import MultiEvalModule as LSeg_MultiEvalModule
35
+ import torchvision.transforms as transforms
36
+
37
+ class Options:
38
+ def __init__(self):
39
+ parser = argparse.ArgumentParser(description="PyTorch Segmentation")
40
+ # model and dataset
41
+ parser.add_argument(
42
+ "--model", type=str, default="encnet", help="model name (default: encnet)"
43
+ )
44
+ parser.add_argument(
45
+ "--backbone",
46
+ type=str,
47
+ default="clip_vitl16_384",
48
+ help="backbone name (default: resnet50)",
49
+ )
50
+ parser.add_argument(
51
+ "--dataset",
52
+ type=str,
53
+ default="ade20k",
54
+ help="dataset name (default: pascal12)",
55
+ )
56
+ parser.add_argument(
57
+ "--workers", type=int, default=16, metavar="N", help="dataloader threads"
58
+ )
59
+ parser.add_argument(
60
+ "--base-size", type=int, default=520, help="base image size"
61
+ )
62
+ parser.add_argument(
63
+ "--crop-size", type=int, default=480, help="crop image size"
64
+ )
65
+ parser.add_argument(
66
+ "--train-split",
67
+ type=str,
68
+ default="train",
69
+ help="dataset train split (default: train)",
70
+ )
71
+ # training hyper params
72
+ parser.add_argument(
73
+ "--aux", action="store_true", default=False, help="Auxilary Loss"
74
+ )
75
+ parser.add_argument(
76
+ "--se-loss",
77
+ action="store_true",
78
+ default=False,
79
+ help="Semantic Encoding Loss SE-loss",
80
+ )
81
+ parser.add_argument(
82
+ "--se-weight", type=float, default=0.2, help="SE-loss weight (default: 0.2)"
83
+ )
84
+ parser.add_argument(
85
+ "--batch-size",
86
+ type=int,
87
+ default=16,
88
+ metavar="N",
89
+ help="input batch size for \
90
+ training (default: auto)",
91
+ )
92
+ parser.add_argument(
93
+ "--test-batch-size",
94
+ type=int,
95
+ default=16,
96
+ metavar="N",
97
+ help="input batch size for \
98
+ testing (default: same as batch size)",
99
+ )
100
+ # cuda, seed and logging
101
+ parser.add_argument(
102
+ "--no-cuda",
103
+ action="store_true",
104
+ default=False,
105
+ help="disables CUDA training",
106
+ )
107
+ parser.add_argument(
108
+ "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
109
+ )
110
+ parser.add_argument(
111
+ "--weights", type=str, default=None, help="checkpoint to test"
112
+ )
113
+ parser.add_argument(
114
+ "--eval", action="store_true", default=False, help="evaluating mIoU"
115
+ )
116
+ parser.add_argument(
117
+ "--export",
118
+ type=str,
119
+ default=None,
120
+ help="put the path to resuming file if needed",
121
+ )
122
+
123
+ parser.add_argument(
124
+ "--acc-bn",
125
+ action="store_true",
126
+ default=False,
127
+ help="Re-accumulate BN statistics",
128
+ )
129
+ parser.add_argument(
130
+ "--test-val",
131
+ action="store_true",
132
+ default=False,
133
+ help="generate masks on val set",
134
+ )
135
+ parser.add_argument(
136
+ "--no-val",
137
+ action="store_true",
138
+ default=False,
139
+ help="skip validation during training",
140
+ )
141
+ parser.add_argument(
142
+ "--module",
143
+ default='lseg',
144
+ help="select model definition",
145
+ )
146
+ # test option
147
+ parser.add_argument(
148
+ "--data-path", type=str, default=None, help="path to test image folder"
149
+ )
150
+ parser.add_argument(
151
+ "--no-scaleinv",
152
+ dest="scale_inv",
153
+ default=True,
154
+ action="store_false",
155
+ help="turn off scaleinv layers",
156
+ )
157
+ parser.add_argument(
158
+ "--widehead", default=False, action="store_true", help="wider output head"
159
+ )
160
+ parser.add_argument(
161
+ "--widehead_hr",
162
+ default=False,
163
+ action="store_true",
164
+ help="wider output head",
165
+ )
166
+ parser.add_argument(
167
+ "--ignore_index",
168
+ type=int,
169
+ default=-1,
170
+ help="numeric value of ignore label in gt",
171
+ )
172
+ parser.add_argument(
173
+ "--label_src",
174
+ type=str,
175
+ default="default",
176
+ help="how to get the labels",
177
+ )
178
+ parser.add_argument(
179
+ "--jobname",
180
+ type=str,
181
+ default="default",
182
+ help="select which dataset",
183
+ )
184
+ parser.add_argument(
185
+ "--no-strict",
186
+ dest="strict",
187
+ default=True,
188
+ action="store_false",
189
+ help="no-strict copy the model",
190
+ )
191
+ parser.add_argument(
192
+ "--arch_option",
193
+ type=int,
194
+ default=0,
195
+ help="which kind of architecture to be used",
196
+ )
197
+ parser.add_argument(
198
+ "--block_depth",
199
+ type=int,
200
+ default=0,
201
+ help="how many blocks should be used",
202
+ )
203
+ parser.add_argument(
204
+ "--activation",
205
+ choices=['lrelu', 'tanh'],
206
+ default="lrelu",
207
+ help="use which activation to activate the block",
208
+ )
209
+
210
+ self.parser = parser
211
+
212
+ def parse(self):
213
+ args = self.parser.parse_args()
214
+ args.cuda = not args.no_cuda and torch.cuda.is_available()
215
+ print(args)
216
+ return args
217
+
218
+
219
+ def test(args):
220
+
221
+ module = LSegModule.load_from_checkpoint(
222
+ checkpoint_path=args.weights,
223
+ data_path=args.data_path,
224
+ dataset=args.dataset,
225
+ backbone=args.backbone,
226
+ aux=args.aux,
227
+ num_features=256,
228
+ aux_weight=0,
229
+ se_loss=False,
230
+ se_weight=0,
231
+ base_lr=0,
232
+ batch_size=1,
233
+ max_epochs=0,
234
+ ignore_index=args.ignore_index,
235
+ dropout=0.0,
236
+ scale_inv=args.scale_inv,
237
+ augment=False,
238
+ no_batchnorm=False,
239
+ widehead=args.widehead,
240
+ widehead_hr=args.widehead_hr,
241
+ map_location="cpu",
242
+ arch_option=args.arch_option,
243
+ strict=args.strict,
244
+ block_depth=args.block_depth,
245
+ activation=args.activation,
246
+ )
247
+ input_transform = module.val_transform
248
+ num_classes = module.num_classes
249
+
250
+ # dataset
251
+ testset = get_dataset(
252
+ args.dataset,
253
+ root=args.data_path,
254
+ split="val",
255
+ mode="testval",
256
+ transform=input_transform,
257
+ )
258
+
259
+ # dataloader
260
+ loader_kwargs = (
261
+ {"num_workers": args.workers, "pin_memory": True} if args.cuda else {}
262
+ )
263
+ test_data = data.DataLoader(
264
+ testset,
265
+ batch_size=args.test_batch_size,
266
+ drop_last=False,
267
+ shuffle=False,
268
+ collate_fn=test_batchify_fn,
269
+ **loader_kwargs
270
+ )
271
+
272
+ if isinstance(module.net, BaseNet):
273
+ model = module.net
274
+ else:
275
+ model = module
276
+
277
+ model = model.eval()
278
+ model = model.cpu()
279
+
280
+ print(model)
281
+ if args.acc_bn:
282
+ from encoding.utils.precise_bn import update_bn_stats
283
+
284
+ data_kwargs = {
285
+ "transform": input_transform,
286
+ "base_size": args.base_size,
287
+ "crop_size": args.crop_size,
288
+ }
289
+ trainset = get_dataset(
290
+ args.dataset, split=args.train_split, mode="train", **data_kwargs
291
+ )
292
+ trainloader = data.DataLoader(
293
+ ReturnFirstClosure(trainset),
295
+ batch_size=args.batch_size,
296
+ drop_last=True,
297
+ shuffle=True,
298
+ **loader_kwargs
299
+ )
300
+ print("Reseting BN statistics")
301
+ model.cuda()
302
+ update_bn_stats(model, trainloader)
303
+
304
+ if args.export:
305
+ torch.save(model.state_dict(), args.export + ".pth")
306
+ return
307
+
308
+ scales = (
309
+ [0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25]
310
+ if args.dataset == "citys"
311
+ else [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
312
+ )
313
+
314
+ evaluator = LSeg_MultiEvalModule(
315
+ model, num_classes, scales=scales, flip=True
316
+ ).cuda()
317
+ evaluator.eval()
318
+
319
+ metric = utils.SegmentationMetric(testset.num_class)
320
+ tbar = tqdm(test_data)
321
+
322
+ f = open("logs/log_test_{}_{}.txt".format(args.jobname, args.dataset), "a+")
323
+ per_class_iou = np.zeros(testset.num_class)
324
+ cnt = 0
325
+ for i, (image, dst) in enumerate(tbar):
326
+ if args.eval:
327
+ with torch.no_grad():
328
+ if False:
329
+ sample = {"image": image[0].cpu().permute(1, 2, 0).numpy()}
330
+ out = torch.zeros(
331
+ 1, testset.num_class, image[0].shape[1], image[0].shape[2]
332
+ ).cuda()
333
+
334
+ H, W = image[0].shape[1], image[0].shape[2]
335
+ for scale in scales:
336
+ long_size = int(math.ceil(520 * scale))
337
+ if H > W:
338
+ height = long_size
339
+ width = int(1.0 * W * long_size / H + 0.5)
340
+ short_size = width
341
+ else:
342
+ width = long_size
343
+ height = int(1.0 * H * long_size / W + 0.5)
344
+ short_size = height
345
+
346
+ rs = Resize(
347
+ width,
348
+ height,
349
+ resize_target=False,
350
+ keep_aspect_ratio=True,
351
+ ensure_multiple_of=32,
352
+ resize_method="minimal",
353
+ image_interpolation_method=cv2.INTER_AREA,
354
+ )
355
+
356
+ inf_image = (
357
+ torch.from_numpy(rs(sample)["image"])
358
+ .cuda()
359
+ .permute(2, 0, 1)
360
+ .unsqueeze(0)
361
+ )
362
+ inf_image = torch.cat((inf_image, torch.fliplr(inf_image)), 0)
363
+ try:
364
+ pred = model(inf_image)
365
+ except:
366
+ print(H, W, i)
367
+ exit()
368
+
369
+ pred0 = F.softmax(pred[0], dim=1)
370
+ pred1 = F.softmax(pred[1], dim=1)
371
+
372
+ pred = pred0 + 0.2 * pred1
373
+
374
+ out += F.interpolate(
375
+ pred.sum(0, keepdim=True),
376
+ (out.shape[2], out.shape[3]),
377
+ mode="bilinear",
378
+ align_corners=True,
379
+ )
380
+
381
+ predicts = [out]
382
+ else:
383
+ predicts = evaluator.parallel_forward(image)
384
+
385
+ metric.update(dst, predicts)
386
+ pixAcc, mIoU = metric.get()
387
+
388
+ _, _, total_inter, total_union = metric.get_all()
389
+ per_class_iou += 1.0 * total_inter / (np.spacing(1) + total_union)
390
+ cnt+=1
391
+
392
+ tbar.set_description("pixAcc: %.4f, mIoU: %.4f" % (pixAcc, mIoU))
393
+ else:
394
+ with torch.no_grad():
395
+ outputs = evaluator.parallel_forward(image)
396
+ predicts = [
397
+ testset.make_pred(torch.max(output, 1)[1].cpu().numpy())
398
+ for output in outputs
399
+ ]
400
+
401
+ # output folder
402
+ outdir = "outdir_ours"
403
+ if not os.path.exists(outdir):
404
+ os.makedirs(outdir)
405
+
406
+ for predict, impath in zip(predicts, dst):
407
+ mask = utils.get_mask_pallete(predict, args.dataset)
408
+ outname = os.path.splitext(impath)[0] + ".png"
409
+ mask.save(os.path.join(outdir, outname))
410
+
411
+ if args.eval:
412
+ each_classes_iou = per_class_iou/cnt
413
+ print("pixAcc: %.4f, mIoU: %.4f" % (pixAcc, mIoU))
414
+ print(each_classes_iou)
415
+ f.write("dataset {} ==> pixAcc: {:.4f}, mIoU: {:.4f}\n".format(args.dataset, pixAcc, mIoU))
416
+ for per_iou in each_classes_iou: f.write('{:.4f}, '.format(per_iou))
417
+ f.write('\n')
418
+
419
+
420
+ class ReturnFirstClosure(object):
421
+ def __init__(self, data):
422
+ self._data = data
423
+
424
+ def __len__(self):
425
+ return len(self._data)
426
+
427
+ def __getitem__(self, idx):
428
+ outputs = self._data[idx]
429
+ return outputs[0]
430
+
431
+
432
+ if __name__ == "__main__":
433
+ args = Options().parse()
434
+ torch.manual_seed(args.seed)
435
+ args.test_batch_size = torch.cuda.device_count()
436
+ test(args)
train_lseg.py ADDED
@@ -0,0 +1,7 @@
1
+ from modules.lseg_module import LSegModule
2
+ from utils import do_training, get_default_argument_parser
3
+
4
+ if __name__ == "__main__":
5
+ parser = LSegModule.add_model_specific_args(get_default_argument_parser())
6
+ args = parser.parse_args()
7
+ do_training(args, LSegModule)
utils.py ADDED
@@ -0,0 +1,368 @@
1
+ import os
2
+ import pathlib
3
+
4
+ from glob import glob
5
+
6
+ from argparse import ArgumentParser
7
+ import torch
8
+ import pytorch_lightning as pl
9
+ import numpy as np
10
+ import cv2
11
+ import random
12
+ import math
13
+ from torchvision import transforms
14
+
15
+
16
+ def do_training(hparams, model_constructor):
17
+ # instantiate model
18
+ model = model_constructor(**vars(hparams))
19
+ # set all sorts of training parameters
20
+ hparams.gpus = -1
21
+ hparams.accelerator = "ddp"
22
+ hparams.benchmark = True
23
+
24
+ if hparams.dry_run:
25
+ print("Doing a dry run")
26
+ hparams.overfit_batches = hparams.batch_size
27
+
28
+ if not hparams.no_resume:
29
+ hparams = set_resume_parameters(hparams)
30
+
31
+ if not hasattr(hparams, "version") or hparams.version is None:
32
+ hparams.version = 0
33
+
34
+ hparams.sync_batchnorm = True
35
+
36
+ ttlogger = pl.loggers.TestTubeLogger(
37
+ "checkpoints", name=hparams.exp_name, version=hparams.version
38
+ )
39
+
40
+ hparams.callbacks = make_checkpoint_callbacks(hparams.exp_name, hparams.version)
41
+
42
+ wblogger = get_wandb_logger(hparams)
43
+ hparams.logger = [wblogger, ttlogger]
44
+
45
+ trainer = pl.Trainer.from_argparse_args(hparams)
46
+ trainer.fit(model)
47
+
48
+
49
+ def get_default_argument_parser():
50
+ parser = ArgumentParser(add_help=False)
51
+ parser.add_argument(
52
+ "--num_nodes",
53
+ type=int,
54
+ default=1,
55
+ help="number of nodes for distributed training",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "--exp_name", type=str, required=True, help="name your experiment"
60
+ )
61
+
62
+ parser.add_argument(
63
+ "--dry-run",
64
+ action="store_true",
65
+ default=False,
66
+ help="run on batch of train/val/test",
67
+ )
68
+
69
+ parser.add_argument(
70
+ "--no_resume",
71
+ action="store_true",
72
+ default=False,
73
+ help="resume if we have a checkpoint",
74
+ )
75
+
76
+ parser.add_argument(
77
+ "--accumulate_grad_batches",
78
+ type=int,
79
+ default=1,
80
+ help="accumulate N batches for gradient computation",
81
+ )
82
+
83
+ parser.add_argument(
84
+ "--max_epochs", type=int, default=200, help="maximum number of epochs"
85
+ )
86
+
87
+ parser.add_argument(
88
+ "--project_name", type=str, default="lightseg", help="project name for logging"
89
+ )
90
+
91
+ return parser
92
+
93
+
94
+ def make_checkpoint_callbacks(exp_name, version, base_path="checkpoints", frequency=1):
95
+ version = 0 if version is None else version
96
+
97
+ base_callback = pl.callbacks.ModelCheckpoint(
98
+ dirpath=f"{base_path}/{exp_name}/version_{version}/checkpoints/",
99
+ save_last=True,
100
+ verbose=True,
101
+ )
102
+
103
+ val_callback = pl.callbacks.ModelCheckpoint(
104
+ monitor="val_acc_epoch",
105
+ dirpath=f"{base_path}/{exp_name}/version_{version}/checkpoints/",
106
+ filename="result-{epoch}-{val_acc_epoch:.2f}",
107
+ mode="max",
108
+ save_top_k=3,
109
+ verbose=True,
110
+ )
111
+
112
+ return [base_callback, val_callback]
113
+
114
+
115
+ def get_latest_version(folder):
116
+ versions = [
117
+ int(pathlib.PurePath(path).name.split("_")[-1])
118
+ for path in glob(f"{folder}/version_*/")
119
+ ]
120
+
121
+ if len(versions) == 0:
122
+ return None
123
+
124
+ versions.sort()
125
+ return versions[-1]
126
+
127
+
128
+ def get_latest_checkpoint(exp_name, version):
129
+ while version > -1:
130
+ folder = f"./checkpoints/{exp_name}/version_{version}/checkpoints/"
131
+
132
+ latest = f"{folder}/last.ckpt"
133
+ if os.path.exists(latest):
134
+ return latest, version
135
+
136
+ chkpts = glob(f"{folder}/epoch=*.ckpt")
137
+
138
+ if len(chkpts) > 0:
139
+ break
140
+
141
+ version -= 1
142
+
143
+ if len(chkpts) == 0:
144
+ return None, None
145
+
146
+ latest = max(chkpts, key=os.path.getctime)
147
+
148
+ return latest, version
149
+
150
+
151
+ def set_resume_parameters(hparams):
152
+ version = get_latest_version(f"./checkpoints/{hparams.exp_name}")
153
+
154
+ if version is not None:
155
+ latest, version = get_latest_checkpoint(hparams.exp_name, version)
156
+ print(f"Resuming checkpoint {latest}, exp_version={version}")
157
+
158
+ hparams.resume_from_checkpoint = latest
159
+ hparams.version = version
160
+
161
+ wandb_file = "checkpoints/{hparams.exp_name}/version_{version}/wandb_id"
162
+ if os.path.exists(wandb_file):
163
+ with open(wandb_file, "r") as f:
164
+ hparams.wandb_id = f.read()
165
+ else:
166
+ version = 0
167
+
168
+ return hparams
169
+
170
+
171
+ def get_wandb_logger(hparams):
172
+ exp_dir = f"checkpoints/{hparams.exp_name}/version_{hparams.version}/"
173
+ id_file = f"{exp_dir}/wandb_id"
174
+
175
+ if os.path.exists(id_file):
176
+ with open(id_file) as f:
177
+ hparams.wandb_id = f.read()
178
+ else:
179
+ hparams.wandb_id = None
180
+
181
+ logger = pl.loggers.WandbLogger(
182
+ save_dir="checkpoints",
183
+ project=hparams.project_name,
184
+ name=hparams.exp_name,
185
+ id=hparams.wandb_id,
186
+ )
187
+
188
+ if hparams.wandb_id is None:
189
+ _ = logger.experiment
190
+
191
+ if not os.path.exists(exp_dir):
192
+ os.makedirs(exp_dir)
193
+
194
+ with open(id_file, "w") as f:
195
+ f.write(logger.version)
196
+
197
+ return logger
198
+
199
+
200
+ class Resize(object):
201
+ """Resize sample to given size (width, height)."""
202
+
203
+ def __init__(
204
+ self,
205
+ width,
206
+ height,
207
+ resize_target=True,
208
+ keep_aspect_ratio=False,
209
+ ensure_multiple_of=1,
210
+ resize_method="lower_bound",
211
+ image_interpolation_method=cv2.INTER_AREA,
212
+ letter_box=False,
213
+ ):
214
+ """Init.
215
+
216
+ Args:
217
+ width (int): desired output width
218
+ height (int): desired output height
219
+ resize_target (bool, optional):
220
+ True: Resize the full sample (image, mask, target).
221
+ False: Resize image only.
222
+ Defaults to True.
223
+ keep_aspect_ratio (bool, optional):
224
+ True: Keep the aspect ratio of the input sample.
225
+ Output sample might not have the given width and height, and
226
+ resize behaviour depends on the parameter 'resize_method'.
227
+ Defaults to False.
228
+ ensure_multiple_of (int, optional):
229
+ Output width and height is constrained to be multiple of this parameter.
230
+ Defaults to 1.
231
+ resize_method (str, optional):
232
+ "lower_bound": Output will be at least as large as the given size.
233
+ "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
234
+ "minimal": Scale as least as possible. (Output size might be smaller than given size.)
235
+ Defaults to "lower_bound".
236
+ """
237
+ self.__width = width
238
+ self.__height = height
239
+
240
+ self.__resize_target = resize_target
241
+ self.__keep_aspect_ratio = keep_aspect_ratio
242
+ self.__multiple_of = ensure_multiple_of
243
+ self.__resize_method = resize_method
244
+ self.__image_interpolation_method = image_interpolation_method
245
+ self.__letter_box = letter_box
246
+
247
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
248
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
249
+
250
+ if max_val is not None and y > max_val:
251
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
252
+
253
+ if y < min_val:
254
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
255
+
256
+ return y
257
+
258
+ def get_size(self, width, height):
259
+ # determine new height and width
260
+ scale_height = self.__height / height
261
+ scale_width = self.__width / width
262
+
263
+ if self.__keep_aspect_ratio:
264
+ if self.__resize_method == "lower_bound":
265
+ # scale such that output size is lower bound
266
+ if scale_width > scale_height:
267
+ # fit width
268
+ scale_height = scale_width
269
+ else:
270
+ # fit height
271
+ scale_width = scale_height
272
+ elif self.__resize_method == "upper_bound":
273
+ # scale such that output size is upper bound
274
+ if scale_width < scale_height:
275
+ # fit width
276
+ scale_height = scale_width
277
+ else:
278
+ # fit height
279
+ scale_width = scale_height
280
+ elif self.__resize_method == "minimal":
281
+ # scale as little as possible
282
+ if abs(1 - scale_width) < abs(1 - scale_height):
283
+ # fit width
284
+ scale_height = scale_width
285
+ else:
286
+ # fit height
287
+ scale_width = scale_height
288
+ else:
289
+ raise ValueError(
290
+ f"resize_method {self.__resize_method} not implemented"
291
+ )
292
+
293
+ if self.__resize_method == "lower_bound":
294
+ new_height = self.constrain_to_multiple_of(
295
+ scale_height * height, min_val=self.__height
296
+ )
297
+ new_width = self.constrain_to_multiple_of(
298
+ scale_width * width, min_val=self.__width
299
+ )
300
+ elif self.__resize_method == "upper_bound":
301
+ new_height = self.constrain_to_multiple_of(
302
+ scale_height * height, max_val=self.__height
303
+ )
304
+ new_width = self.constrain_to_multiple_of(
305
+ scale_width * width, max_val=self.__width
306
+ )
307
+ elif self.__resize_method == "minimal":
308
+ new_height = self.constrain_to_multiple_of(scale_height * height)
309
+ new_width = self.constrain_to_multiple_of(scale_width * width)
310
+ else:
311
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
312
+
313
+ return (new_width, new_height)
314
+
315
+ def make_letter_box(self, sample):
316
+ top = bottom = (self.__height - sample.shape[0]) // 2
317
+ left = right = (self.__width - sample.shape[1]) // 2
318
+ sample = cv2.copyMakeBorder(
319
+ sample, top, bottom, left, right, cv2.BORDER_CONSTANT, None, 0
320
+ )
321
+ return sample
322
+
323
+ def __call__(self, sample):
324
+ width, height = self.get_size(
325
+ sample["image"].shape[1], sample["image"].shape[0]
326
+ )
327
+
328
+ # resize sample
329
+ sample["image"] = cv2.resize(
330
+ sample["image"],
331
+ (width, height),
332
+ interpolation=self.__image_interpolation_method,
333
+ )
334
+
335
+ if self.__letter_box:
336
+ sample["image"] = self.make_letter_box(sample["image"])
337
+
338
+ if self.__resize_target:
339
+ if "disparity" in sample:
340
+ sample["disparity"] = cv2.resize(
341
+ sample["disparity"],
342
+ (width, height),
343
+ interpolation=cv2.INTER_NEAREST,
344
+ )
345
+
346
+ if self.__letter_box:
347
+ sample["disparity"] = self.make_letter_box(sample["disparity"])
348
+
349
+ if "depth" in sample:
350
+ sample["depth"] = cv2.resize(
351
+ sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
352
+ )
353
+
354
+ if self.__letter_box:
355
+ sample["depth"] = self.make_letter_box(sample["depth"])
356
+
357
+ sample["mask"] = cv2.resize(
358
+ sample["mask"].astype(np.float32),
359
+ (width, height),
360
+ interpolation=cv2.INTER_NEAREST,
361
+ )
362
+
363
+ if self.__letter_box:
364
+ sample["mask"] = self.make_letter_box(sample["mask"])
365
+
366
+ sample["mask"] = sample["mask"].astype(bool)
367
+
368
+ return sample
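Finally, a small usage sketch of the Resize transform above, not part of the commit, mirroring how test_lseg.py constructs it; the input is random data standing in for an HxWx3 float image.

import numpy as np
import cv2
from utils import Resize

rs = Resize(
    480, 480,
    resize_target=False,          # only the image is resized, so no mask/depth keys are needed
    keep_aspect_ratio=True,
    ensure_multiple_of=32,
    resize_method="minimal",
    image_interpolation_method=cv2.INTER_AREA,
)
sample = {"image": np.random.rand(375, 500, 3).astype(np.float32)}
resized = rs(sample)["image"]
print(resized.shape)  # aspect ratio preserved, both sides snapped to multiples of 32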