import time

import torch
import torch.nn as nn

from maskrcnn_benchmark.layers import *
from maskrcnn_benchmark.modeling.backbone.resnet_big import StdConv2d
from maskrcnn_benchmark.modeling.backbone.fpn import *
from maskrcnn_benchmark.modeling.rpn.inference import *
from maskrcnn_benchmark.modeling.roi_heads.box_head.inference import PostProcessor
from maskrcnn_benchmark.modeling.rpn.anchor_generator import BufferList


def profile(model, input_size, custom_ops=None, device="cpu",
            verbose=False, extra_args=None, return_time=False):
    """Count FLOPs and parameters of `model` on a single zero input of
    `input_size`, optionally returning the wall-clock forward time."""
    custom_ops = custom_ops if custom_ops is not None else {}
    extra_args = extra_args if extra_args is not None else {}
    handler_collection = []

    def add_hooks(m):
        # only instrument leaf modules
        if len(list(m.children())) > 0:
            return

        m.register_buffer("total_ops", torch.zeros(1))
        m.register_buffer("total_params", torch.zeros(1))

        for p in m.parameters():
            m.total_params += torch.Tensor([p.numel()])

        # user-supplied custom_ops takes precedence over the built-in table
        m_type = type(m)
        fn = None
        if m_type in custom_ops:
            fn = custom_ops[m_type]
        elif m_type in register_hooks:
            fn = register_hooks[m_type]
        else:
            print("Not implemented for ", m)

        if fn is not None:
            if verbose:
                print("Register FLOP counter for module %s" % str(m))
            handler = m.register_forward_hook(fn)
            handler_collection.append(handler)

    original_device = next(model.parameters()).device
    training = model.training

    model.eval().to(device)
    model.apply(add_hooks)

    x = torch.zeros(input_size).to(device)
    with torch.no_grad():
        tic = time.perf_counter()
        model(x, **extra_args)
        toc = time.perf_counter()
        total_time = toc - tic

    total_ops = 0
    total_params = 0
    for m in model.modules():
        if len(list(m.children())) > 0:  # skip non-leaf modules
            continue
        total_ops += m.total_ops
        total_params += m.total_params
    total_ops = total_ops.item()
    total_params = total_params.item()

    # restore the model and remove the instrumentation
    model.train(training).to(original_device)
    for handler in handler_collection:
        handler.remove()

    if return_time:
        return total_ops, total_params, total_time
    return total_ops, total_params


multiply_adds = 1


def count_conv2d(m, x, y):
    x = x[0]

    cin = m.in_channels
    cout = m.out_channels
    kh, kw = m.kernel_size
    batch_size = x.size()[0]

    out_h = y.size(2)
    out_w = y.size(3)

    # ops per output element:
    #   kernel_mul = kh * kw * cin
    #   kernel_add = kh * kw * cin - 1
    kernel_ops = multiply_adds * kh * kw * cin // m.groups
    bias_ops = 1 if m.bias is not None else 0
    ops_per_element = kernel_ops + bias_ops

    # total ops = number of output elements * ops per element
    output_elements = batch_size * out_w * out_h * cout
    total_ops = output_elements * ops_per_element

    # accumulate (+=) so modules invoked several times per forward pass,
    # e.g. heads shared across FPN levels, are counted once per call
    m.total_ops += torch.Tensor([int(total_ops)])


def count_convtranspose2d(m, x, y):
    x = x[0]

    cin = m.in_channels
    kh, kw = m.kernel_size

    # same accounting as a regular convolution: each output element is a
    # sum over kh * kw * cin // groups multiply-adds (plus an optional bias)
    kernel_ops = multiply_adds * kh * kw * cin // m.groups
    bias_ops = 1 if m.bias is not None else 0
    ops_per_element = kernel_ops + bias_ops

    output_elements = y.nelement()
    total_ops = output_elements * ops_per_element

    m.total_ops += torch.Tensor([int(total_ops)])


def count_bn(m, x, y):
    x = x[0]

    nelements = x.numel()
    # subtract mean, divide by std, scale by gamma, shift by beta
    total_ops = 4 * nelements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_relu(m, x, y):
    x = x[0]

    nelements = x.numel()
    total_ops = nelements  # one comparison per element

    m.total_ops += torch.Tensor([int(total_ops)])


def count_softmax(m, x, y):
    x = x[0]

    batch_size, nfeatures = x.size()

    total_exp = nfeatures
    total_add = nfeatures - 1
    total_div = nfeatures
    total_ops = batch_size * (total_exp + total_add + total_div)

    m.total_ops += torch.Tensor([int(total_ops)])
def count_maxpool(m, x, y):
    # one comparison per kernel element for every output element
    kernel_ops = torch.prod(torch.Tensor([m.kernel_size]))
    num_elements = y.numel()
    total_ops = kernel_ops * num_elements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_adap_maxpool(m, x, y):
    # infer the effective kernel size from the input/output spatial shapes
    kernel = torch.Tensor([*(x[0].shape[2:])]) // torch.Tensor(
        list((m.output_size,))).squeeze()
    kernel_ops = torch.prod(kernel)
    num_elements = y.numel()
    total_ops = kernel_ops * num_elements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_avgpool(m, x, y):
    # one add per kernel element plus one divide per output element
    total_add = torch.prod(torch.Tensor([m.kernel_size]))
    total_div = 1
    kernel_ops = total_add + total_div
    num_elements = y.numel()
    total_ops = kernel_ops * num_elements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_adap_avgpool(m, x, y):
    kernel = torch.Tensor([*(x[0].shape[2:])]) // torch.Tensor(
        list((m.output_size,))).squeeze()
    total_add = torch.prod(kernel)
    total_div = 1
    kernel_ops = total_add + total_div
    num_elements = y.numel()
    total_ops = kernel_ops * num_elements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_linear(m, x, y):
    # per output element: in_features multiplies and in_features - 1 adds
    total_mul = m.in_features
    total_add = m.in_features - 1
    num_elements = y.numel()
    total_ops = (total_mul + total_add) * num_elements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_LastLevelMaxPool(m, x, y):
    # the extra FPN level is a cheap max pool over the last feature map
    num_elements = y[-1].numel()
    total_ops = num_elements

    m.total_ops += torch.Tensor([int(total_ops)])


def count_ROIAlign(m, x, y):
    # four bilinear-interpolation taps per sampled output element
    num_elements = y.numel()
    total_ops = num_elements * 4

    m.total_ops += torch.Tensor([int(total_ops)])


register_hooks = {
    Scale: None,
    Conv2d: count_conv2d,
    nn.Conv2d: count_conv2d,
    ModulatedDeformConv: count_conv2d,
    StdConv2d: count_conv2d,

    nn.BatchNorm1d: count_bn,
    nn.BatchNorm2d: count_bn,
    nn.BatchNorm3d: count_bn,
    FrozenBatchNorm2d: count_bn,
    nn.GroupNorm: count_bn,
    NaiveSyncBatchNorm2d: count_bn,

    nn.ReLU: count_relu,
    nn.ReLU6: count_relu,
    swish: None,
    nn.ConstantPad2d: None,

    SPPLayer: count_LastLevelMaxPool,
    LastLevelMaxPool: count_LastLevelMaxPool,
    nn.MaxPool1d: count_maxpool,
    nn.MaxPool2d: count_maxpool,
    nn.MaxPool3d: count_maxpool,
    nn.AdaptiveMaxPool1d: count_adap_maxpool,
    nn.AdaptiveMaxPool2d: count_adap_maxpool,
    nn.AdaptiveMaxPool3d: count_adap_maxpool,
    nn.AvgPool1d: count_avgpool,
    nn.AvgPool2d: count_avgpool,
    nn.AvgPool3d: count_avgpool,
    nn.AdaptiveAvgPool1d: count_adap_avgpool,
    nn.AdaptiveAvgPool2d: count_adap_avgpool,
    nn.AdaptiveAvgPool3d: count_adap_avgpool,

    nn.Linear: count_linear,
    nn.Upsample: None,
    nn.Dropout: None,
    nn.Sigmoid: None,
    DropBlock2D: None,

    ROIAlign: count_ROIAlign,
    RPNPostProcessor: None,
    PostProcessor: None,
    BufferList: None,
    RetinaPostProcessor: None,
    FCOSPostProcessor: None,
    ATSSPostProcessor: None,
}
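

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only, not part of the tool): a full
    # maskrcnn_benchmark detector expects an ImageList built from a config,
    # so the small hypothetical nn.Sequential below stands in to show the
    # call shape. custom_ops lets callers map module types to their own
    # counters; mapping a type to None simply silences the
    # "Not implemented" warning for it.
    net = nn.Sequential(
        nn.Conv2d(3, 16, kernel_size=3, padding=1),
        nn.BatchNorm2d(16),
        nn.ReLU(),
        nn.AdaptiveAvgPool2d(1),
        nn.Flatten(),
        nn.Linear(16, 10),
    )
    flops, params, elapsed = profile(
        net,
        input_size=(1, 3, 224, 224),
        custom_ops={nn.Flatten: None},
        return_time=True,
    )
    print("FLOPs: %.3f M, params: %.3f M, forward time: %.4f s"
          % (flops / 1e6, params / 1e6, elapsed))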