import logging
import unittest
from typing import Callable, Tuple

import torch
import torch.nn as nn
from fvcore.common.benchmark import benchmark
from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
    Conv3d3x3x3DwBnAct,
    Conv3dPwBnAct,
)
from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
    X3dBottleneckBlock,
)
from torch.utils.mobile_optimizer import optimize_for_mobile

TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 11):
    from torch.ao.quantization import (
        convert,
        DeQuantStub,
        fuse_modules,
        get_default_qconfig,
        prepare,
        QuantStub,
    )
else:
    from torch.quantization import (
        convert,
        DeQuantStub,
        fuse_modules,
        get_default_qconfig,
        prepare,
        QuantStub,
    )
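
# NOTE: the eager-mode quantization utilities moved from `torch.quantization`
# to `torch.ao.quantization` in newer PyTorch releases, hence the version gate
# above; both branches import the same set of names.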


class TestBenchmarkEfficientBlocks(unittest.TestCase):
    def setUp(self):
        super().setUp()
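        # Seed the RNG so the randomly generated benchmark inputs are reproducible.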
        torch.set_rng_state(torch.manual_seed(42).get_state())

    def test_benchmark_conv3d_pw_bn_relu(self, num_iters: int = 20) -> None:
        """
        Benchmark Conv3dPwBnAct with ReLU activation.
        Note: the efficient block Conv3dPwBnAct is designed for mobile CPU with the
        qnnpack backend; benchmarking on a server with another backend (e.g., fbgemm)
        may yield different latency than running on a mobile CPU with qnnpack.
        Running on an x86 server CPU with qnnpack may also differ from a mobile CPU,
        since qnnpack is optimized for ARM-based mobile CPUs.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """

        torch.backends.quantized.engine = "qnnpack"
        kwargs_list = [
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": False,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": False,
            },
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": True,
                "native_conv3d_op_qnnpack": True,
            },
        ]
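
        # The five configs cover: fp32 original vs. deployable form, int8 original
        # vs. deployable form, and deployable int8 using QNNPACK's native conv3d op.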

        def _benchmark_conv3d_pw_bn_relu_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = Conv3dPwBnAct(
                kwargs["in_channels"],
                kwargs["out_channels"],
                use_bn=False,
            )

            if kwargs["mode"] == "deployable":
                native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
                conv_block.convert(
                    kwargs["input_blob_size"],
                    convert_for_quantize=kwargs["quantize"],
                    native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
                )
            conv_block.eval()

            def func_to_benchmark_dummy() -> None:
                return
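
            # Eager-mode post-training quantization: fuse conv + relu, wrap the
            # block in QuantStub/DeQuantStub, attach the qnnpack qconfig, then
            # prepare() inserts observers and convert() swaps in quantized modules.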
            if kwargs["quantize"] is True:
                if kwargs["mode"] == "original":
                    conv_block.kernel = fuse_modules(
                        conv_block.kernel, ["conv", "act.act"]
                    )
                conv_block = nn.Sequential(
                    QuantStub(),
                    conv_block,
                    DeQuantStub(),
                )
                conv_block.qconfig = get_default_qconfig("qnnpack")
                conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_pw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
            try:
                traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
            except Exception as e:
                logging.info(
                    "benchmark_conv3d_pw_bn_relu: "
                    "caught exception '{}' with kwargs of {}".format(e, kwargs)
                )
                return func_to_benchmark_dummy
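
            # optimize_for_mobile is applied only to the fp32 path; the quantized
            # model is benchmarked as produced by convert().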
            if kwargs["quantize"] is False:
                traced_model = optimize_for_mobile(traced_model)

            logging.info(f"model arch: {traced_model}")

            def func_to_benchmark() -> None:
                try:
                    _ = traced_model(input_tensor)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_pw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                return

            return func_to_benchmark

        benchmark(
            _benchmark_conv3d_pw_bn_relu_forward,
            "benchmark_conv3d_pw_bn_relu",
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)

    def test_benchmark_conv3d_3x3x3_dw_bn_relu(self, num_iters: int = 20) -> None:
        """
        Benchmark Conv3d3x3x3DwBnAct with ReLU activation.
        Note: the efficient block Conv3d3x3x3DwBnAct is designed for mobile CPU with
        the qnnpack backend; benchmarking on a server with another backend (e.g.,
        fbgemm) may yield different latency than running on a mobile CPU.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        kwargs_list = [
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": False,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": False,
            },
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": True,
                "native_conv3d_op_qnnpack": True,
            },
        ]
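
        # Same sweep as the pointwise benchmark above; depthwise conv preserves
        # the channel count, so no out_channels entry is needed.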

        def _benchmark_conv3d_3x3x3_dw_bn_relu_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = Conv3d3x3x3DwBnAct(
                kwargs["in_channels"],
                use_bn=False,
            )

            def func_to_benchmark_dummy() -> None:
                return

            if kwargs["mode"] == "deployable":
                native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
                conv_block.convert(
                    kwargs["input_blob_size"],
                    convert_for_quantize=kwargs["quantize"],
                    native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
                )
            conv_block.eval()

            if kwargs["quantize"] is True:
                if kwargs["mode"] == "original":
                    conv_block.kernel = fuse_modules(
                        conv_block.kernel, ["conv", "act.act"]
                    )
                conv_block = nn.Sequential(
                    QuantStub(),
                    conv_block,
                    DeQuantStub(),
                )
                conv_block.qconfig = get_default_qconfig("qnnpack")
                conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_3x3x3_dw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
            try:
                traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
            except Exception as e:
                logging.info(
                    "benchmark_conv3d_3x3x3_dw_bn_relu: "
                    "caught exception '{}' with kwargs of {}".format(e, kwargs)
                )
                return func_to_benchmark_dummy
            if kwargs["quantize"] is False:
                traced_model = optimize_for_mobile(traced_model)

            logging.info(f"model arch: {traced_model}")

            def func_to_benchmark() -> None:
                try:
                    _ = traced_model(input_tensor)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_3x3x3_dw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                return

            return func_to_benchmark

        benchmark(
            _benchmark_conv3d_3x3x3_dw_bn_relu_forward,
            "benchmark_conv3d_3x3x3_dw_bn_relu",
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)

    def test_benchmark_x3d_bottleneck_block(self, num_iters: int = 20) -> None:
        """
        Benchmark X3dBottleneckBlock.
        Note: the efficient block X3dBottleneckBlock is designed for mobile CPU with
        the qnnpack backend; benchmarking on a server or laptop may yield different
        latency than running on a mobile CPU.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        kwargs_list = [
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": False,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": False,
            },
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": True,
                "native_conv3d_op_qnnpack": True,
            },
        ]

        def _benchmark_x3d_bottleneck_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = X3dBottleneckBlock(
                kwargs["in_channels"],
                kwargs["mid_channels"],
                kwargs["out_channels"],
                use_bn=(False, False, False),
            )

            if kwargs["mode"] == "deployable":
                native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
                conv_block.convert(
                    kwargs["input_blob_size"],
                    convert_for_quantize=kwargs["quantize"],
                    native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
                )
            conv_block.eval()

            def func_to_benchmark_dummy() -> None:
                return
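
            # Unlike the conv benchmarks above, no manual fuse_modules call is
            # made here; the composite bottleneck block is quantized as a whole.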
            if kwargs["quantize"] is True:
                conv_block = nn.Sequential(
                    QuantStub(),
                    conv_block,
                    DeQuantStub(),
                )
                conv_block.qconfig = get_default_qconfig("qnnpack")
                conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                    traced_model = torch.jit.trace(
                        conv_block, input_tensor, strict=False
                    )
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
            else:
                try:
                    traced_model = torch.jit.trace(
                        conv_block, input_tensor, strict=False
                    )
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy

            if kwargs["quantize"] is False:
                traced_model = optimize_for_mobile(traced_model)

            logging.info(f"model arch: {traced_model}")

            def func_to_benchmark() -> None:
                try:
                    _ = traced_model(input_tensor)
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                return

            return func_to_benchmark

        benchmark(
            _benchmark_x3d_bottleneck_forward,
            "benchmark_x3d_bottleneck_forward",
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)
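

# Optional convenience entry point (not part of the benchmark logic itself):
# allows running this file directly with `python <this_file>` in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()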