# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import logging
import unittest
from typing import Callable, Tuple
import torch
import torch.nn as nn
from fvcore.common.benchmark import benchmark
from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
Conv3d3x3x3DwBnAct,
Conv3dPwBnAct,
)
from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
X3dBottleneckBlock,
)
from torch.utils.mobile_optimizer import optimize_for_mobile

TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
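# The eager-mode quantization APIs moved to the torch.ao.quantization namespace
# in newer PyTorch releases, so pick the import path based on the installed version.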
if TORCH_VERSION >= (1, 11):
from torch.ao.quantization import (
convert,
DeQuantStub,
fuse_modules,
get_default_qconfig,
prepare,
QuantStub,
# quantize_fx
)
else:
from torch.quantization import (
convert,
DeQuantStub,
fuse_modules,
get_default_qconfig,
prepare,
QuantStub,
# quantize_fx
    )


class TestBenchmarkEfficientBlocks(unittest.TestCase):
def setUp(self):
super().setUp()
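        # Seed the RNG so the randomly generated benchmark inputs are
        # reproducible across runs.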
        torch.set_rng_state(torch.manual_seed(42).get_state())

    def test_benchmark_conv3d_pw_bn_relu(self, num_iters: int = 20) -> None:
"""
Benchmark Conv3dPwBnAct with ReLU activation.
Note efficient block Conv3dPwBnAct is designed for mobile cpu with qnnpack
backend, and benchmarking on server with another backend (e.g., fbgemm) may
have different latency result compared to running on mobile cpu with qnnpack.
Running on x86 based server cpu with qnnpack may also have different latency as
running on mobile cpu with qnnpack, as qnnpack is optimized for
ARM based mobile cpu.
Args:
num_iters (int): number of iterations to perform benchmarking.
"""
torch.backends.quantized.engine = "qnnpack"
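        # Benchmark five configurations: original vs. deployable form, fp32 vs.
        # quantized int8, and (deployable + quantized) with the native QNNPACK
        # conv3d op enabled.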
kwargs_list = [
{
"mode": "original",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"out_channels": 108,
"quantize": False,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"out_channels": 108,
"quantize": False,
},
{
"mode": "original",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"out_channels": 108,
"quantize": True,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"out_channels": 108,
"quantize": True,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"out_channels": 108,
"quantize": True,
"native_conv3d_op_qnnpack": True,
},
        ]

        def _benchmark_conv3d_pw_bn_relu_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
conv_block = Conv3dPwBnAct(
kwargs["in_channels"],
kwargs["out_channels"],
use_bn=False, # assume BN has already been fused for forward
)
if kwargs["mode"] == "deployable":
native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
conv_block.convert(
kwargs["input_blob_size"],
convert_for_quantize=kwargs["quantize"],
native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
)
            conv_block.eval()

            def func_to_benchmark_dummy() -> None:
                return

if kwargs["quantize"] is True:
if kwargs["mode"] == "original": # manually fuse conv and relu
conv_block.kernel = fuse_modules(
conv_block.kernel, ["conv", "act.act"]
)
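                # Eager-mode static quantization: wrap the block with quant/dequant
                # stubs, attach the qnnpack qconfig, then prepare and convert. No
                # calibration pass is run here, since only latency is measured.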
conv_block = nn.Sequential(
QuantStub(),
conv_block,
DeQuantStub(),
)
conv_block.qconfig = get_default_qconfig("qnnpack")
conv_block = prepare(conv_block)
try:
conv_block = convert(conv_block)
except Exception as e:
logging.info(
"benchmark_conv3d_pw_bn_relu: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return func_to_benchmark_dummy
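            # TorchScript-trace the (possibly quantized) block so it runs through
            # the same execution path used for mobile deployment.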
try:
traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
except Exception as e:
logging.info(
"benchmark_conv3d_pw_bn_relu: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return func_to_benchmark_dummy
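            # For the fp32 path, apply mobile-specific graph optimizations
            # (e.g., operator fusion) to the traced model.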
if kwargs["quantize"] is False:
traced_model = optimize_for_mobile(traced_model)
logging.info(f"model arch: {traced_model}")
def func_to_benchmark() -> None:
try:
_ = traced_model(input_tensor)
except Exception as e:
logging.info(
"benchmark_conv3d_pw_bn_relu: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return
return func_to_benchmark
benchmark(
_benchmark_conv3d_pw_bn_relu_forward,
"benchmark_conv3d_pw_bn_relu",
kwargs_list,
num_iters=num_iters,
warmup_iters=2,
)
        self.assertTrue(True)

    def test_benchmark_conv3d_3x3x3_dw_bn_relu(self, num_iters: int = 20) -> None:
"""
Benchmark Conv3d3x3x3DwBnAct with ReLU activation.
Note efficient block Conv3d3x3x3DwBnAct is designed for mobile cpu with qnnpack
backend, and benchmarking on server with another backend (e.g., fbgemm) may have
different latency result compared as running on mobile cpu.
Args:
num_iters (int): number of iterations to perform benchmarking.
"""
torch.backends.quantized.engine = "qnnpack"
kwargs_list = [
{
"mode": "original",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"quantize": False,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"quantize": False,
},
{
"mode": "original",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"quantize": True,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"quantize": True,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 40, 40),
"in_channels": 48,
"quantize": True,
"native_conv3d_op_qnnpack": True,
},
        ]

        def _benchmark_conv3d_3x3x3_dw_bn_relu_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
conv_block = Conv3d3x3x3DwBnAct(
kwargs["in_channels"],
use_bn=False, # assume BN has already been fused for forward
            )

            def func_to_benchmark_dummy() -> None:
                return

if kwargs["mode"] == "deployable":
native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
conv_block.convert(
kwargs["input_blob_size"],
convert_for_quantize=kwargs["quantize"],
native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
)
conv_block.eval()
if kwargs["quantize"] is True:
if kwargs["mode"] == "original": # manually fuse conv and relu
conv_block.kernel = fuse_modules(
conv_block.kernel, ["conv", "act.act"]
)
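                # Same eager-mode quantization flow as in the test above.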
conv_block = nn.Sequential(
QuantStub(),
conv_block,
DeQuantStub(),
)
conv_block.qconfig = get_default_qconfig("qnnpack")
conv_block = prepare(conv_block)
try:
conv_block = convert(conv_block)
except Exception as e:
logging.info(
"benchmark_conv3d_3x3x3_dw_bn_relu: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return func_to_benchmark_dummy
try:
traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
except Exception as e:
logging.info(
"benchmark_conv3d_3x3x3_dw_bn_relu: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return func_to_benchmark_dummy
if kwargs["quantize"] is False:
traced_model = optimize_for_mobile(traced_model)
logging.info(f"model arch: {traced_model}")
def func_to_benchmark() -> None:
try:
_ = traced_model(input_tensor)
except Exception as e:
logging.info(
"benchmark_conv3d_3x3x3_dw_bn_relu: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return
return func_to_benchmark
benchmark(
_benchmark_conv3d_3x3x3_dw_bn_relu_forward,
"benchmark_conv3d_3x3x3_dw_bn_relu",
kwargs_list,
num_iters=num_iters,
warmup_iters=2,
)
        self.assertTrue(True)

    def test_benchmark_x3d_bottleneck_block(self, num_iters: int = 20) -> None:
"""
Benchmark X3dBottleneckBlock.
Note efficient block X3dBottleneckBlock is designed for mobile cpu with qnnpack
backend, and benchmarking on server/laptop may have different latency result
compared to running on mobile cpu.
Args:
num_iters (int): number of iterations to perform benchmarking.
"""
torch.backends.quantized.engine = "qnnpack"
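        # X3dBottleneckBlock chains pointwise, depthwise, and pointwise conv
        # stages; it is benchmarked here at a smaller 20x20 spatial size.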
kwargs_list = [
{
"mode": "original",
"input_blob_size": (1, 48, 4, 20, 20),
"in_channels": 48,
"mid_channels": 108,
"out_channels": 48,
"quantize": False,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 20, 20),
"in_channels": 48,
"mid_channels": 108,
"out_channels": 48,
"quantize": False,
},
{
"mode": "original",
"input_blob_size": (1, 48, 4, 20, 20),
"in_channels": 48,
"mid_channels": 108,
"out_channels": 48,
"quantize": True,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 20, 20),
"in_channels": 48,
"mid_channels": 108,
"out_channels": 48,
"quantize": True,
},
{
"mode": "deployable",
"input_blob_size": (1, 48, 4, 20, 20),
"in_channels": 48,
"mid_channels": 108,
"out_channels": 48,
"quantize": True,
"native_conv3d_op_qnnpack": True,
},
        ]

        def _benchmark_x3d_bottleneck_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
conv_block = X3dBottleneckBlock(
kwargs["in_channels"],
kwargs["mid_channels"],
kwargs["out_channels"],
use_bn=(False, False, False), # Assume BN has been fused for forward
)
if kwargs["mode"] == "deployable":
native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
conv_block.convert(
kwargs["input_blob_size"],
convert_for_quantize=kwargs["quantize"],
native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
)
            conv_block.eval()

            def func_to_benchmark_dummy() -> None:
                return

if kwargs["quantize"] is True:
conv_block = nn.Sequential(
QuantStub(),
conv_block,
DeQuantStub(),
)
conv_block.qconfig = get_default_qconfig("qnnpack")
conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
try:
traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
except Exception as e:
logging.info(
"benchmark_x3d_bottleneck_forward: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return func_to_benchmark_dummy
if kwargs["quantize"] is False:
traced_model = optimize_for_mobile(traced_model)
logging.info(f"model arch: {traced_model}")
def func_to_benchmark() -> None:
try:
_ = traced_model(input_tensor)
except Exception as e:
logging.info(
"benchmark_x3d_bottleneck_forward: "
"catch exception '{}' with kwargs of {}".format(e, kwargs)
)
return
return func_to_benchmark
benchmark(
_benchmark_x3d_bottleneck_forward,
"benchmark_x3d_bottleneck_forward",
kwargs_list,
num_iters=num_iters,
warmup_iters=2,
)
self.assertTrue(True)
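

if __name__ == "__main__":
    # Allow running the benchmark suite directly, outside a test runner:
    #   python benchmark_accelerator_efficient_blocks.py
    unittest.main()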