import logging
import unittest
from typing import Callable, Tuple

import torch
import torch.nn as nn
from fvcore.common.benchmark import benchmark
from pytorchvideo.layers.accelerator.mobile_cpu.convolutions import (
    Conv3d3x3x3DwBnAct,
    Conv3dPwBnAct,
)
from pytorchvideo.models.accelerator.mobile_cpu.residual_blocks import (
    X3dBottleneckBlock,
)
from torch.utils.mobile_optimizer import optimize_for_mobile

TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
if TORCH_VERSION >= (1, 11):
    from torch.ao.quantization import (
        convert,
        DeQuantStub,
        fuse_modules,
        get_default_qconfig,
        prepare,
        QuantStub,
    )
else:
    from torch.quantization import (
        convert,
        DeQuantStub,
        fuse_modules,
        get_default_qconfig,
        prepare,
        QuantStub,
    )
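
# NOTE: the eager-mode quantization utilities moved from `torch.quantization`
# to `torch.ao.quantization` in newer PyTorch releases, hence the version gate
# above; both branches import the same set of names.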


class TestBenchmarkEfficientBlocks(unittest.TestCase):
    def setUp(self):
        super().setUp()
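        # Seed the RNG so the randomly generated benchmark inputs are reproducible.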
        torch.set_rng_state(torch.manual_seed(42).get_state())

    def test_benchmark_conv3d_pw_bn_relu(self, num_iters: int = 20) -> None:
        """
        Benchmark Conv3dPwBnAct with ReLU activation.
        Note: the efficient block Conv3dPwBnAct is designed for mobile CPU with the
        qnnpack backend; benchmarking on a server with another backend (e.g., fbgemm)
        may yield different latency than running on a mobile CPU with qnnpack.
        Running on an x86 server CPU with qnnpack may also differ from a mobile CPU,
        since qnnpack is optimized for ARM-based mobile CPUs.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """

        torch.backends.quantized.engine = "qnnpack"
        kwargs_list = [
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": False,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": False,
            },
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "out_channels": 108,
                "quantize": True,
                "native_conv3d_op_qnnpack": True,
            },
        ]
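
        # The five configs cover: fp32 original vs. deployable form, int8 original
        # vs. deployable form, and deployable int8 using QNNPACK's native conv3d op.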

        def _benchmark_conv3d_pw_bn_relu_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = Conv3dPwBnAct(
                kwargs["in_channels"],
                kwargs["out_channels"],
                use_bn=False,
            )

            if kwargs["mode"] == "deployable":
                native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
                conv_block.convert(
                    kwargs["input_blob_size"],
                    convert_for_quantize=kwargs["quantize"],
                    native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
                )
            conv_block.eval()

            def func_to_benchmark_dummy() -> None:
                return
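
            # Eager-mode post-training quantization: fuse conv + relu, wrap the
            # block in QuantStub/DeQuantStub, attach the qnnpack qconfig, then
            # prepare() inserts observers and convert() swaps in quantized modules.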
            if kwargs["quantize"] is True:
                if kwargs["mode"] == "original":
                    conv_block.kernel = fuse_modules(
                        conv_block.kernel, ["conv", "act.act"]
                    )
                conv_block = nn.Sequential(
                    QuantStub(),
                    conv_block,
                    DeQuantStub(),
                )
                conv_block.qconfig = get_default_qconfig("qnnpack")
                conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_pw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
            try:
                traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
            except Exception as e:
                logging.info(
                    "benchmark_conv3d_pw_bn_relu: "
                    "caught exception '{}' with kwargs of {}".format(e, kwargs)
                )
                return func_to_benchmark_dummy
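
            # optimize_for_mobile is applied only to the fp32 path; the quantized
            # model is benchmarked as produced by convert().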
            if kwargs["quantize"] is False:
                traced_model = optimize_for_mobile(traced_model)

            logging.info(f"model arch: {traced_model}")

            def func_to_benchmark() -> None:
                try:
                    _ = traced_model(input_tensor)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_pw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                return

            return func_to_benchmark

        benchmark(
            _benchmark_conv3d_pw_bn_relu_forward,
            "benchmark_conv3d_pw_bn_relu",
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)

    def test_benchmark_conv3d_3x3x3_dw_bn_relu(self, num_iters: int = 20) -> None:
        """
        Benchmark Conv3d3x3x3DwBnAct with ReLU activation.
        Note: the efficient block Conv3d3x3x3DwBnAct is designed for mobile CPU with
        the qnnpack backend; benchmarking on a server with another backend (e.g.,
        fbgemm) may yield different latency than running on a mobile CPU.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        kwargs_list = [
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": False,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": False,
            },
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 40, 40),
                "in_channels": 48,
                "quantize": True,
                "native_conv3d_op_qnnpack": True,
            },
        ]
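
        # Same sweep as the pointwise benchmark above; depthwise conv preserves
        # the channel count, so no out_channels entry is needed.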

        def _benchmark_conv3d_3x3x3_dw_bn_relu_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = Conv3d3x3x3DwBnAct(
                kwargs["in_channels"],
                use_bn=False,
            )

            def func_to_benchmark_dummy() -> None:
                return

            if kwargs["mode"] == "deployable":
                native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
                conv_block.convert(
                    kwargs["input_blob_size"],
                    convert_for_quantize=kwargs["quantize"],
                    native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
                )
            conv_block.eval()

            if kwargs["quantize"] is True:
                if kwargs["mode"] == "original":
                    conv_block.kernel = fuse_modules(
                        conv_block.kernel, ["conv", "act.act"]
                    )
                conv_block = nn.Sequential(
                    QuantStub(),
                    conv_block,
                    DeQuantStub(),
                )
                conv_block.qconfig = get_default_qconfig("qnnpack")
                conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_3x3x3_dw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
            try:
                traced_model = torch.jit.trace(conv_block, input_tensor, strict=False)
            except Exception as e:
                logging.info(
                    "benchmark_conv3d_3x3x3_dw_bn_relu: "
                    "caught exception '{}' with kwargs of {}".format(e, kwargs)
                )
                return func_to_benchmark_dummy
            if kwargs["quantize"] is False:
                traced_model = optimize_for_mobile(traced_model)

            logging.info(f"model arch: {traced_model}")

            def func_to_benchmark() -> None:
                try:
                    _ = traced_model(input_tensor)
                except Exception as e:
                    logging.info(
                        "benchmark_conv3d_3x3x3_dw_bn_relu: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                return

            return func_to_benchmark

        benchmark(
            _benchmark_conv3d_3x3x3_dw_bn_relu_forward,
            "benchmark_conv3d_3x3x3_dw_bn_relu",
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)

    def test_benchmark_x3d_bottleneck_block(self, num_iters: int = 20) -> None:
        """
        Benchmark X3dBottleneckBlock.
        Note: the efficient block X3dBottleneckBlock is designed for mobile CPU with
        the qnnpack backend; benchmarking on a server or laptop may yield different
        latency than running on a mobile CPU.
        Args:
            num_iters (int): number of iterations to perform benchmarking.
        """
        torch.backends.quantized.engine = "qnnpack"
        kwargs_list = [
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": False,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": False,
            },
            {
                "mode": "original",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": True,
            },
            {
                "mode": "deployable",
                "input_blob_size": (1, 48, 4, 20, 20),
                "in_channels": 48,
                "mid_channels": 108,
                "out_channels": 48,
                "quantize": True,
                "native_conv3d_op_qnnpack": True,
            },
        ]

        def _benchmark_x3d_bottleneck_forward(**kwargs) -> Callable:
            assert kwargs["mode"] in ("original", "deployable"), (
                "kwargs['mode'] must be either 'original' or 'deployable', "
                "but got {}.".format(kwargs["mode"])
            )
            input_tensor = torch.randn(kwargs["input_blob_size"])
            conv_block = X3dBottleneckBlock(
                kwargs["in_channels"],
                kwargs["mid_channels"],
                kwargs["out_channels"],
                use_bn=(False, False, False),
            )

            if kwargs["mode"] == "deployable":
                native_conv3d_op_qnnpack = kwargs.get("native_conv3d_op_qnnpack", False)
                conv_block.convert(
                    kwargs["input_blob_size"],
                    convert_for_quantize=kwargs["quantize"],
                    native_conv3d_op_qnnpack=native_conv3d_op_qnnpack,
                )
            conv_block.eval()

            def func_to_benchmark_dummy() -> None:
                return
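
            # Unlike the conv benchmarks above, no manual fuse_modules call is
            # made here; the composite bottleneck block is quantized as a whole.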
            if kwargs["quantize"] is True:
                conv_block = nn.Sequential(
                    QuantStub(),
                    conv_block,
                    DeQuantStub(),
                )
                conv_block.qconfig = get_default_qconfig("qnnpack")
                conv_block = prepare(conv_block)
                try:
                    conv_block = convert(conv_block)
                    traced_model = torch.jit.trace(
                        conv_block, input_tensor, strict=False
                    )
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy
            else:
                try:
                    traced_model = torch.jit.trace(
                        conv_block, input_tensor, strict=False
                    )
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                    return func_to_benchmark_dummy

            if kwargs["quantize"] is False:
                traced_model = optimize_for_mobile(traced_model)

            logging.info(f"model arch: {traced_model}")

            def func_to_benchmark() -> None:
                try:
                    _ = traced_model(input_tensor)
                except Exception as e:
                    logging.info(
                        "benchmark_x3d_bottleneck_forward: "
                        "caught exception '{}' with kwargs of {}".format(e, kwargs)
                    )
                return

            return func_to_benchmark

        benchmark(
            _benchmark_x3d_bottleneck_forward,
            "benchmark_x3d_bottleneck_forward",
            kwargs_list,
            num_iters=num_iters,
            warmup_iters=2,
        )

        self.assertTrue(True)
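

# Optional convenience entry point (not part of the benchmark logic itself):
# allows running this file directly with `python <this_file>` in addition to
# discovery by a test runner.
if __name__ == "__main__":
    unittest.main()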