Initial commit with folder contents

1004df5 verified over 1 year ago

6.58 kB

	# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: MIT
	#
	# Permission is hereby granted, free of charge, to any person obtaining a
	# copy of this software and associated documentation files (the "Software"),
	# to deal in the Software without restriction, including without limitation
	# the rights to use, copy, modify, merge, publish, distribute, sublicense,
	# and/or sell copies of the Software, and to permit persons to whom the
	# Software is furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	# DEALINGS IN THE SOFTWARE.
	import copy

	from diffusers.models.transformers.transformer_sd3 import SD3Transformer2DModel
	from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel

	# Export settings template reused by ONNX_CONFIG for the SD3 transformer
	# blocks that expose both outputs named in "output_names".
	#   "dummy_input":  input name -> tuple (presumably a tensor shape; confirm
	#                   against the code that consumes this table)
	#   "output_names": names given to the block outputs, in order
	#   "dynamic_axes": input name -> {axis index: symbolic axis label}
	sd3_common_transformer_block_config = {
	    "dummy_input": dict(
	        hidden_states=(2, 4096, 1536),
	        encoder_hidden_states=(2, 333, 1536),
	        temb=(2, 1536),
	    ),
	    "output_names": ["encoder_hidden_states_out", "hidden_states_out"],
	    # Only axis 0 is labelled for each input; "temb" uses its own label.
	    "dynamic_axes": dict(
	        hidden_states={0: "batch_size"},
	        encoder_hidden_states={0: "batch_size"},
	        temb={0: "steps"},
	    ),
	}

	# Per-backbone export table: backbone class -> submodule name -> settings.
	# Every entry carries the same three keys:
	#   "dummy_input":  input name -> tuple (presumably the shape of the tensor
	#                   handed to the exporter; confirm against the consumer)
	#   "output_names": names given to the block outputs, in order
	#   "dynamic_axes": input name -> {axis index: symbolic axis label}
	# The insertion order of "dummy_input" is kept exactly as written, in case the
	# consumer derives positional input names from it (TODO confirm).
	ONNX_CONFIG = {
	    UNet2DConditionModel: {
	        # down_blocks.0 takes no "encoder_hidden_states" input.
	        "down_blocks.0": {
	            "dummy_input": {
	                "hidden_states": (2, 320, 128, 128),
	                "temb": (2, 1280),
	            },
	            "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	            },
	        },
	        "down_blocks.1": {
	            "dummy_input": {
	                "hidden_states": (2, 320, 64, 64),
	                "temb": (2, 1280),
	                "encoder_hidden_states": (2, 77, 2048),
	            },
	            "output_names": ["sample", "res_samples_0", "res_samples_1", "res_samples_2"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	                "encoder_hidden_states": {0: "batch_size"},
	            },
	        },
	        # down_blocks.2 lists two residual outputs instead of three.
	        "down_blocks.2": {
	            "dummy_input": {
	                "hidden_states": (2, 640, 32, 32),
	                "temb": (2, 1280),
	                "encoder_hidden_states": (2, 77, 2048),
	            },
	            "output_names": ["sample", "res_samples_0", "res_samples_1"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	                "encoder_hidden_states": {0: "batch_size"},
	            },
	        },
	        "mid_block": {
	            "dummy_input": {
	                "hidden_states": (2, 1280, 32, 32),
	                "temb": (2, 1280),
	                "encoder_hidden_states": (2, 77, 2048),
	            },
	            "output_names": ["sample"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	                "encoder_hidden_states": {0: "batch_size"},
	            },
	        },
	        # Up blocks additionally take three "res_hidden_states_*" inputs.
	        "up_blocks.0": {
	            "dummy_input": {
	                "hidden_states": (2, 1280, 32, 32),
	                "res_hidden_states_0": (2, 640, 32, 32),
	                "res_hidden_states_1": (2, 1280, 32, 32),
	                "res_hidden_states_2": (2, 1280, 32, 32),
	                "temb": (2, 1280),
	                "encoder_hidden_states": (2, 77, 2048),
	            },
	            "output_names": ["sample"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	                "encoder_hidden_states": {0: "batch_size"},
	                "res_hidden_states_0": {0: "batch_size"},
	                "res_hidden_states_1": {0: "batch_size"},
	                "res_hidden_states_2": {0: "batch_size"},
	            },
	        },
	        "up_blocks.1": {
	            "dummy_input": {
	                "hidden_states": (2, 1280, 64, 64),
	                "res_hidden_states_0": (2, 320, 64, 64),
	                "res_hidden_states_1": (2, 640, 64, 64),
	                "res_hidden_states_2": (2, 640, 64, 64),
	                "temb": (2, 1280),
	                "encoder_hidden_states": (2, 77, 2048),
	            },
	            "output_names": ["sample"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	                "encoder_hidden_states": {0: "batch_size"},
	                "res_hidden_states_0": {0: "batch_size"},
	                "res_hidden_states_1": {0: "batch_size"},
	                "res_hidden_states_2": {0: "batch_size"},
	            },
	        },
	        # up_blocks.2 takes no "encoder_hidden_states" input.
	        "up_blocks.2": {
	            "dummy_input": {
	                "hidden_states": (2, 640, 128, 128),
	                "res_hidden_states_0": (2, 320, 128, 128),
	                "res_hidden_states_1": (2, 320, 128, 128),
	                "res_hidden_states_2": (2, 320, 128, 128),
	                "temb": (2, 1280),
	            },
	            "output_names": ["sample"],
	            "dynamic_axes": {
	                "hidden_states": {0: "batch_size"},
	                "temb": {0: "steps"},
	                "res_hidden_states_0": {0: "batch_size"},
	                "res_hidden_states_1": {0: "batch_size"},
	                "res_hidden_states_2": {0: "batch_size"},
	            },
	        },
	    },
	    SD3Transformer2DModel: {
	        # Blocks 0-22 share the template's settings. Each key gets its own deep
	        # copy so that editing one entry in place cannot leak into the other
	        # entries or into the template.
	        **{
	            f"transformer_blocks.{i}": copy.deepcopy(sd3_common_transformer_block_config)
	            for i in range(23)
	        },
	        # Block 23 has the same inputs and dynamic axes as the template but
	        # exposes a single output, so only "output_names" is overridden (the
	        # key keeps its original position in the dict).
	        "transformer_blocks.23": {
	            **copy.deepcopy(sd3_common_transformer_block_config),
	            "output_names": ["hidden_states_out"],
	        },
	    },
	}