instant-ngp build

7873319 about 3 years ago

8.34 kB

	/*
	* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or without modification, are permitted
	* provided that the following conditions are met:
	* * Redistributions of source code must retain the above copyright notice, this list of
	* conditions and the following disclaimer.
	* * Redistributions in binary form must reproduce the above copyright notice, this list of
	* conditions and the following disclaimer in the documentation and/or other materials
	* provided with the distribution.
	* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
	* to endorse or promote products derived from this software without specific prior written
	* permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
	* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
	* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
	* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
	* STRICT LIABILITY, OR TOR (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	/** @file cpp_api.cu
	* @author Thomas Müller, NVIDIA
	* @brief API to be consumed by cpp (non-CUDA) programs.
	*/

	#include <tiny-cuda-nn/common.h>
	#include <tiny-cuda-nn/cpp_api.h>
	#include <tiny-cuda-nn/encoding.h>
	#include <tiny-cuda-nn/multi_stream.h>

	#if !defined(TCNN_NO_NETWORKS)
	#include <tiny-cuda-nn/network_with_input_encoding.h>
	#endif

	namespace tcnn { namespace cpp {

	uint32_t batch_size_granularity() {
	return tcnn::batch_size_granularity;
	}

	int cuda_device() {
	return tcnn::cuda_device();
	}

	void set_cuda_device(int device) {
	tcnn::set_cuda_device(device);
	}

	void free_temporary_memory() {
	tcnn::free_all_gpu_memory_arenas();
	}

	bool has_networks() {
	#if defined(TCNN_NO_NETWORKS)
	return false;
	#else
	return true;
	#endif
	}

	template <typename T>
	constexpr EPrecision precision() {
	return std::is_same<T, float>::value ? EPrecision::Fp32 : EPrecision::Fp16;
	}

	EPrecision preferred_precision() {
	return precision<network_precision_t>();
	}

	template <typename T>
	class DifferentiableObject : public Module {
	public:
	DifferentiableObject(tcnn::DifferentiableObject<float, T, T>* model)
	: Module{precision<T>(), precision<T>()}, m_model{model}
	{}

	void inference(cudaStream_t stream, uint32_t n_elements, const float* input, void* output, void* params) override {
	m_model->set_params((T)params, (T)params, nullptr);

	GPUMatrix<float, MatrixLayout::ColumnMajor> input_matrix((float*)input, m_model->input_width(), n_elements);
	GPUMatrix<T, MatrixLayout::ColumnMajor> output_matrix((T*)output, m_model->padded_output_width(), n_elements);

	// Run on our own custom stream to ensure CUDA graph capture is possible.
	// (Significant possible speedup.)
	SyncedMultiStream synced_stream{stream, 2};
	m_model->inference_mixed_precision(synced_stream.get(1), input_matrix, output_matrix);
	}

	Context forward(cudaStream_t stream, uint32_t n_elements, const float* input, void* output, void* params, bool prepare_input_gradients) override {
	m_model->set_params((T)params, (T)params, nullptr);

	GPUMatrix<float, MatrixLayout::ColumnMajor> input_matrix((float*)input, m_model->input_width(), n_elements);
	GPUMatrix<T, MatrixLayout::ColumnMajor> output_matrix((T*)output, m_model->padded_output_width(), n_elements);

	// Run on our own custom stream to ensure CUDA graph capture is possible.
	// (Significant possible speedup.)
	SyncedMultiStream synced_stream{stream, 2};
	return { m_model->forward(synced_stream.get(1), input_matrix, &output_matrix, false, prepare_input_gradients) };
	}

	void backward(cudaStream_t stream, const Context& ctx, uint32_t n_elements, float* dL_dinput, const void* dL_doutput, void* dL_dparams, const float* input, const void* output, const void* params) override {
	m_model->set_params((T)params, (T)params, (T*)dL_dparams);

	GPUMatrix<float, MatrixLayout::ColumnMajor> input_matrix((float*)input, m_model->input_width(), n_elements);
	GPUMatrix<float, MatrixLayout::ColumnMajor> dL_dinput_matrix(dL_dinput, m_model->input_width(), n_elements);

	GPUMatrix<T, MatrixLayout::ColumnMajor> output_matrix((T*)output, m_model->padded_output_width(), n_elements);
	GPUMatrix<T, MatrixLayout::ColumnMajor> dL_doutput_matrix((T*)dL_doutput, m_model->padded_output_width(), n_elements);

	// Run on our own custom stream to ensure CUDA graph capture is possible.
	// (Significant possible speedup.)
	SyncedMultiStream synced_stream{stream, 2};
	m_model->backward(synced_stream.get(1), *ctx.ctx, input_matrix, output_matrix, dL_doutput_matrix, dL_dinput ? &dL_dinput_matrix : nullptr, false, dL_dparams ? EGradientMode::Overwrite : EGradientMode::Ignore);
	}

	void backward_backward_input(cudaStream_t stream, const Context& ctx, uint32_t n_elements, const float* dL_ddLdinput, const float* input, const void* dL_doutput, void* dL_dparams, void* dL_ddLdoutput, float* dL_dinput, const void* params) override {
	// from: dL_ddLdinput
	// to: dL_ddLdoutput, dL_dparams
	m_model->set_params((T)params, (T)params, (T*)dL_dparams);

	GPUMatrix<float, MatrixLayout::ColumnMajor> input_matrix((float*)input, m_model->input_width(), n_elements);
	GPUMatrix<float, MatrixLayout::ColumnMajor> dL_ddLdinput_matrix((float*)dL_ddLdinput, m_model->input_width(), n_elements);

	GPUMatrix<T, MatrixLayout::ColumnMajor> dL_doutput_matrix((T*)dL_doutput, m_model->padded_output_width(), n_elements);
	GPUMatrix<T, MatrixLayout::ColumnMajor> dL_ddLdoutput_matrix((T*)dL_ddLdoutput, m_model->padded_output_width(), n_elements);
	GPUMatrix<float, MatrixLayout::ColumnMajor> dL_dinput_matrix((float*)dL_dinput, m_model->input_width(), n_elements);

	// Run on our own custom stream to ensure CUDA graph capture is possible.
	// (Significant possible speedup.)
	SyncedMultiStream synced_stream{stream, 2};
	m_model->backward_backward_input(synced_stream.get(1), *ctx.ctx, input_matrix, dL_ddLdinput_matrix, dL_doutput_matrix, dL_ddLdoutput ? &dL_ddLdoutput_matrix : nullptr, dL_dinput ? &dL_dinput_matrix : nullptr, false, dL_dparams ? EGradientMode::Overwrite : EGradientMode::Ignore);
	}

	uint32_t n_input_dims() const override {
	return m_model->input_width();
	}

	size_t n_params() const override {
	return m_model->n_params();
	}

	void initialize_params(size_t seed, float* params_full_precision, float scale) override {
	pcg32 rng{seed};
	m_model->initialize_params(rng, params_full_precision, scale);
	}

	uint32_t n_output_dims() const override {
	return m_model->padded_output_width();
	}

	json hyperparams() const override {
	return m_model->hyperparams();
	}

	std::string name() const override {
	return m_model->name();
	}

	private:
	std::shared_ptr<tcnn::DifferentiableObject<float, T, T>> m_model;
	};

	#if !defined(TCNN_NO_NETWORKS)
	Module* create_network_with_input_encoding(uint32_t n_input_dims, uint32_t n_output_dims, const json& encoding, const json& network) {
	return new DifferentiableObject<network_precision_t>{new tcnn::NetworkWithInputEncoding<network_precision_t>{n_input_dims, n_output_dims, encoding, network}};
	}

	Module* create_network(uint32_t n_input_dims, uint32_t n_output_dims, const json& network) {
	return create_network_with_input_encoding(n_input_dims, n_output_dims, {{"otype", "Identity"}}, network);
	}
	#endif // !defined(TCNN_NO_NETWORKS)

	Module* create_encoding(uint32_t n_input_dims, const json& encoding, EPrecision requested_precision) {
	if (requested_precision == EPrecision::Fp32) {
	return new DifferentiableObject<float>{tcnn::create_encoding<float>(n_input_dims, encoding, 0)};
	}
	#if TCNN_HALF_PRECISION
	return new DifferentiableObject<__half>{tcnn::create_encoding<__half>(n_input_dims, encoding, 0)};
	#else
	throw std::runtime_error{"TCNN was not compiled with half-precision support."};
	#endif
	}

	}}