/*
* SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <array>
#include <chrono>
#include <cuda.h>
#include <cuda_profiler_api.h>
#include <functional>
#include <iterator>
#include <limits>
#include <memory>
#include <mutex>
#include <thread>
#include <utility>
#include <vector>
#if defined(__QNX__)
#include <sys/neutrino.h>
#include <sys/syspage.h>
#endif
#include "NvInferRuntime.h"
#include "bfloat16.h"
#include "common.h"
#include "debugTensorWriter.h"
#include "logger.h"
#include "sampleDevice.h"
#include "sampleEngines.h"
#include "sampleInference.h"
#include "sampleOptions.h"
#include "sampleReporting.h"
#include "sampleUtils.h"
#include <cuda.h>
#if CUDA_VERSION >= 11060
#include <cuda_fp8.h>
#endif
using namespace nvinfer1;
#if ENABLE_UNIFIED_BUILDER
using namespace nvinfer2::safe;
// Provide a weak default definition that can be overridden
// by linking a stronger definition elsewhere (e.g. a test harness supplying its own recorder).
__attribute__((weak)) std::shared_ptr<sample::SampleSafeRecorder> gSafeRecorder
    = std::make_shared<sample::SampleSafeRecorder>(nvinfer2::safe::Severity::kINFO);
#endif
namespace sample
{
#if !TRT_STATIC
//! \brief Map a runtime mode to the name of the shared library implementing it.
//! \throws std::runtime_error when \p mode is not a known RuntimeMode value.
std::string const& getRuntimeLibraryName(RuntimeMode const mode)
{
    if (mode == RuntimeMode::kFULL)
    {
        return kNVINFER_LIBNAME;
    }
    if (mode == RuntimeMode::kDISPATCH)
    {
        return kNVINFER_DISPATCH_LIBNAME;
    }
    if (mode == RuntimeMode::kLEAN)
    {
        return kNVINFER_LEAN_LIBNAME;
    }
    if (mode == RuntimeMode::kSAFE)
    {
        return kNVINFER_SAFE_LIBNAME;
    }
    throw std::runtime_error("Unknown runtime mode");
}
#endif // !TRT_STATIC
#if ENABLE_UNIFIED_BUILDER
namespace safe
{
namespace
{
// Function-pointer holders resolved lazily by initNvinferSafe() after the safe
// runtime library is dynamically loaded; they remain empty until then.
std::function<nvinfer1::ErrorCode(
    nvinfer2::safe::ITRTGraph*&, void const*, int64_t, ISafeRecorder&, bool, ISafeMemAllocator*)>
    pcreateTRTGraphInternal{};
// Destroys a graph previously produced by pcreateTRTGraphInternal.
std::function<nvinfer1::ErrorCode(nvinfer2::safe::ITRTGraph* graph)> pdestroyTRTGraphInternal{};
// Returns the plugin registry exported by the safe runtime library.
std::function<nvinfer2::safe::ISafePluginRegistry*(ISafeRecorder& recorder)> pgetSafePluginRegistryInternal{};
} // namespace
//! Track runtime used for the execution of trtexec.
//! Must be tracked as a global variable due to how library init functions APIs are organized.
//! Defaults to kSAFE because these helpers live in the safe-runtime path.
RuntimeMode gUseRuntime = RuntimeMode::kSAFE;
//!
//! \brief Initialize the NVIDIA Inference Safe Runtime library
//!
//! This function dynamically loads the Safe TensorRT runtime library and initializes
//! function pointers for safe TensorRT operations. It is used to set up the safe runtime
//! environment for inference with safety-certified TensorRT engines.
//!
//! The function performs the following operations:
//! - Dynamically loads the safe TensorRT runtime library
//! - Retrieves and stores function pointers for:
//! - createTRTGraph: Creates a safe TRT graph from serialized engine data
//! - destroyTRTGraph: Destroys a safe TRT graph and releases resources
//! - getSafePluginRegistry: Gets the safe plugin registry for loading plugins
//!
//! \return true if the safe runtime library was successfully loaded and initialized,
//! false otherwise (e.g., in static builds or if library loading fails)
//!
bool initNvinferSafe()
{
#if !TRT_STATIC
    // Keep the library handle alive for the lifetime of the process so the
    // symbol addresses fetched below remain valid.
    static LibraryPtr libnvinfersafePtr{};
    auto fetchPtrs = [](samplesCommon::DynamicLibrary* l) {
        // Only resolve the safe-runtime entry points when trtexec runs in safe mode.
        if (gUseRuntime == RuntimeMode::kSAFE)
        {
            pcreateTRTGraphInternal = l->symbolAddress<nvinfer2::safe::ErrorCode(nvinfer2::safe::ITRTGraph*&,
                void const*, int64_t, ISafeRecorder&, bool, ISafeMemAllocator*)>("createTRTGraph");
            pdestroyTRTGraphInternal
                = l->symbolAddress<nvinfer2::safe::ErrorCode(nvinfer2::safe::ITRTGraph * graph)>("destroyTRTGraph");
            pgetSafePluginRegistryInternal
                = l->symbolAddress<nvinfer2::safe::ISafePluginRegistry*(ISafeRecorder & recorder)>(
                    "getSafePluginRegistry");
        }
    };
    // initLibrary loads the library on first call and invokes fetchPtrs on success.
    return initLibrary(libnvinfersafePtr, sample::getRuntimeLibraryName(gUseRuntime), fetchPtrs);
#else
    // Static builds cannot dynamically load the safe runtime.
    return false;
#endif // !TRT_STATIC
}
//!
//! \brief Create a safe TRT graph from serialized engine data
//!
//! This function creates a safe TRT graph from serialized engine data. It is used to create
//! a safe TRT graph for inference with safety-certified TensorRT engines.
//!
nvinfer1::ErrorCode createSafeTRTGraph(nvinfer2::safe::ITRTGraph*& graph, void const* blob, int64_t size,
    ISafeRecorder& recorder, bool useManaged, ISafeMemAllocator* allocator)
{
    // Lazily load the safe runtime library before dereferencing its entry point.
    bool const runtimeReady = initNvinferSafe();
    if (runtimeReady)
    {
        ASSERT(pcreateTRTGraphInternal != nullptr);
        return pcreateTRTGraphInternal(graph, blob, size, recorder, useManaged, allocator);
    }
    return nvinfer1::ErrorCode::kINTERNAL_ERROR;
}
//!
//! \brief Destroy a safe TRT graph and release resources
//!
//! This function destroys a safe TRT graph and releases the associated resources. It is used to clean up
//! the safe TRT graph after inference with safety-certified TensorRT engines.
//!
nvinfer1::ErrorCode destroySafeTRTGraph(nvinfer2::safe::ITRTGraph*& graph)
{
    if (initNvinferSafe())
    {
        ASSERT(pdestroyTRTGraphInternal != nullptr);
        return pdestroyTRTGraphInternal(graph);
    }
    // The safe runtime never loaded, so there is no destroy entry point to call.
    return nvinfer1::ErrorCode::kINTERNAL_ERROR;
}
//!
//! \brief Get the safe plugin registry for loading plugins
//!
//! This function retrieves the safe plugin registry for loading plugins. It is used to get the safe plugin registry
//! for loading plugins with safety-certified TensorRT engines.
//!
nvinfer2::safe::ISafePluginRegistry* getSafePluginRegistry(ISafeRecorder& recorder)
{
    if (initNvinferSafe())
    {
        ASSERT(pgetSafePluginRegistryInternal != nullptr);
        return pgetSafePluginRegistryInternal(recorder);
    }
    // Safe runtime unavailable: no registry to hand out.
    return nullptr;
}
namespace
{
//! Wrap a raw device/host pointer in a TypedArray carrying the matching element type.
//! Unsupported data types yield an empty TypedArray and log an error.
nvinfer2::safe::TypedArray createTypedArray(void* const ptr, DataType const type, uint64_t bufferSize)
{
    if (type == DataType::kFLOAT)
    {
        return nvinfer2::safe::TypedArray(static_cast<float*>(ptr), bufferSize);
    }
    if (type == DataType::kHALF)
    {
        return nvinfer2::safe::TypedArray(static_cast<nvinfer2::safe::half_t*>(ptr), bufferSize);
    }
    if (type == DataType::kINT32)
    {
        return nvinfer2::safe::TypedArray(static_cast<int32_t*>(ptr), bufferSize);
    }
    if (type == DataType::kINT8)
    {
        return nvinfer2::safe::TypedArray(static_cast<int8_t*>(ptr), bufferSize);
    }
    sample::gLogError << "Invalid tensor DataType encountered." << std::endl;
    return nvinfer2::safe::TypedArray{};
}
} // namespace
} // namespace safe
#endif
template <class TMapType, class TEngineType>
bool validateTensorNames(TMapType const& map, TEngineType const* engine, int32_t const endBindingIndex)
{
// Check if the provided input tensor names match the input tensors of the engine.
// Throw an error if the provided input tensor names cannot be found because it implies a potential typo.
for (auto const& item : map)
{
bool tensorNameFound{false};
for (int32_t b = 0; b < endBindingIndex; ++b)
{
auto const tensorName = engine->getIOTensorName(b);
auto const tensorIOMode = engine->getTensorIOMode(tensorName);
if (tensorIOMode == nvinfer1::TensorIOMode::kINPUT && matchStringWithOneWildcard(item.first, tensorName))
{
tensorNameFound = true;
break;
}
}
if (!tensorNameFound)
{
sample::gLogError << "Cannot find input tensor with name \"" << item.first << "\" in the engine bindings! "
<< "Please make sure the input tensor names are correct." << std::endl;
return false;
}
}
return true;
}
//! Helper that creates one binding per engine IO tensor in every per-stream Bindings
//! object, sourcing input data from --loadInputs files when provided and random values
//! otherwise. getTensorInfo is specialized per engine type below.
template <class TEngineType>
class FillBindingClosure
{
private:
    using InputsMap = std::unordered_map<std::string, std::string>;
    using BindingsVector = std::vector<std::unique_ptr<BindingsStd>>;
    TEngineType const* mEngine;
    nvinfer1::IExecutionContext const* mContext;
    InputsMap const& inputs;
    BindingsVector& bindings;
    int32_t batch;
    int32_t endBindingIndex;
    int32_t profileIndex;

    //! Register a single tensor with every per-stream Bindings object and log its data source.
    void fillOneBinding(TensorInfo const& tensorInfo)
    {
        auto const name = tensorInfo.name;
        auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output";
        for (auto& binding : bindings)
        {
            auto const input = findPlausible(inputs, name);
            bool const haveFileData = tensorInfo.isInput && input != inputs.end();
            if (haveFileData)
            {
                // The user supplied a file for this input via --loadInputs.
                sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl;
                binding->addBinding(tensorInfo, input->second);
            }
            else
            {
                if (tensorInfo.isInput)
                {
                    sample::gLogInfo << "Using random values for input " << name << std::endl;
                }
                binding->addBinding(tensorInfo);
            }
            if (tensorInfo.isDynamic)
            {
                sample::gLogInfo << bindingInOutStr << " binding for " << name
                                 << " is dynamic and will be created during execution using OutputAllocator."
                                 << std::endl;
            }
            else
            {
                sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims
                                 << " is created." << std::endl;
            }
        }
    }

    //! Validate user-supplied input names, then create a binding for every IO tensor.
    bool fillAllBindings(int32_t batch, int32_t endBindingIndex)
    {
        if (!validateTensorNames(inputs, mEngine, endBindingIndex))
        {
            sample::gLogError << "Invalid tensor names found in --loadInputs flag." << std::endl;
            return false;
        }
        for (int32_t b = 0; b < endBindingIndex; b++)
        {
            TensorInfo tensorInfo;
            tensorInfo.bindingIndex = b;
            getTensorInfo(tensorInfo);
            tensorInfo.updateVolume(batch);
            fillOneBinding(tensorInfo);
        }
        return true;
    }

    //! Engine-type-specific tensor query; specialized outside the class.
    void getTensorInfo(TensorInfo& tensorInfo);

public:
    FillBindingClosure(TEngineType const* _engine, nvinfer1::IExecutionContext const* _context,
        InputsMap const& _inputs, BindingsVector& _bindings, int32_t _batch, int32_t _endBindingIndex,
        int32_t _profileIndex)
        : mEngine(_engine)
        , mContext(_context)
        , inputs(_inputs)
        , bindings(_bindings)
        , batch(_batch)
        , endBindingIndex(_endBindingIndex)
        , profileIndex(_profileIndex)
    {
    }

    //! Execute the closure: fill all bindings for the captured engine/context.
    bool operator()()
    {
        return fillAllBindings(batch, endBindingIndex);
    }
};
//! Query name, IO mode, type, shape, and layout details for one IO tensor of a
//! standard (non-safety) engine. The shape comes from the context so that shapes
//! already set at runtime are reflected.
template <>
void FillBindingClosure<nvinfer1::ICudaEngine>::getTensorInfo(TensorInfo& tensorInfo)
{
    auto const bindingIndex = tensorInfo.bindingIndex;
    auto const name = mEngine->getIOTensorName(bindingIndex);
    tensorInfo.name = name;
    tensorInfo.isInput = mEngine->getTensorIOMode(name) == TensorIOMode::kINPUT;
    tensorInfo.dataType = mEngine->getTensorDataType(name);
    tensorInfo.dims = mContext->getTensorShape(name);
    // Any -1 dimension marks a runtime (dynamic) shape.
    tensorInfo.isDynamic = std::any_of(
        tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; });
    tensorInfo.strides = mContext->getTensorStrides(name);
    tensorInfo.comps = mEngine->getTensorComponentsPerElement(name, profileIndex);
    tensorInfo.vectorDimIndex = mEngine->getTensorVectorizedDim(name, profileIndex);
}
namespace
{
//! Allocate (or account for) per-context activation memory according to the selected
//! memory allocation strategy. \return false on an unrecognized strategy.
bool allocateContextMemory(InferenceEnvironmentStd& iEnv, InferenceOptions const& inference)
{
    auto* engine = iEnv.engine.get();
    iEnv.deviceMemory.resize(inference.infStreams);
    // Delay context memory allocation until input shapes are specified because runtime allocation would require actual
    // input shapes.
    for (int32_t i = 0; i < inference.infStreams; ++i)
    {
        auto const& ec = iEnv.contexts.at(i);
        auto const strategy = inference.memoryAllocationStrategy;
        if (strategy == MemoryAllocationStrategy::kSTATIC)
        {
            // TRT pre-allocated this memory when the context was created; nothing to do here.
            sample::gLogInfo << "Created execution context with device memory size: "
                             << (engine->getDeviceMemorySize() / 1.0_MiB) << " MiB" << std::endl;
            continue;
        }
        size_t sizeToAlloc{0};
        const char* allocReason{nullptr};
        if (strategy == MemoryAllocationStrategy::kPROFILE)
        {
            auto const p = inference.optProfileIndex;
            sizeToAlloc = engine->getDeviceMemorySizeForProfile(p);
            allocReason = "current profile";
        }
        else if (strategy == MemoryAllocationStrategy::kRUNTIME)
        {
            sizeToAlloc = ec->updateDeviceMemorySizeForShapes();
            allocReason = "current input shapes";
        }
        else
        {
            sample::gLogError << "Unrecognizable memory allocation strategy." << std::endl;
            return false;
        }
        iEnv.deviceMemory.at(i) = TrtDeviceBuffer(sizeToAlloc);
        ec->setDeviceMemoryV2(iEnv.deviceMemory.at(i).get(), iEnv.deviceMemory.at(i).getSize());
        sample::gLogInfo << "Maximum device memory size across all profiles: "
                         << (engine->getDeviceMemorySizeV2() / 1.0_MiB) << " MiB" << std::endl;
        sample::gLogInfo << "Only allocated device memory enough for " << allocReason << ": "
                         << (sizeToAlloc / 1.0_MiB) << " MiB" << std::endl;
    }
    return true;
}
//! \brief Transform shapeData so that it can be type-punned to array of int32_t.
//!
//! Transform shapeData so if data() is type-punned to (int32_t*), the sequence
//! of values are equal to the original elements of shapeData.
void contractInt64ToInt32(std::vector<int64_t>& shapeData)
{
    int64_t const size = shapeData.size();
    // All values must be representable as int32_t before narrowing.
    for (int64_t const& val : shapeData)
    {
        ASSERT(val <= std::numeric_limits<int32_t>::max() && val >= std::numeric_limits<int32_t>::min()
            && "Value out of range for int32_t conversion");
    }
    // Narrow each element into the same buffer. The int32_t write position never
    // overtakes the int64_t read position, so the in-place forward pass is safe.
    int32_t* dst = reinterpret_cast<int32_t*>(shapeData.data());
    std::transform(shapeData.cbegin(), shapeData.cend(), dst,
        [](int64_t value) { return static_cast<int32_t>(value); });
    // Two int32_t values now occupy each original int64_t slot.
    shapeData.resize((size + 1) / 2);
}
} // namespace
//! \brief Dispatch inference setup to the safe- or standard-runtime path based on iEnv.safe.
//! \return true when the chosen setup path succeeds.
bool setUpInference(InferenceEnvironmentBase& iEnv, InferenceOptions const& inference, SystemOptions const& system)
{
#if ENABLE_UNIFIED_BUILDER
    if (iEnv.safe)
    {
        return setUpSafeInference(static_cast<InferenceEnvironmentSafe&>(iEnv), inference, system);
    }
#endif
    return setUpStdInference(static_cast<InferenceEnvironmentStd&>(iEnv), inference, system);
}
#if ENABLE_UNIFIED_BUILDER
//! \brief Populate \p tensorInfo for the IO tensor at tensorInfo.bindingIndex of \p safeGraph.
//! NOTE(review): \p profileIndex is currently unused in this function — confirm whether the
//! safe graph API exposes per-profile information before removing it from the signature.
void getSafeTensorInfo(uint32_t profileIndex, nvinfer2::safe::ITRTGraph* safeGraph, TensorInfo& tensorInfo)
{
    nvinfer2::safe::TensorDescriptor desc;
    auto const b = tensorInfo.bindingIndex;
    const char* name = nullptr;
    // Return codes of these two queries are not checked; assumes b is a valid IO index.
    safeGraph->getIOTensorName(name, b);
    tensorInfo.name = name;
    safeGraph->getIOTensorDescriptor(desc, name);
    tensorInfo.dims = desc.shape;
    // Any -1 dimension marks a runtime (dynamic) shape.
    tensorInfo.isDynamic = std::any_of(
        tensorInfo.dims.d, tensorInfo.dims.d + tensorInfo.dims.nbDims, [](int32_t dim) { return dim == -1; });
    tensorInfo.strides = desc.stride;
    tensorInfo.isInput = desc.ioMode == TensorIOMode::kINPUT;
    tensorInfo.dataType = desc.dataType;
}
//! \brief Set up the safety-runtime inference environment: load the safe runtime, build a
//!        safe TRT graph from the serialized engine, clone it per stream, and create bindings.
//! \return true on success, false on any setup failure.
//!
//! Fix: the original fell through after a failed createSafeTRTGraph (logging only), then
//! dereferenced the null graph; clone results were also never checked. Both now fail fast.
bool setUpSafeInference(InferenceEnvironmentSafe& iEnv, InferenceOptions const& inference, SystemOptions const& system)
{
    int32_t device{};
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp properties;
    CHECK(cudaGetDeviceProperties(&properties, device));
    int32_t const isIntegrated{properties.integrated};
    ASSERT(sample::hasSafeRuntime());
    ASSERT(sample::safe::initNvinferSafe());
    auto safeEngineBlob = iEnv.engine.getBlob();
    SMP_RETVAL_IF_FALSE(safeEngineBlob.data != nullptr, "Engine blob is empty.", false, sample::gLogError);
    SMP_RETVAL_IF_FALSE(iEnv.engine.checkDLASafe(),
        "Safe DLA engine built with kDLA_STANDALONE should not be infered in TRT!", false, sample::gLogError);
    std::unique_ptr<nvinfer2::safe::ITRTGraph> safeGraph;
    // Use managed memory on integrated devices when transfers are skipped
    // and when it is explicitly requested on the commandline.
    bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged};
    nvinfer2::safe::ITRTGraph* tempGraph = nullptr;
    if (sample::safe::createSafeTRTGraph(
            tempGraph, safeEngineBlob.data, safeEngineBlob.size, *gSafeRecorder, true, nullptr)
        != nvinfer2::safe::ErrorCode::kSUCCESS)
    {
        sample::gLogError << "Create Safe TRT Graph Failed." << std::endl;
        // Bail out instead of dereferencing a null graph below.
        return false;
    }
    safeGraph.reset(tempGraph);
    // Release serialized blob to save memory space.
    iEnv.engine.releaseBlob();
    for (int32_t s = 0; s < inference.infStreams; ++s)
    {
        // One cloned graph and one Bindings object per inference stream.
        nvinfer2::safe::ITRTGraph* clonedGraph{nullptr};
        safeGraph->clone(clonedGraph, *gSafeRecorder);
        if (clonedGraph == nullptr)
        {
            sample::gLogError << "Clone Safe TRT Graph Failed." << std::endl;
            return false;
        }
        iEnv.mClonedGraphs.emplace_back(clonedGraph);
        iEnv.bindings.emplace_back(std::make_unique<BindingsSafe>(useManagedMemory));
    }
    int64_t endBindingIndex = 0;
    safeGraph->getNbIOTensors(endBindingIndex);
    for (int32_t b = 0; b < endBindingIndex; b++)
    {
        TensorInfo tensorInfo;
        tensorInfo.bindingIndex = b;
        getSafeTensorInfo(inference.optProfileIndex, safeGraph.get(), tensorInfo);
        tensorInfo.updateVolume(1);
        auto const name = tensorInfo.name;
        auto const* bindingInOutStr = tensorInfo.isInput ? "Input" : "Output";
        for (auto& binding : iEnv.bindings)
        {
            auto const input = findPlausible(inference.inputs, name);
            if (tensorInfo.isInput && input != inference.inputs.end())
            {
                // Input data supplied on the command line.
                sample::gLogInfo << "Using values loaded from " << input->second << " for input " << name << std::endl;
                binding->addBinding(tensorInfo, input->second);
            }
            else
            {
                if (tensorInfo.isInput)
                {
                    sample::gLogInfo << "Using random values for input " << name << std::endl;
                }
                binding->addBinding(tensorInfo);
            }
            if (tensorInfo.isDynamic)
            {
                sample::gLogInfo << bindingInOutStr << " binding for " << name
                                 << " is dynamic and will be created during execution using OutputAllocator."
                                 << std::endl;
            }
            else
            {
                sample::gLogInfo << bindingInOutStr << " binding for " << name << " with dimensions " << tensorInfo.dims
                                 << " is created." << std::endl;
            }
        }
    }
    return true;
}
#endif
//! \brief Set up the standard-runtime inference environment: configure weight streaming,
//!        create one execution context + bindings per stream, apply input shapes / shape
//!        tensors, attach debug listeners, allocate context memory, and fill all bindings.
//! \return true on success, false on any setup failure.
bool setUpStdInference(InferenceEnvironmentStd& iEnv, InferenceOptions const& inference, SystemOptions const& system)
{
    int32_t device{};
    CHECK(cudaGetDevice(&device));
    cudaDeviceProp properties;
    CHECK(cudaGetDeviceProperties(&properties, device));
    int32_t const isIntegrated{properties.integrated};
    // Use managed memory on integrated devices when transfers are skipped
    // and when it is explicitly requested on the commandline.
    bool useManagedMemory{(inference.skipTransfers && isIntegrated) || inference.useManaged};
    using FillStdBindings = FillBindingClosure<nvinfer1::ICudaEngine>;
    auto* engine = iEnv.engine.get();
    SMP_RETVAL_IF_FALSE(engine != nullptr, "Got invalid engine!", false, sample::gLogError);
    // Release serialized blob to save memory space.
    iEnv.engine.releaseBlob();
    // Setup weight streaming if enabled
    if (engine->getStreamableWeightsSize() > 0)
    {
        auto const& budget = inference.weightStreamingBudget;
        int64_t wsBudget = budget.bytes;
        if (budget.percent != 100.0)
        {
            // Convert a percentage budget into bytes of streamable weights.
            double const percent = budget.percent;
            ASSERT(percent < 100.0);
            auto const max = engine->getStreamableWeightsSize();
            wsBudget = (max >= 0) ? (percent / 100) * (max) : WeightStreamingBudget::kDISABLE;
        }
        if (wsBudget == WeightStreamingBudget::kDISABLE)
        {
            // Disabling streaming means keeping all streamable weights resident.
            wsBudget = engine->getStreamableWeightsSize();
        }
        else if (wsBudget == WeightStreamingBudget::kAUTOMATIC)
        {
            wsBudget = engine->getWeightStreamingAutomaticBudget();
        }
        ASSERT(wsBudget >= 0);
        bool success = engine->setWeightStreamingBudgetV2(wsBudget);
        SMP_RETVAL_IF_FALSE(success, "Failed to set weight streaming limit!", false, sample::gLogError);
        switch (wsBudget)
        {
        case WeightStreamingBudget::kDISABLE:
        {
            sample::gLogInfo << "Weight streaming has been disabled at runtime." << std::endl;
            break;
        }
        case WeightStreamingBudget::kAUTOMATIC:
        {
            sample::gLogInfo << "The weight streaming budget will automatically be chosen by TensorRT." << std::endl;
            break;
        }
        default:
        {
            sample::gLogInfo << "Weight streaming is enabled with a device memory limit of " << wsBudget << " bytes."
                             << std::endl;
            break;
        }
        }
    }
    int32_t const nbOptProfiles = engine->getNbOptimizationProfiles();
    if (inference.optProfileIndex >= nbOptProfiles)
    {
        sample::gLogError << "Selected profile index " << inference.optProfileIndex
                          << " exceeds the number of profiles that the engine holds. " << std::endl;
        return false;
    }
    if (nbOptProfiles > 1 && !inference.setOptProfile)
    {
        sample::gLogWarning << nbOptProfiles
                            << " profiles detected but not set. Running with profile 0. Please use "
                               "--dumpOptimizationProfile to see all available profiles."
                            << std::endl;
    }
    // Temporary stream used only to apply the optimization profile asynchronously.
    cudaStream_t setOptProfileStream;
    CHECK(cudaStreamCreate(&setOptProfileStream));
    for (int32_t s = 0; s < inference.infStreams; ++s)
    {
        IExecutionContext* ec{nullptr};
        //! \return the `ExecutionContextAllocationStrategy` to use for the given allocation strategy, \p s.
        auto getExecutionContextAllocationStrategy = [](MemoryAllocationStrategy s) {
            return s == MemoryAllocationStrategy::kSTATIC
                // Let TRT pre-allocate and manage the memory.
                ? ExecutionContextAllocationStrategy::kSTATIC
                // Allocate based on the current profile or runtime shapes.
                : ExecutionContextAllocationStrategy::kUSER_MANAGED;
        };
        ec = engine->createExecutionContext(getExecutionContextAllocationStrategy(inference.memoryAllocationStrategy));
        if (ec == nullptr)
        {
            sample::gLogError << "Unable to create execution context for stream " << s << "." << std::endl;
            return false;
        }
        ec->setNvtxVerbosity(inference.nvtxVerbosity);
        int32_t const persistentCacheLimit
            = samplesCommon::getMaxPersistentCacheSize() * inference.persistentCacheRatio;
        sample::gLogInfo << "Setting persistentCacheLimit to " << persistentCacheLimit << " bytes." << std::endl;
        ec->setPersistentCacheLimit(persistentCacheLimit);
        auto setProfile = ec->setOptimizationProfileAsync(inference.optProfileIndex, setOptProfileStream);
        CHECK(cudaStreamSynchronize(setOptProfileStream));
        if (!setProfile)
        {
            sample::gLogError << "Set optimization profile failed. " << std::endl;
            if (inference.infStreams > 1)
            {
                sample::gLogError
                    << "Please ensure that the engine is built with preview feature profileSharing0806 enabled. "
                    << std::endl;
            }
            return false;
        }
        iEnv.contexts.emplace_back(ec);
        iEnv.bindings.emplace_back(std::make_unique<BindingsStd>(useManagedMemory));
    }
    CHECK(cudaStreamDestroy(setOptProfileStream));
    if (iEnv.profiler)
    {
        // Only the first context reports layer timings.
        iEnv.contexts.front()->setProfiler(iEnv.profiler.get());
        // Always run reportToProfiler() after enqueue launch
        iEnv.contexts.front()->setEnqueueEmitsProfile(false);
    }
    int32_t const endBindingIndex = engine->getNbIOTensors();
    // Make sure that the tensor names provided in command-line args actually exist in any of the engine bindings
    // to avoid silent typos.
    if (!validateTensorNames(inference.shapes, engine, endBindingIndex))
    {
        sample::gLogError << "Invalid tensor names found in --shapes flag." << std::endl;
        return false;
    }
    // Resolve runtime shapes and input shape-tensor values for every input tensor.
    for (int32_t b = 0; b < endBindingIndex; ++b)
    {
        auto const& name = engine->getIOTensorName(b);
        auto const& mode = engine->getTensorIOMode(name);
        if (mode == TensorIOMode::kINPUT)
        {
            Dims const dims = iEnv.contexts.front()->getTensorShape(name);
            bool isShapeInferenceIO{false};
            isShapeInferenceIO = engine->isShapeInferenceIO(name);
            bool const hasRuntimeDim = std::any_of(dims.d, dims.d + dims.nbDims, [](int32_t dim) { return dim == -1; });
            auto const shape = findPlausible(inference.shapes, name);
            if (hasRuntimeDim || isShapeInferenceIO)
            {
                // Set shapeData to either dimensions of the input (if it has a dynamic shape)
                // or set to values of the input (if it is an input shape tensor).
                std::vector<int64_t> shapeData;
                if (shape == inference.shapes.end())
                {
                    // No information provided. Use default value for missing data.
                    constexpr int32_t kDEFAULT_VALUE = 1;
                    if (isShapeInferenceIO)
                    {
                        // Set shape tensor to all ones.
                        shapeData.assign(volume(dims, 0, dims.nbDims), kDEFAULT_VALUE);
                        sample::gLogWarning << "Values missing for input shape tensor: " << name
                                            << "Automatically setting values to: " << shapeData << std::endl;
                    }
                    else
                    {
                        // Use default value for unspecified runtime dimensions.
                        shapeData.resize(dims.nbDims);
                        std::transform(dims.d, dims.d + dims.nbDims, shapeData.begin(),
                            [&](int32_t dimension) { return dimension >= 0 ? dimension : kDEFAULT_VALUE; });
                        sample::gLogWarning << "Shape missing for input with dynamic shape: " << name
                                            << "Automatically setting shape to: " << shapeData << std::endl;
                    }
                }
                else if (inference.inputs.count(shape->first) && isShapeInferenceIO)
                {
                    // Load shape tensor from file.
                    int64_t const size = volume(dims, 0, dims.nbDims);
                    shapeData.resize(size);
                    auto const& filename = inference.inputs.at(shape->first);
                    auto dst = reinterpret_cast<char*>(shapeData.data());
                    loadFromFile(filename, dst, size * sizeof(decltype(shapeData)::value_type));
                }
                else
                {
                    // Use the shape/values provided via --shapes.
                    shapeData = shape->second;
                }
                int64_t* shapeTensorData{nullptr};
                if (isShapeInferenceIO)
                {
                    // Save the data in iEnv, in a way that its address does not change
                    // before enqueueV3 is called.
                    DataType const type = engine->getTensorDataType(name);
                    switch (type)
                    {
                    case DataType::kINT64: break;
                    case DataType::kINT32: contractInt64ToInt32(shapeData); break;
                    default:
                        sample::gLogError << "Shape tensor " << name << " has unexpected type " << type << std::endl;
                        return false;
                    }
                    iEnv.inputShapeTensorValues.emplace_back(shapeData);
                    shapeTensorData = iEnv.inputShapeTensorValues.back().data();
                }
                // Apply the resolved shape (or shape-tensor address) to every context.
                for (auto& c : iEnv.contexts)
                {
                    if (isShapeInferenceIO)
                    {
                        sample::gLogInfo << "Set input shape tensor " << name << " to: " << shapeData << std::endl;
                        if (!c->setTensorAddress(name, shapeTensorData))
                        {
                            return false;
                        }
                    }
                    else
                    {
                        sample::gLogInfo << "Set shape of input tensor " << name << " to: " << shapeData << std::endl;
                        if (!c->setInputShape(name, toDims(shapeData)))
                        {
                            return false;
                        }
                    }
                }
            }
            else if (nbOptProfiles && shape != inference.shapes.end())
            {
                // Check if the provided shape matches the static dimensions in the engine.
                for (auto& c : iEnv.contexts)
                {
                    if (!c->setInputShape(name, toDims(shape->second)))
                    {
                        sample::gLogError << "The engine was built with static shapes for input tensor " << name
                                          << " but the provided shapes do not match the static shapes!" << std::endl;
                        return false;
                    }
                }
            }
        }
    }
    // Create Debug Listener and turn on debug states if client requested dumping debug tensors.
    if (!inference.debugTensorFileNames.empty() || !inference.dumpAlldebugTensorFormats.empty())
    {
        iEnv.listener = std::make_unique<DebugTensorWriter>(
            inference.debugTensorFileNames, inference.dumpAlldebugTensorFormats, engine->getName(), iEnv.cmdline);
        iEnv.contexts.front()->setDebugListener(iEnv.listener.get());
        for (auto const& s : inference.debugTensorFileNames)
        {
            iEnv.contexts.front()->setTensorDebugState(s.first.c_str(), true);
        }
        if (!inference.dumpAlldebugTensorFormats.empty())
        {
            iEnv.contexts.front()->setUnfusedTensorsDebugState(true);
        }
    }
    if (!allocateContextMemory(iEnv, inference))
    {
        return false;
    }
    auto const* context = iEnv.contexts.front().get();
    bool fillBindingsSuccess = FillStdBindings(
        engine, context, inference.inputs, iEnv.bindings, 1, endBindingIndex, inference.optProfileIndex)();
    return fillBindingsSuccess;
}
//! \brief Build an inference environment from a serialized engine file and run standard
//!        inference setup for it on the requested device.
//! Failures are logged rather than thrown; callers should verify the environment state.
TaskInferenceEnvironment::TaskInferenceEnvironment(std::string engineFile, InferenceOptions const& inference,
    ReportingOptions const& reporting, int32_t deviceId, int32_t DLACore, int32_t bs)
    : iOptions(inference)
    , rOptions(reporting)
    , device(deviceId)
    , batch(bs)
{
    BuildEnvironment bEnv(/* isSafe */ false, /* versionCompatible */ false, DLACore, "", getTempfileControlDefaults());
    // NOTE(review): the result of loadEngineToBuildEnv is ignored here; a failed load will
    // only surface when setUpStdInference below fails — confirm this is intentional.
    loadEngineToBuildEnv(engineFile, bEnv, sample::gLogError, false);
    iEnv = std::make_unique<InferenceEnvironmentStd>(bEnv);
    CHECK(cudaSetDevice(device));
    SystemOptions system{};
    system.device = device;
    system.DLACore = DLACore;
    if (!setUpStdInference(*iEnv, iOptions, system))
    {
        sample::gLogError << "Inference set up failed" << std::endl;
    }
}
namespace
{
#if defined(__QNX__)
// On QNX, timestamps are plain doubles (milliseconds) derived from the cycle counter.
using TimePoint = double;
#else
using TimePoint = std::chrono::time_point<std::chrono::high_resolution_clock>;
#endif

//! \return the current host timestamp: milliseconds on QNX, a chrono time_point elsewhere.
TimePoint getCurrentTime()
{
#if defined(__QNX__)
    uint64_t const currentCycles = ClockCycles();
    uint64_t const cyclesPerSecond = SYSPAGE_ENTRY(qtime)->cycles_per_sec;
    // Return current timestamp in ms.
    return static_cast<TimePoint>(currentCycles) * 1000. / cyclesPerSecond;
#else
    return std::chrono::high_resolution_clock::now();
#endif
}
//!
//! \struct SyncStruct
//! \brief Threads synchronization structure
//!
struct SyncStruct
{
    std::mutex mutex;         // Guards state shared across per-stream worker threads.
    TrtCudaStream mainStream; // Stream on which the common GPU start event is recorded.
    TrtCudaEvent gpuStart{cudaEventBlockingSync}; // GPU-side start-of-run marker.
    TimePoint cpuStart{};     // CPU-side start-of-run timestamp.
    float sleep{};            // Requested pre-enqueue delay (presumably ms — set from InferenceOptions).
};
//! Base functor holding the execution context shared by the enqueue implementations below.
struct Enqueue
{
    explicit Enqueue(nvinfer1::IExecutionContext& context)
        : mContext(context)
    {
    }
    nvinfer1::IExecutionContext& mContext;
};
#if ENABLE_UNIFIED_BUILDER
//! Base functor holding the safe TRT graph shared by the safe enqueue implementations below.
struct SafeEnqueue
{
    explicit SafeEnqueue(nvinfer2::safe::ITRTGraph& graph)
        : mGraph(graph)
    {
    }
    nvinfer2::safe::ITRTGraph& mGraph;
};
#endif
//!
//! \class EnqueueExplicit
//! \brief Functor to enqueue inference with explict batch
//!
//! Functor that enqueues one inference iteration via enqueueV3. Tensor addresses are
//! bound once at construction; operator() only launches and harvests profiling data.
//! Fix: removed the trailing `return false;` after the try/catch — it was unreachable
//! because both the try and catch paths already return.
class EnqueueExplicit : private Enqueue
{
public:
    explicit EnqueueExplicit(nvinfer1::IExecutionContext& context, BindingsStd const& bindings)
        : Enqueue(context)
        , mBindings(bindings)
    {
        // Bind all IO tensor addresses up front so enqueue is launch-only.
        ASSERT(mBindings.setTensorAddresses(mContext));
    }
    //! \return true when enqueueV3 succeeded on \p stream; false on failure or exception.
    bool operator()(TrtCudaStream& stream) const
    {
        try
        {
            bool const result = mContext.enqueueV3(stream.get());
            // Collecting layer timing info from current profile index of execution context, except under capturing
            // mode.
            if (!isStreamCapturing(stream) && mContext.getProfiler() && !mContext.getEnqueueEmitsProfile()
                && !mContext.reportToProfiler())
            {
                gLogWarning << "Failed to collect layer timing info from previous enqueueV3()" << std::endl;
            }
            return result;
        }
        catch (const std::exception&)
        {
            return false;
        }
    }
private:
    // Helper function to check if a stream is in capturing mode.
    bool isStreamCapturing(TrtCudaStream& stream) const
    {
        cudaStreamCaptureStatus status{cudaStreamCaptureStatusNone};
        CHECK(cudaStreamIsCapturing(stream.get(), &status));
        return status != cudaStreamCaptureStatusNone;
    }
    BindingsStd const& mBindings;
};
#if ENABLE_UNIFIED_BUILDER
//!
//! \class EnqueueExplicitSafe
//! \brief Functor to safeEnqueue inference with explict batch
//!
//! Functor that enqueues one safe-runtime inference iteration via executeAsync.
//! Tensor addresses are bound once at construction.
//! Fix: removed the trailing `return false;` after the try/catch — it was unreachable
//! because both the try and catch paths already return.
class EnqueueExplicitSafe : private SafeEnqueue
{
public:
    explicit EnqueueExplicitSafe(nvinfer2::safe::ITRTGraph& graph, BindingsSafe const& bindings)
        : SafeEnqueue(graph)
        , mBindings(bindings)
    {
        // Bind all IO tensor addresses up front so enqueue is launch-only.
        ASSERT(mBindings.setTensorAddresses(graph));
    }
    //! \return true when executeAsync succeeded on \p stream; false on failure or exception.
    bool operator()(TrtCudaStream& stream) const
    {
        try
        {
            bool const result = (mGraph.executeAsync(stream.get()) == nvinfer1::ErrorCode::kSUCCESS);
            return result;
        }
        catch (const std::exception&)
        {
            return false;
        }
    }
private:
    BindingsSafe const& mBindings;
};
#endif
//!
//! \class EnqueueGraph
//! \brief Functor to enqueue inference from CUDA Graph
//!
//! Functor that launches a pre-captured CUDA graph and harvests layer timing data
//! from the associated execution context when a profiler is attached.
class EnqueueGraph
{
public:
    explicit EnqueueGraph(nvinfer1::IExecutionContext& context, TrtCudaGraph& graph)
        : mGraph(graph)
        , mContext(context)
    {
    }
    //! \return true when the graph launch succeeded on \p stream.
    bool operator()(TrtCudaStream& stream) const
    {
        if (!mGraph.launch(stream))
        {
            return false;
        }
        // Collecting layer timing info from current profile index of execution context
        if (mContext.getProfiler() && !mContext.getEnqueueEmitsProfile() && !mContext.reportToProfiler())
        {
            gLogWarning << "Failed to collect layer timing info from previous CUDA graph launch" << std::endl;
        }
        return true;
    }
    TrtCudaGraph& mGraph;
    nvinfer1::IExecutionContext& mContext;
};
#if ENABLE_UNIFIED_BUILDER
//!
//! \class EnqueueGraphSafe
//! \brief Functor to enqueue inference from CUDA Graph
//!
//! Functor that launches a safe TRT graph asynchronously on a stream.
class EnqueueGraphSafe
{
public:
    explicit EnqueueGraphSafe(nvinfer2::safe::ITRTGraph& graph)
        : mGraph(graph)
    {
    }
    //! \return true when the safe graph execution was successfully enqueued on \p stream.
    bool operator()(TrtCudaStream& stream) const
    {
        return mGraph.executeAsync(stream.get()) == nvinfer1::ErrorCode::kSUCCESS;
    }
    nvinfer2::safe::ITRTGraph& mGraph;
};
#endif
// Type-erased enqueue callable: launches one inference iteration on the given stream.
using EnqueueFunction = std::function<bool(TrtCudaStream&)>;
// One CUDA stream per pipeline stage so H2D copies, compute, and D2H copies can overlap.
enum class StreamType : int32_t
{
    kINPUT = 0,
    kCOMPUTE = 1,
    kOUTPUT = 2,
    kNUM = 3
};
// Start/end event per pipeline stage, used to time each stage on the GPU.
enum class EventType : int32_t
{
    kINPUT_S = 0,
    kINPUT_E = 1,
    kCOMPUTE_S = 2,
    kCOMPUTE_E = 3,
    kOUTPUT_S = 4,
    kOUTPUT_E = 5,
    kNUM = 6
};
using MultiStream = std::array<TrtCudaStream, static_cast<int32_t>(StreamType::kNUM)>;
using MultiEvent = std::array<std::unique_ptr<TrtCudaEvent>, static_cast<int32_t>(EventType::kNUM)>;
// CPU timestamps taken immediately before and after the enqueue call.
using EnqueueTimes = std::array<TimePoint, 2>;
//!
//! \class IterationBase
//! \brief Inference iteration and streams management
//!
class IterationBase
{
public:
    //! Construct per-iteration state: depth is 2 when --overlap is enabled so one
    //! iteration's transfers can overlap the previous iteration's compute.
    explicit IterationBase(int32_t id, InferenceOptions const& inference, BindingsBase& bindings)
        : mBindings(bindings)
        , mStreamId(id)
        , mDepth(1 + inference.overlap)
        , mActive(mDepth)
        , mEvents(mDepth)
        , mEnqueueTimes(mDepth)
    {
        for (auto& eventsAtDepth : mEvents)
        {
            // Events use blocking sync unless --spin was requested (busy-wait sync).
            std::generate(eventsAtDepth.begin(), eventsAtDepth.end(),
                [&] { return std::make_unique<TrtCudaEvent>(!inference.spin); });
        }
    }
bool query(bool skipTransfers)
{
if (mActive[mNext])
{
return true;
}
if (!skipTransfers)
{
record(EventType::kINPUT_S, StreamType::kINPUT);
setInputData(false);
record(EventType::kINPUT_E, StreamType::kINPUT);
wait(EventType::kINPUT_E, StreamType::kCOMPUTE); // Wait for input DMA before compute
}
record(EventType::kCOMPUTE_S, StreamType::kCOMPUTE);
recordEnqueueTime();
if (!mEnqueue(getStream(StreamType::kCOMPUTE)))
{
return false;
}
recordEnqueueTime();
record(EventType::kCOMPUTE_E, StreamType::kCOMPUTE);
if (!skipTransfers)
{
wait(EventType::kCOMPUTE_E, StreamType::kOUTPUT); // Wait for compute before output DMA
record(EventType::kOUTPUT_S, StreamType::kOUTPUT);
fetchOutputData(false);
record(EventType::kOUTPUT_E, StreamType::kOUTPUT);
}
mActive[mNext] = true;
moveNext();
return true;
}
float sync(
TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace, bool skipTransfers)
{
if (mActive[mNext])
{
if (skipTransfers)
{
getEvent(EventType::kCOMPUTE_E).synchronize();
}
else
{
getEvent(EventType::kOUTPUT_E).synchronize();
}
trace.emplace_back(getTrace(cpuStart, gpuStart, skipTransfers));
mActive[mNext] = false;
return getEvent(EventType::kCOMPUTE_S) - gpuStart;
}
return 0;
}
void syncAll(
TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, std::vector<InferenceTrace>& trace, bool skipTransfers)
{
for (int32_t d = 0; d < mDepth; ++d)
{
sync(cpuStart, gpuStart, trace, skipTransfers);
moveNext();
}
}
void wait(TrtCudaEvent& gpuStart)
{
getStream(StreamType::kINPUT).wait(gpuStart);
}
void setInputData(bool sync)
{
mBindings.transferInputToDevice(getStream(StreamType::kINPUT));
// additional sync to avoid overlapping with inference execution.
if (sync)
{
getStream(StreamType::kINPUT).synchronize();
}
}
void fetchOutputData(bool sync)
{
mBindings.transferOutputToHost(getStream(StreamType::kOUTPUT));
// additional sync to avoid overlapping with inference execution.
if (sync)
{
getStream(StreamType::kOUTPUT).synchronize();
}
}
protected:
void moveNext()
{
mNext = mDepth - 1 - mNext;
}
TrtCudaStream& getStream(StreamType t)
{
return mStream[static_cast<int32_t>(t)];
}
TrtCudaEvent& getEvent(EventType t)
{
return *mEvents[mNext][static_cast<int32_t>(t)];
}
void record(EventType e, StreamType s)
{
getEvent(e).record(getStream(s));
}
void recordEnqueueTime()
{
mEnqueueTimes[mNext][enqueueStart] = getCurrentTime();
enqueueStart = 1 - enqueueStart;
}
TimePoint getEnqueueTime(bool start)
{
return mEnqueueTimes[mNext][start ? 0 : 1];
}
void wait(EventType e, StreamType s)
{
getStream(s).wait(getEvent(e));
}
InferenceTrace getTrace(TimePoint const& cpuStart, TrtCudaEvent const& gpuStart, bool skipTransfers)
{
float is
= skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_S) - gpuStart;
float ie
= skipTransfers ? getEvent(EventType::kCOMPUTE_S) - gpuStart : getEvent(EventType::kINPUT_E) - gpuStart;
float os
= skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_S) - gpuStart;
float oe
= skipTransfers ? getEvent(EventType::kCOMPUTE_E) - gpuStart : getEvent(EventType::kOUTPUT_E) - gpuStart;
return InferenceTrace(mStreamId,
std::chrono::duration<float, std::milli>(getEnqueueTime(true) - cpuStart).count(),
std::chrono::duration<float, std::milli>(getEnqueueTime(false) - cpuStart).count(), is, ie,
getEvent(EventType::kCOMPUTE_S) - gpuStart, getEvent(EventType::kCOMPUTE_E) - gpuStart, os, oe);
}
BindingsBase& mBindings;
TrtCudaGraph mGraph;
EnqueueFunction mEnqueue;
int32_t mStreamId{0};
int32_t mNext{0};
int32_t mDepth{2}; // default to double buffer to hide DMA transfers
std::vector<bool> mActive;
MultiStream mStream;
std::vector<MultiEvent> mEvents;
int32_t enqueueStart{0};
std::vector<EnqueueTimes> mEnqueueTimes;
};
//!
//! \class IterationStd
//! \brief Inference iteration and streams management for standard inference
//!
class IterationStd : public IterationBase
{
public:
    explicit IterationStd(
        int32_t id, InferenceOptions const& inference, nvinfer1::IExecutionContext& context, BindingsStd& bindings)
        : IterationBase(id, inference, bindings)
    {
        createEnqueueFunction(inference, context, bindings);
    }

private:
    //! Install the compute enqueue callable. By default this is a plain explicit enqueue;
    //! when --useCudaGraph is requested, attempt to capture the enqueue into a CUDA graph
    //! and launch the graph instead, falling back to plain enqueue if capture fails.
    void createEnqueueFunction(
        InferenceOptions const& inference, nvinfer1::IExecutionContext& context, BindingsStd& bindings)
    {
        mEnqueue = EnqueueFunction(EnqueueExplicit(context, bindings));
        if (inference.graph)
        {
            sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl;
            TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
            // Avoid capturing initialization calls by executing the enqueue function at least
            // once before starting CUDA graph capture.
            auto const ret = mEnqueue(stream);
            if (!ret)
            {
                throw std::runtime_error("Inference enqueue failed.");
            }
            stream.synchronize();
            mGraph.beginCapture(stream);
            // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode.
            // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails.
            if (mEnqueue(stream))
            {
                mGraph.endCapture(stream);
                mEnqueue = EnqueueFunction(EnqueueGraph(context, mGraph));
                sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl;
            }
            else
            {
                mGraph.endCaptureOnError(stream);
                // Ensure any CUDA error has been cleaned up.
                CHECK(cudaGetLastError());
                sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under "
                                       "CUDA graph capture mode."
                                    << std::endl;
                sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be "
                                       "launched without using CUDA graph launch."
                                    << std::endl;
            }
        }
    }
};
#if ENABLE_UNIFIED_BUILDER
//!
//! \class IterationSafe
//! \brief Inference iteration and streams management for safe inference
//!
class IterationSafe : public IterationBase
{
public:
    explicit IterationSafe(
        int32_t id, InferenceOptions const& inference, nvinfer2::safe::ITRTGraph& graph, BindingsSafe& bindings)
        : IterationBase(id, inference, bindings)
    {
        createEnqueueFunction(inference, graph, bindings);
    }

private:
    //! Install the compute enqueue callable for the safe runtime. Mirrors
    //! IterationStd::createEnqueueFunction: plain enqueue by default, CUDA-graph launch
    //! when --useCudaGraph is requested and capture succeeds.
    void createEnqueueFunction(
        InferenceOptions const& inference, nvinfer2::safe::ITRTGraph& graph, BindingsSafe& bindings)
    {
        mEnqueue = EnqueueFunction(EnqueueExplicitSafe(graph, bindings));
        if (inference.graph)
        {
            sample::gLogInfo << "Capturing CUDA graph for the current execution context" << std::endl;
            TrtCudaStream& stream = getStream(StreamType::kCOMPUTE);
            // Avoid capturing initialization calls by executing the enqueue function at least
            // once before starting CUDA graph capture.
            auto const ret = mEnqueue(stream);
            if (!ret)
            {
                throw std::runtime_error("Inference enqueue failed.");
            }
            stream.synchronize();
            mGraph.beginCapture(stream);
            // The built TRT engine may contain operations that are not permitted under CUDA graph capture mode.
            // When the stream is capturing, the enqueue call may return false if the current CUDA graph capture fails.
            if (mEnqueue(stream))
            {
                mGraph.endCapture(stream);
                mEnqueue = EnqueueFunction(EnqueueGraphSafe(graph));
                sample::gLogInfo << "Successfully captured CUDA graph for the current execution context" << std::endl;
            }
            else
            {
                mGraph.endCaptureOnError(stream);
                // Ensure any CUDA error has been cleaned up.
                CHECK(cudaGetLastError());
                sample::gLogWarning << "The built TensorRT engine contains operations that are not permitted under "
                                       "CUDA graph capture mode."
                                    << std::endl;
                sample::gLogWarning << "The specified --useCudaGraph flag has been ignored. The inference will be "
                                       "launched without using CUDA graph launch."
                                    << std::endl;
            }
        }
    }
};
#endif
//! Drives the benchmark: repeatedly enqueues and syncs iterations on every stream until
//! both the requested iteration count and measurement duration are satisfied (warmup
//! iterations are not counted). A duration of -1 means run forever.
//! \return false as soon as any enqueue fails, true otherwise.
bool inferenceLoop(std::vector<std::unique_ptr<IterationBase>>& iStreams, TimePoint const& cpuStart,
    TrtCudaEvent const& gpuStart, int iterations, float maxDurationMs, float warmupMs,
    std::vector<InferenceTrace>& trace, bool skipTransfers, float idleMs)
{
    float elapsedMs = 0;
    int32_t warmupSkips = 0;

    // Enqueue one iteration on every stream; false signals an enqueue failure.
    auto enqueueAll = [&]() -> bool {
        for (auto& s : iStreams)
        {
            if (!s->query(skipTransfers))
            {
                return false;
            }
        }
        return true;
    };

    if (maxDurationMs == -1.F)
    {
        sample::gLogWarning << "--duration=-1 is specified, inference will run in an endless loop until"
                            << " aborted with CTRL-C (SIGINT)" << std::endl;
        while (true)
        {
            if (!enqueueAll())
            {
                return false;
            }
            for (auto& s : iStreams)
            {
                s->sync(cpuStart, gpuStart, trace, skipTransfers);
            }
        }
    }

    for (int32_t i = 0; i < iterations + warmupSkips || elapsedMs < maxDurationMs; ++i)
    {
        if (!enqueueAll())
        {
            return false;
        }
        for (auto& s : iStreams)
        {
            elapsedMs = std::max(elapsedMs, s->sync(cpuStart, gpuStart, trace, skipTransfers));
        }
        if (elapsedMs < warmupMs) // Warming up
        {
            if (elapsedMs != 0.F) // Skip complete iterations
            {
                ++warmupSkips;
            }
            continue;
        }
        if (idleMs != 0.F)
        {
            std::this_thread::sleep_for(std::chrono::duration<float, std::milli>(idleMs));
        }
    }

    // Drain every stream's outstanding iterations before returning.
    for (auto& s : iStreams)
    {
        s->syncAll(cpuStart, gpuStart, trace, skipTransfers);
    }
    return true;
}
//! Thread entry point: runs the benchmark loop for `streamsPerThread` streams on `device`,
//! then appends the locally collected traces to \p trace under \p sync's mutex. Any
//! failure (including an escaping exception) is reported by setting iEnv.error; the
//! function itself never throws.
void inferenceExecution(InferenceOptions const& inference, InferenceEnvironmentBase& iEnv, SyncStruct& sync,
    int32_t const threadIdx, int32_t const streamsPerThread, int32_t device, std::vector<InferenceTrace>& trace,
    ReportingOptions const& reporting) noexcept
{
    try
    {
        // Total measurement budget: warmup plus the requested duration (-1 keeps it unlimited).
        float warmupMs = inference.warmup;
        float durationMs = -1.F;
        if (inference.duration != -1.F)
        {
            durationMs = inference.duration * 1000.F + warmupMs;
        }
        CHECK(cudaSetDevice(device));
#if ENABLE_UNIFIED_BUILDER
        if (iEnv.safe)
        {
            //! Function to make one iteration:
            auto makeIteration = [&](int32_t s) -> std::unique_ptr<IterationSafe> {
                // Global stream index across all threads.
                int32_t const streamId{threadIdx * streamsPerThread + s};
                auto iteration = std::make_unique<IterationSafe>(streamId, inference,
                    *static_cast<InferenceEnvironmentSafe&>(iEnv).mClonedGraphs[streamId],
                    *static_cast<InferenceEnvironmentSafe&>(iEnv).bindings[streamId]);
                if (inference.skipTransfers)
                {
                    // Upload inputs once, synchronously, since the loop will not do transfers.
                    iteration->setInputData(true);
                }
                return iteration;
            };
            std::vector<std::unique_ptr<IterationBase>> iStreams;
            for (int32_t s = 0; s < streamsPerThread; ++s)
            {
                iStreams.emplace_back(makeIteration(s));
            }
            // Align all streams on the shared GPU start event before timing begins.
            for (auto& s : iStreams)
            {
                s->wait(sync.gpuStart);
            }
            std::vector<InferenceTrace> localTrace;
            if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs,
                    localTrace, inference.skipTransfers, inference.idle))
            {
                std::lock_guard<std::mutex> lock{sync.mutex};
                iEnv.error = true;
            }
            if (inference.skipTransfers)
            {
                // Download outputs once after timing so results remain inspectable.
                for (auto& s : iStreams)
                {
                    s->fetchOutputData(true);
                }
            }
            std::lock_guard<std::mutex> lock{sync.mutex};
            trace.insert(trace.end(), localTrace.begin(), localTrace.end());
            return;
        }
#endif
        //! Function to make one iteration:
        auto makeIteration = [&](int32_t s) -> std::unique_ptr<IterationStd> {
            // Global stream index across all threads.
            int32_t const streamId{threadIdx * streamsPerThread + s};
            auto iteration = std::make_unique<IterationStd>(streamId, inference,
                *static_cast<InferenceEnvironmentStd&>(iEnv).getContext(streamId),
                *static_cast<InferenceEnvironmentStd&>(iEnv).bindings[streamId]);
            if (inference.skipTransfers)
            {
                // Upload inputs once, synchronously, since the loop will not do transfers.
                iteration->setInputData(true);
            }
            return iteration;
        };
        std::vector<std::unique_ptr<IterationBase>> iStreams;
        for (int32_t s = 0; s < streamsPerThread; ++s)
        {
            iStreams.emplace_back(makeIteration(s));
        }
        // Align all streams on the shared GPU start event before timing begins.
        for (auto& s : iStreams)
        {
            s->wait(sync.gpuStart);
        }
        std::vector<InferenceTrace> localTrace;
        if (!inferenceLoop(iStreams, sync.cpuStart, sync.gpuStart, inference.iterations, durationMs, warmupMs,
                localTrace, inference.skipTransfers, inference.idle))
        {
            std::lock_guard<std::mutex> lock{sync.mutex};
            iEnv.error = true;
        }
        // Only pay for the final D2H copy if the report actually needs output values.
        auto const needOutput = reporting.output || !reporting.exportOutput.empty();
        if (inference.skipTransfers && needOutput)
        {
            for (auto& s : iStreams)
            {
                s->fetchOutputData(true);
            }
        }
        {
            std::lock_guard<std::mutex> lock{sync.mutex};
            trace.insert(trace.end(), localTrace.begin(), localTrace.end());
        }
    }
    catch (...)
    {
        // Never let an exception escape a noexcept thread entry point; surface it as an error flag.
        std::lock_guard<std::mutex> lock{sync.mutex};
        iEnv.error = true;
    }
}
//! Spawn a worker thread running inferenceExecution. The referenced arguments must
//! outlive the returned thread; callers join before they go out of scope.
inline std::thread makeThread(InferenceOptions const& inference, InferenceEnvironmentBase& iEnv, SyncStruct& sync,
    int32_t threadIdx, int32_t streamsPerThread, int32_t device, std::vector<InferenceTrace>& trace,
    ReportingOptions const& reporting)
{
    return std::thread([&inference, &iEnv, &sync, threadIdx, streamsPerThread, device, &trace, &reporting] {
        inferenceExecution(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace, reporting);
    });
}
} // namespace
//! Runs the full benchmark: spawns the worker threads, joins them, and sorts the merged
//! trace by H2D start time.
//! \return true if no worker reported an error.
bool runInference(InferenceOptions const& inference, InferenceEnvironmentBase& iEnv, int32_t device,
    std::vector<InferenceTrace>& trace, ReportingOptions const& reporting)
{
    CHECK(cudaProfilerStart());
    trace.resize(0);

    // Shared timing anchors: a CPU timestamp and a GPU event recorded on the main stream.
    SyncStruct sync;
    sync.sleep = inference.sleep;
    sync.mainStream.sleep(&sync.sleep);
    sync.cpuStart = getCurrentTime();
    sync.gpuStart.record(sync.mainStream);

    // When multiple streams are used, trtexec can run inference in two modes:
    // (1) if inference.threads is true, then run each stream on each thread.
    // (2) if inference.threads is false, then run all streams on the same thread.
    int32_t const numThreads = inference.threads ? inference.infStreams : 1;
    int32_t const streamsPerThread = inference.threads ? 1 : inference.infStreams;

    std::vector<std::thread> threads;
    threads.reserve(numThreads);
    for (int32_t threadIdx = 0; threadIdx < numThreads; ++threadIdx)
    {
        threads.emplace_back(makeThread(inference, iEnv, sync, threadIdx, streamsPerThread, device, trace, reporting));
    }
    for (auto& th : threads)
    {
        th.join();
    }
    CHECK(cudaProfilerStop());

    // Present traces in H2D-start order for reporting.
    std::sort(trace.begin(), trace.end(),
        [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; });
    return !iEnv.error;
}
bool runMultiTasksInference(std::vector<std::unique_ptr<TaskInferenceEnvironment>>& tEnvList)
{
CHECK(cudaProfilerStart());
cudaSetDeviceFlags(cudaDeviceScheduleSpin);
SyncStruct sync;
sync.sleep = 0;
sync.mainStream.sleep(&sync.sleep);
sync.cpuStart = getCurrentTime();
sync.gpuStart.record(sync.mainStream);
std::vector<std::thread> threads;
for (size_t i = 0; i < tEnvList.size(); ++i)
{
auto& tEnv = tEnvList[i];
threads.emplace_back(makeThread(
tEnv->iOptions, *(tEnv->iEnv), sync, /*threadIdx*/ 0, /*streamsPerThread*/ 1, tEnv->device, tEnv->trace,
tEnv->rOptions));
}
for (auto& th : threads)
{
th.join();
}
CHECK(cudaProfilerStop());
auto cmpTrace = [](InferenceTrace const& a, InferenceTrace const& b) { return a.h2dStart < b.h2dStart; };
for (auto& tEnv : tEnvList)
{
std::sort(tEnv->trace.begin(), tEnv->trace.end(), cmpTrace);
}
return std::none_of(tEnvList.begin(), tEnvList.end(),
[](std::unique_ptr<TaskInferenceEnvironment>& tEnv) { return tEnv->iEnv->error; });
}
namespace
{
//! Logs current free/total GPU memory and, after the first call, how much was newly
//! allocated since the previous call.
//! \return bytes newly allocated since the previous call (0 on the first call, or when
//!         free memory did not decrease).
size_t reportGpuMemory()
{
    static size_t prevFree{0};
    size_t free{0};
    size_t total{0};
    size_t newlyAllocated{0};
    CHECK(cudaMemGetInfo(&free, &total));
    sample::gLogInfo << "Free GPU memory = " << free / 1024.0_MiB << " GiB";
    if (prevFree != 0)
    {
        // Guard against unsigned underflow: free memory may have *increased* since the
        // last call (e.g. another process released memory), and prevFree - free would
        // then wrap around to a huge bogus value.
        newlyAllocated = prevFree > free ? prevFree - free : 0;
        sample::gLogInfo << ", newly allocated GPU memory = " << newlyAllocated / 1024.0_MiB << " GiB";
    }
    sample::gLogInfo << ", total GPU memory = " << total / 1024.0_MiB << " GiB" << std::endl;
    prevFree = free;
    return newlyAllocated;
}
} // namespace
//! Returns true if deserialization is slower than expected or fails.
//! Times engine deserialization over kNB_ITERS runs (after a 2-run cache warmup) and
//! compares the first timed run against the average; also reports GPU memory growth
//! and derived deserialization bandwidth.
bool timeDeserialize(InferenceEnvironmentBase& iEnv, SystemOptions const& sys)
{
    constexpr int32_t kNB_ITERS{20};
    std::unique_ptr<IRuntime> rt{createRuntime()};
    std::unique_ptr<ICudaEngine> engine;
    SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
    // Deserialize the engine once and return the wall-clock time in milliseconds,
    // or NAN on failure. The previous engine is destroyed first so every run measures
    // a fresh deserialization.
    auto timeDeserializeFn = [&]() -> float {
        bool deserializeOK{false};
        engine.reset(nullptr);
        auto startClock = std::chrono::high_resolution_clock::now();
        SMP_RETVAL_IF_FALSE(!iEnv.safe, "Safe inference is not supported!", false, sample::gLogError);
        for (auto const& pluginPath : sys.dynamicPlugins)
        {
            rt->getPluginRegistry().loadLibrary(pluginPath.c_str());
        }
        // Rewind whichever reader backs the serialized engine before re-reading it.
        auto& reader = iEnv.engine.getFileReader();
        auto& asyncReader = iEnv.engine.getAsyncFileReader();
        ASSERT(reader.isOpen() || asyncReader.isOpen());
        if (asyncReader.isOpen())
        {
            asyncReader.reset();
            engine.reset(rt->deserializeCudaEngine(asyncReader));
        }
        else
        {
            reader.reset();
            engine.reset(rt->deserializeCudaEngine(reader));
        }
        deserializeOK = (engine != nullptr);
        auto endClock = std::chrono::high_resolution_clock::now();
        // return NAN if deserialization failed.
        return deserializeOK ? std::chrono::duration<float, std::milli>(endClock - startClock).count() : NAN;
    };
    // Warmup the caches to make sure that cache thrashing isn't throwing off the results
    {
        sample::gLogInfo << "Begin deserialization warmup..." << std::endl;
        for (int32_t i = 0, e = 2; i < e; ++i)
        {
            timeDeserializeFn();
        }
    }
    sample::gLogInfo << "Begin deserialization engine timing..." << std::endl;
    float const first = timeDeserializeFn();
    // Check if first deserialization succeeded.
    if (std::isnan(first))
    {
        sample::gLogError << "Engine deserialization failed." << std::endl;
        return true;
    }
    sample::gLogInfo << "First deserialization time = " << first << " milliseconds" << std::endl;
    // Record initial gpu memory state.
    reportGpuMemory();
    float totalTime{0.F};
    for (int32_t i = 0; i < kNB_ITERS; ++i)
    {
        totalTime += timeDeserializeFn();
    }
    auto const averageTime = totalTime / kNB_ITERS;
    // reportGpuMemory sometimes reports zero after a single deserialization of a small engine,
    // so use the size of memory for all the iterations.
    auto const totalEngineSizeGpu = reportGpuMemory();
    sample::gLogInfo << "Total deserialization time = " << totalTime << " milliseconds in " << kNB_ITERS
                     << " iterations, average time = " << averageTime << " milliseconds, first time = " << first
                     << " milliseconds." << std::endl;
    sample::gLogInfo << "Deserialization Bandwidth = " << 1E-6 * totalEngineSizeGpu / totalTime << " GB/s" << std::endl;
    // If the first deserialization is more than tolerance slower than
    // the average deserialization, return true, which means an error occurred.
    // The tolerance is set to 2x since the deserialization time is quick and susceptible
    // to caching issues causing problems in the first timing.
    auto const tolerance = 2.0F;
    bool const isSlowerThanExpected = first > averageTime * tolerance;
    if (isSlowerThanExpected)
    {
        sample::gLogInfo << "First deserialization time divided by average time is " << (first / averageTime)
                         << ". Exceeds tolerance of " << tolerance << "x." << std::endl;
    }
    return isSlowerThanExpected;
}
//! Queries the engine inspector for layer information in the requested \p format.
//! If \p context is non-null, the report reflects that context's current state.
std::string getLayerInformation(
    nvinfer1::ICudaEngine* engine, nvinfer1::IExecutionContext* context, nvinfer1::LayerInformationFormat format)
{
    auto runtime = std::unique_ptr<IRuntime>{createRuntime()};
    auto inspector = std::unique_ptr<IEngineInspector>(engine->createEngineInspector());
    if (context != nullptr)
    {
        inspector->setExecutionContext(context);
    }
    return inspector->getEngineInformation(format);
}
//! Populates this binding's host-side buffer with the raw contents of \p fileName.
void Binding::fill(std::string const& fileName)
{
    loadFromFile(fileName, static_cast<char*>(buffer->getHostBuffer()), buffer->getSize());
}
//! Fills this binding's host-side buffer with pseudo-random values appropriate for its
//! data type: the full representable range for integer types, [-1, 1] for floating-point
//! types, and raw bytes for packed sub-byte types.
void Binding::fill()
{
    switch (dataType)
    {
    case nvinfer1::DataType::kBOOL:
    {
        fillBuffer<bool>(buffer->getHostBuffer(), volume, 0, 1);
        break;
    }
    case nvinfer1::DataType::kINT32:
    {
        fillBuffer<int32_t>(buffer->getHostBuffer(), volume, -128, 127);
        break;
    }
    case nvinfer1::DataType::kINT64:
    {
        fillBuffer<int64_t>(buffer->getHostBuffer(), volume, -128, 127);
        break;
    }
    case nvinfer1::DataType::kINT8:
    {
        fillBuffer<int8_t>(buffer->getHostBuffer(), volume, -128, 127);
        break;
    }
    case nvinfer1::DataType::kFLOAT:
    {
        fillBuffer<float>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
    case nvinfer1::DataType::kHALF:
    {
        fillBuffer<__half>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
    case nvinfer1::DataType::kBF16:
    {
        fillBuffer<BFloat16>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
    case nvinfer1::DataType::kUINT8:
    {
        fillBuffer<uint8_t>(buffer->getHostBuffer(), volume, 0, 255);
        break;
    }
    // __nv_fp8_e4m3 requires CUDA 11.6+ headers; without them FP8 tensors cannot be filled.
    case nvinfer1::DataType::kFP8:
#if CUDA_VERSION < 11060
        ASSERT(false && "FP8 is not supported");
#else
    {
        fillBuffer<__nv_fp8_e4m3>(buffer->getHostBuffer(), volume, -1.0F, 1.0F);
        break;
    }
#endif
    case nvinfer1::DataType::kINT4:
    {
        // int4 is implemented as packing two elements into a single byte,
        // so all possible bit patterns of the two int4 elements coincides with all possible bit patterns of
        // an uint8.
        fillBuffer<uint8_t>(buffer->getHostBuffer(), volume, 0, 255);
        break;
    }
    case DataType::kFP4: ASSERT(false && "FP4 is not supported");
    case DataType::kE8M0: ASSERT(false && "E8M0 is not supported");
    }
}
//! Prints the binding's host-side values to \p os, formatted according to the tensor's
//! shape, strides, and vectorization info. When an output allocator is present (data-
//! dependent shapes), its buffer and final dimensions take precedence over \p dims.
//! \param vectorDim index of the vectorized dimension, or -1 if none
//! \param spv       scalars per vector along the vectorized dimension
void Binding::dump(std::ostream& os, Dims dims, Dims strides, int32_t vectorDim, int32_t spv,
    std::string const separator /*= " "*/) const
{
    void* outputBuffer{};
    if (outputAllocator != nullptr)
    {
        outputBuffer = outputAllocator->getBuffer()->getHostBuffer();
        // Overwrite dimensions with those reported by the output allocator.
        dims = outputAllocator->getFinalDims();
        os << "Final shape is " << dims << " reported by the output allocator." << std::endl;
    }
    else
    {
        outputBuffer = buffer->getHostBuffer();
    }
    switch (dataType)
    {
    case nvinfer1::DataType::kBOOL:
    {
        dumpBuffer<bool>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kINT32:
    {
        dumpBuffer<int32_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kINT8:
    {
        dumpBuffer<int8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kFLOAT:
    {
        dumpBuffer<float>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kHALF:
    {
        dumpBuffer<__half>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kBF16:
    {
        dumpBuffer<BFloat16>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kUINT8:
    {
        dumpBuffer<uint8_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kINT64:
    {
        dumpBuffer<int64_t>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    // __nv_fp8_e4m3 requires CUDA 11.6+ headers; without them FP8 tensors cannot be dumped.
    case nvinfer1::DataType::kFP8:
#if CUDA_VERSION < 11060
        ASSERT(false && "FP8 is not supported");
#else
    {
        dumpBuffer<__nv_fp8_e4m3>(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
#endif
    case nvinfer1::DataType::kINT4:
    {
        // Two int4 elements are packed per byte; a dedicated helper unpacks them.
        dumpInt4Buffer(outputBuffer, separator, os, dims, strides, vectorDim, spv);
        break;
    }
    case nvinfer1::DataType::kFP4: ASSERT(false && "FP4 is not supported");
    case nvinfer1::DataType::kE8M0: ASSERT(false && "E8M0 is not supported");
    }
}
//! Registers the tensor described by \p tensorInfo at its binding index, allocating
//! host/device storage and, for inputs, filling the data (from \p fileName when given,
//! otherwise with generated values).
void BindingsBase::addBinding(TensorInfo const& tensorInfo, std::string const& fileName /*= ""*/)
{
    auto const b = tensorInfo.bindingIndex;
    // Grow the binding tables so index b is valid.
    while (mBindings.size() <= static_cast<size_t>(b))
    {
        mBindings.emplace_back();
        mDevicePointers.emplace_back();
    }
    mNames[tensorInfo.name] = b;
    mBindings[b].isInput = tensorInfo.isInput;
    mBindings[b].volume = tensorInfo.vol;
    mBindings[b].dataType = tensorInfo.dataType;
    //! Make a UnifiedMirroredBuffer if useManaged or Discrete otherwise:
    auto makeBuffer = [](bool useManaged) -> std::unique_ptr<IMirroredBuffer> {
        if (useManaged)
        {
            return std::make_unique<UnifiedMirroredBuffer>();
        }
        else
        {
            return std::make_unique<DiscreteMirroredBuffer>();
        }
    };
    if (tensorInfo.isDynamic)
    {
        ASSERT(!tensorInfo.isInput); // Only output shape can be possibly unknown because of DDS.
        // Dynamic-shape outputs are allocated lazily by an output allocator instead of up front.
        if (mBindings[b].outputAllocator == nullptr)
        {
            mBindings[b].outputAllocator = std::make_unique<OutputAllocator>(makeBuffer(mUseManaged));
        }
    }
    else
    {
        if (mBindings[b].buffer == nullptr)
        {
            mBindings[b].buffer = makeBuffer(mUseManaged);
        }
        // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr
        // even for empty tensors, so allocate a dummy byte.
        if (tensorInfo.vol == 0)
        {
            mBindings[b].buffer->allocate(1);
        }
        else
        {
            mBindings[b].buffer->allocate(samplesCommon::getNbBytes(tensorInfo.dataType, tensorInfo.vol));
        }
        mDevicePointers[b] = mBindings[b].buffer->getDeviceBuffer();
    }
    if (tensorInfo.isInput)
    {
        if (fileName.empty())
        {
            fill(b);
        }
        else
        {
            fill(b, fileName);
        }
    }
}
//! Returns the array of device pointers, indexed by binding index (non-owning view).
void** BindingsBase::getDeviceBuffers()
{
    return mDevicePointers.data();
}
//! Enqueue host-to-device copies for every input binding on \p stream.
void BindingsBase::transferInputToDevice(TrtCudaStream& stream)
{
    for (auto const& entry : mNames)
    {
        auto& binding = mBindings[entry.second];
        if (binding.isInput)
        {
            binding.buffer->hostToDevice(stream);
        }
    }
}
//! Enqueue device-to-host copies for every output binding on \p stream. Outputs backed
//! by an output allocator (data-dependent shapes) copy from the allocator's buffer.
void BindingsBase::transferOutputToHost(TrtCudaStream& stream)
{
    for (auto const& entry : mNames)
    {
        auto& binding = mBindings[entry.second];
        if (binding.isInput)
        {
            continue;
        }
        if (binding.outputAllocator != nullptr)
        {
            binding.outputAllocator->getBuffer()->deviceToHost(stream);
        }
        else
        {
            binding.buffer->deviceToHost(stream);
        }
    }
}
//! Prints the values of binding index \p binding to \p os, using the shape, strides, and
//! vectorization info the execution context reports for that tensor.
void BindingsStd::dumpBindingValues(nvinfer1::IExecutionContext const& context, int32_t binding, std::ostream& os,
    std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const
{
    auto const tensorName = context.getEngine().getIOTensorName(binding);
    Dims dims = context.getTensorShape(tensorName);
    Dims strides = context.getTensorStrides(tensorName);
    int32_t vectorDim = context.getEngine().getTensorVectorizedDim(tensorName);
    int32_t const spv = context.getEngine().getTensorComponentsPerElement(tensorName);
    mBindings[binding].dump(os, dims, strides, vectorDim, spv, separator);
}
namespace
{
//! Shape of tensor \p name as currently reported by the execution context.
Dims getBindingDimensions(nvinfer1::IExecutionContext const& context, std::string const& name)
{
    return context.getTensorShape(name.c_str());
}
} // namespace
//! Writes every binding's host-side contents to a raw binary file named
//! "<name>.<input|output>.<d0.d1...>.<dtype>.raw" (sanitized for the filesystem),
//! logging each file written to \p os.
void BindingsStd::dumpRawBindingToFiles(nvinfer1::IExecutionContext const& context, std::ostream& os) const
{
    os << "Dumping I/O Bindings to RAW Files:" << std::endl;
    for (auto const& n : mNames)
    {
        auto const& tensorName = n.first;
        auto const& binding = mBindings[n.second];

        // Prefer the output allocator's buffer when present (data-dependent shapes).
        void* hostData = (binding.outputAllocator != nullptr)
            ? binding.outputAllocator->getBuffer()->getHostBuffer()
            : binding.buffer->getHostBuffer();

        // Render dimensions as "d0.d1.....dn".
        Dims const dims = getBindingDimensions(context, tensorName);
        std::string dimsStr;
        for (int32_t i = 0; i < dims.nbDims; i++)
        {
            if (i > 0)
            {
                dimsStr += ".";
            }
            dimsStr += std::to_string(dims.d[i]);
        }

        std::string const bindingTypeStr = (binding.isInput ? "input" : "output");
        std::stringstream fileNameStream;
        fileNameStream << tensorName << "." << bindingTypeStr << "." << dimsStr << "." << binding.dataType << ".raw";
        std::string const fileName = genFilenameSafeString(fileNameStream.str());
        os << "Writing file for " << bindingTypeStr << " binding " << tensorName << " (with datatype "
           << binding.dataType << " and dimensions " << dimsStr << ") to " << fileName << std::endl;

        std::ofstream f(fileName, std::ios::out | std::ios::binary);
        ASSERT(f && "Cannot open file for write");
        f.write(static_cast<char*>(hostData), samplesCommon::getNbBytes(binding.dataType, binding.volume));
        f.close();
    }
}
//! Writes the current shape of tensor \p name to \p os.
void BindingsStd::dumpBindingDimensions(
    std::string const& name, nvinfer1::IExecutionContext const& context, std::ostream& os) const
{
    auto const dims = context.getTensorShape(name.c_str());
    // Do not add a newline terminator, because the caller may be outputting a JSON string.
    os << dims;
}
std::unordered_map<std::string, int> BindingsBase::getBindings(std::function<bool(Binding const&)> predicate) const
{
std::unordered_map<std::string, int> bindings;
for (auto const& n : mNames)
{
auto const binding = n.second;
if (predicate(mBindings[binding]))
{
bindings.insert(n);
}
}
return bindings;
}
//! Binds every device-resident tensor to the execution context: outputs with an output
//! allocator (data-dependent shapes) get the allocator, all others get their device
//! pointer. Host-located tensors are left untouched.
//! \return false as soon as any context call fails.
bool BindingsStd::setTensorAddresses(nvinfer1::IExecutionContext& context) const
{
    for (auto const& b : mNames)
    {
        char const* const name = b.first.c_str();
        if (context.getEngine().getTensorLocation(name) != TensorLocation::kDEVICE)
        {
            continue;
        }
        auto const& binding = mBindings[b.second];
        bool const ok = (binding.outputAllocator != nullptr)
            ? context.setOutputAllocator(name, binding.outputAllocator.get())
            : context.setTensorAddress(name, mDevicePointers[b.second]);
        if (!ok)
        {
            return false;
        }
    }
    return true;
}
#if ENABLE_UNIFIED_BUILDER
namespace
{
//! Shape of I/O tensor \p name as reported by the safe-runtime graph's tensor descriptor.
Dims getBindingDimensions(ITRTGraph& graph, std::string const& name)
{
    nvinfer2::safe::TensorDescriptor desc;
    graph.getIOTensorDescriptor(desc, name.c_str());
    return desc.shape;
}
} // namespace
//! Writes the shape of tensor \p name to \p os.
//! NOTE(review): the const_cast is presumably needed because getIOTensorDescriptor is
//! non-const on ITRTGraph — confirm against the safe-runtime API.
void BindingsSafe::dumpBindingDimensions(std::string const& name, ITRTGraph const& graph, std::ostream& os) const
{
    // Do not add a newline terminator, because the caller may be outputting a JSON string.
    os << getBindingDimensions(const_cast<ITRTGraph&>(graph), name);
}
//! Prints the values of binding index \p binding using the shape and strides from the
//! graph's tensor descriptor. Vectorization info is not used here (passed as -1/-1);
//! the commented-out lines show where it would come from if enabled.
void BindingsSafe::dumpBindingValues(ITRTGraph const& graph, int32_t binding, std::ostream& os,
    std::string const& separator /*= " "*/, int32_t batch /*= 1*/) const
{
    char const* tensorName;
    graph.getIOTensorName(tensorName, binding);
    nvinfer2::safe::TensorDescriptor desc;
    graph.getIOTensorDescriptor(desc, tensorName);
    Dims dims = desc.shape;
    Dims strides = desc.stride;
    // int32_t vectorDim = desc.vectorizedDim;
    // int32_t const spv = desc.componentsPerVector;
    mBindings[binding].dump(os, dims, strides, -1, -1, separator);
}
//! Writes every binding's host-side contents to a raw binary file, logging each file to
//! \p os. Unlike BindingsStd::dumpRawBindingToFiles, only the tensor name component is
//! sanitized here, not the full assembled filename.
void BindingsSafe::dumpRawBindingToFiles(ITRTGraph& graph, std::ostream& os) const
{
    os << "Dumping I/O Bindings to RAW Files:" << std::endl;
    for (auto const& n : mNames)
    {
        auto name = n.first;
        auto bIndex = n.second;
        auto const& binding = mBindings[bIndex];
        // Prefer the output allocator's buffer when present (data-dependent shapes).
        void* outputBuffer{};
        if (binding.outputAllocator != nullptr)
        {
            outputBuffer = binding.outputAllocator->getBuffer()->getHostBuffer();
        }
        else
        {
            outputBuffer = binding.buffer->getHostBuffer();
        }
        // Render dimensions as "d0.d1.....dn".
        Dims dims = getBindingDimensions(graph, name);
        std::string dimsStr;
        std::string dotStr;
        for (int32_t i = 0; i < dims.nbDims; i++)
        {
            dimsStr += dotStr + std::to_string(dims.d[i]);
            dotStr = ".";
        }
        std::string const bindingTypeStr = (binding.isInput ? "input" : "output");
        std::stringstream fileName;
        fileName << genFilenameSafeString(name) << "." << bindingTypeStr << "." << dimsStr << "." << binding.dataType
                 << ".raw";
        os << "Writing file for " << bindingTypeStr << " binding " << name << " (with datatype " << binding.dataType
           << " and dimensions " << dimsStr << ") to " << fileName.str() << std::endl;
        std::ofstream f(fileName.str(), std::ios::out | std::ios::binary);
        ASSERT(f && "Cannot open file for write");
        f.write(static_cast<char*>(outputBuffer), samplesCommon::getNbBytes(binding.dataType, binding.volume));
        f.close();
    }
}
//! Binds every GPU-resident (or placement-unspecified) tensor's storage to the safe graph
//! as a TypedArray. Host-only tensors are skipped. Always returns true.
bool BindingsSafe::setTensorAddresses(ITRTGraph& graph) const
{
    for (auto const& b : mNames)
    {
        auto const name = b.first.c_str();
        nvinfer2::safe::TensorDescriptor desc;
        graph.getIOTensorDescriptor(desc, name);
        // kNONE is treated as GPU-placeable as well.
        bool onGpu = desc.memPlacement == nvinfer2::safe::MemoryPlacement::kGPU
            || desc.memPlacement == nvinfer2::safe::MemoryPlacement::kNONE;
        if (onGpu)
        {
            if (mBindings[b.second].outputAllocator != nullptr)
            {
                // NOTE(review): this passes the buffer object returned by getBuffer(),
                // while the branch below passes a raw device pointer; confirm
                // createTypedArray expects the buffer object here rather than
                // getBuffer()->getDeviceBuffer().
                nvinfer2::safe::TypedArray tensor = safe::createTypedArray(
                    mBindings[b.second].outputAllocator->getBuffer(), desc.dataType, desc.sizeInBytes);
                graph.setIOTensorAddress(name, tensor);
            }
            else
            {
                nvinfer2::safe::TypedArray tensor
                    = safe::createTypedArray(mDevicePointers[b.second], desc.dataType, desc.sizeInBytes);
                graph.setIOTensorAddress(name, tensor);
            }
        }
    }
    return true;
}
#endif
} // namespace sample