// Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
//
// NVIDIA CORPORATION and its licensors retain all intellectual property
// and proprietary rights in and to this software, related documentation
// and any modifications thereto. Any use, reproduction, disclosure or
// distribution of this software and related documentation without an express
// license agreement from NVIDIA CORPORATION is strictly prohibited.
#include "antialias.h"
//------------------------------------------------------------------------
// Helpers.
#define F32_MAX (3.402823466e+38f)
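// same_sign() compares IEEE-754 sign bits directly: XORing the bit patterns
// yields a non-negative integer exactly when the two floats share a sign.
// rational_gt() evaluates n0/d0 > n1/d1 without dividing, by cross-multiplying
// and flipping the comparison when the denominators differ in sign.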
static __forceinline__ __device__ bool same_sign(float a, float b) { return (__float_as_int(a) ^ __float_as_int(b)) >= 0; }
static __forceinline__ __device__ bool rational_gt(float n0, float n1, float d0, float d1) { return (n0*d1 > n1*d0) == same_sign(d0, d1); }
static __forceinline__ __device__ int max_idx3(float n0, float n1, float n2, float d0, float d1, float d2)
{
bool g10 = rational_gt(n1, n0, d1, d0);
bool g20 = rational_gt(n2, n0, d2, d0);
bool g21 = rational_gt(n2, n1, d2, d1);
if (g20 && g21) return 2;
if (g10) return 1;
return 0;
}
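// max_idx3() returns the index of the largest ratio n_i/d_i. For example,
// max_idx3(1.f, 3.f, 2.f, 2.f, 4.f, 8.f) returns 1, since 3/4 > 1/2 > 2/8.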
//------------------------------------------------------------------------
// Format of antialiasing work items stored in work buffer. Usually accessed directly as int4.
struct AAWorkItem
{
enum
{
EDGE_MASK = 3, // Edge index in lowest bits.
FLAG_DOWN_BIT = 2, // Down instead of right.
FLAG_TRI1_BIT = 3, // Edge is from other pixel's triangle.
};
int px, py; // Pixel x, y.
unsigned int pz_flags; // High 16 bits = pixel z, low 16 bits = edge index and flags.
float alpha; // Antialiasing alpha value. Zero if no AA.
};
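// Layout of the packed pz_flags field, as assembled by the kernels below:
//   bits 16..31  minibatch index pz
//   bits  0..1   edge index (EDGE_MASK)
//   bit   2      FLAG_DOWN_BIT: vertical (down) neighbor instead of right
//   bit   3      FLAG_TRI1_BIT: edge came from the neighbor pixel's triangle
// A minimal decoding sketch:
//   int pz   = (int)(pz_flags >> 16);
//   int edge = pz_flags & AAWorkItem::EDGE_MASK;
//   int down = (pz_flags >> AAWorkItem::FLAG_DOWN_BIT) & 1;
//   int tri1 = (pz_flags >> AAWorkItem::FLAG_TRI1_BIT) & 1;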
//------------------------------------------------------------------------
// Hash functions. Adapted from public-domain code at http://www.burtleburtle.net/bob/hash/doobs.html
#define JENKINS_MAGIC (0x9e3779b9u)
static __device__ __forceinline__ void jenkins_mix(unsigned int& a, unsigned int& b, unsigned int& c)
{
a -= b; a -= c; a ^= (c>>13);
b -= c; b -= a; b ^= (a<<8);
c -= a; c -= b; c ^= (b>>13);
a -= b; a -= c; a ^= (c>>12);
b -= c; b -= a; b ^= (a<<16);
c -= a; c -= b; c ^= (b>>5);
a -= b; a -= c; a ^= (c>>3);
b -= c; b -= a; b ^= (a<<10);
c -= a; c -= b; c ^= (b>>15);
}
// Helper class for hash index iteration. Implements simple odd-skip linear probing with a key-dependent skip.
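// The table size is a power of two and the skip is forced odd, so the probe
// sequence (idx + k*skip) mod size visits every slot before repeating.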
class HashIndex
{
public:
__device__ __forceinline__ HashIndex(const AntialiasKernelParams& p, uint64_t key)
{
m_mask = (p.allocTriangles << AA_LOG_HASH_ELEMENTS_PER_TRIANGLE(p.allocTriangles)) - 1; // This should work until triangle count exceeds 1073741824.
m_idx = (uint32_t)(key & 0xffffffffu);
m_skip = (uint32_t)(key >> 32);
uint32_t dummy = JENKINS_MAGIC;
jenkins_mix(m_idx, m_skip, dummy);
m_idx &= m_mask;
m_skip &= m_mask;
m_skip |= 1;
}
__device__ __forceinline__ int get(void) const { return m_idx; }
__device__ __forceinline__ void next(void) { m_idx = (m_idx + m_skip) & m_mask; }
private:
uint32_t m_idx, m_skip, m_mask;
};
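// Insert value v for the given key. Each entry stores up to two distinct
// values: the first insert fills the .z slot, a later insert with a different
// value fills .w. For an edge-vertex hash this captures both opposite
// vertices of an edge shared by two triangles.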
static __device__ __forceinline__ void hash_insert(const AntialiasKernelParams& p, uint64_t key, int v)
{
HashIndex idx(p, key);
while(1)
{
uint64_t prev = atomicCAS((unsigned long long*)&p.evHash[idx.get()], 0, (unsigned long long)key);
if (prev == 0 || prev == key)
break;
idx.next();
}
int* q = (int*)&p.evHash[idx.get()];
int a = atomicCAS(q+2, 0, v);
if (a != 0 && a != v)
atomicCAS(q+3, 0, v);
}
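// Look up the (up to two) values stored for key. Returns zeros if the key is
// absent, because probing stops at the first empty slot.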
static __device__ __forceinline__ int2 hash_find(const AntialiasKernelParams& p, uint64_t key)
{
HashIndex idx(p, key);
while(1)
{
uint4 entry = p.evHash[idx.get()];
uint64_t k = ((uint64_t)entry.x) | (((uint64_t)entry.y) << 32);
if (k == key || k == 0)
return make_int2((int)entry.z, (int)entry.w);
idx.next();
}
}
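// Edge-vertex hash wrappers. Vertex indices in keys and values are offset by
// one so that zero unambiguously means "empty slot" / "no opposite vertex".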
static __device__ __forceinline__ void evhash_insert_vertex(const AntialiasKernelParams& p, int va, int vb, int vn)
{
if (va == vb)
return;
uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
uint64_t v1 = (uint32_t)max(va, vb) + 1;
uint64_t vk = v0 | (v1 << 32); // hash key
hash_insert(p, vk, vn + 1);
}
static __forceinline__ __device__ int evhash_find_vertex(const AntialiasKernelParams& p, int va, int vb, int vr)
{
if (va == vb)
return -1;
uint64_t v0 = (uint32_t)min(va, vb) + 1; // canonical vertex order
uint64_t v1 = (uint32_t)max(va, vb) + 1;
uint64_t vk = v0 | (v1 << 32); // hash key
int2 vn = hash_find(p, vk) - 1;
if (vn.x == vr) return vn.y;
if (vn.y == vr) return vn.x;
return -1;
}
//------------------------------------------------------------------------
// Mesh analysis kernel.
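// Inserts each triangle's three edges into the edge-vertex hash, presumably
// one thread per triangle (the launch configuration lives outside this file).
// Degenerate or out-of-range triangles are skipped.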
__global__ void AntialiasFwdMeshKernel(const AntialiasKernelParams p)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= p.numTriangles)
return;
int v0 = p.tri[idx * 3 + 0];
int v1 = p.tri[idx * 3 + 1];
int v2 = p.tri[idx * 3 + 2];
if (v0 < 0 || v0 >= p.numVertices ||
v1 < 0 || v1 >= p.numVertices ||
v2 < 0 || v2 >= p.numVertices)
return;
if (v0 == v1 || v1 == v2 || v2 == v0)
return;
evhash_insert_vertex(p, v1, v2, v0);
evhash_insert_vertex(p, v2, v0, v1);
evhash_insert_vertex(p, v0, v1, v2);
}
//------------------------------------------------------------------------
// Discontinuity finder kernel.
__global__ void AntialiasFwdDiscontinuityKernel(const AntialiasKernelParams p)
{
// Calculate pixel position.
int px = blockIdx.x * AA_DISCONTINUITY_KERNEL_BLOCK_WIDTH + threadIdx.x;
int py = blockIdx.y * AA_DISCONTINUITY_KERNEL_BLOCK_HEIGHT + threadIdx.y;
int pz = blockIdx.z;
if (px >= p.width || py >= p.height || pz >= p.n)
return;
// Pointer to our TriIdx and fetch.
int pidx0 = ((px + p.width * (py + p.height * pz)) << 2) + 3;
float tri0 = p.rasterOut[pidx0]; // These can stay as float, as we only compare them against each other.
// Look right, clamp at edge.
int pidx1 = pidx0;
if (px < p.width - 1)
pidx1 += 4;
float tri1 = p.rasterOut[pidx1];
// Look down, clamp at edge.
int pidx2 = pidx0;
if (py < p.height - 1)
pidx2 += p.width << 2;
float tri2 = p.rasterOut[pidx2];
// Determine amount of work.
int count = 0;
if (tri1 != tri0) count = 1;
if (tri2 != tri0) count += 1;
if (!count)
return; // Exit warp.
// Coalesce work counter update to once per CTA.
__shared__ int s_temp;
s_temp = 0;
__syncthreads();
int idx = atomicAdd(&s_temp, count);
__syncthreads();
if (idx == 0)
{
int base = atomicAdd(&p.workBuffer[0].x, s_temp);
s_temp = base + 1; // don't clobber the counters in first slot.
}
__syncthreads();
idx += s_temp;
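// idx is now this thread's 1-based slot in the work buffer; slot 0 is
// reserved for the counters (x = item count, y = consumption cursor used by
// the persistent-thread kernels below).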
// Write to memory.
if (tri1 != tri0) p.workBuffer[idx++] = make_int4(px, py, (pz << 16), 0);
if (tri2 != tri0) p.workBuffer[idx] = make_int4(px, py, (pz << 16) + (1 << AAWorkItem::FLAG_DOWN_BIT), 0);
}
//------------------------------------------------------------------------
// Forward analysis kernel.
__global__ void AntialiasFwdAnalysisKernel(const AntialiasKernelParams p)
{
__shared__ int s_base;
int workCount = p.workBuffer[0].x;
for(;;)
{
// Persistent threads work fetcher.
__syncthreads();
if (threadIdx.x == 0)
s_base = atomicAdd(&p.workBuffer[0].y, AA_ANALYSIS_KERNEL_THREADS_PER_BLOCK);
__syncthreads();
int thread_idx = s_base + threadIdx.x;
if (thread_idx >= workCount)
return;
int4* pItem = p.workBuffer + thread_idx + 1;
int4 item = *pItem;
int px = item.x;
int py = item.y;
int pz = (int)(((unsigned int)item.z) >> 16);
int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
int pixel0 = px + p.width * (py + p.height * pz);
int pixel1 = pixel0 + (d ? p.width : 1);
float2 zt0 = ((float2*)p.rasterOut)[(pixel0 << 1) + 1];
float2 zt1 = ((float2*)p.rasterOut)[(pixel1 << 1) + 1];
int tri0 = float_to_triidx(zt0.y) - 1;
int tri1 = float_to_triidx(zt1.y) - 1;
// Select triangle based on background / depth.
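// zt holds (depth, triangle id) for each pixel; a negative index after the
// decrement means no triangle (background). If both pixels are covered, the
// triangle with the smaller stored depth (closer to the camera) wins.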
int tri = (tri0 >= 0) ? tri0 : tri1;
if (tri0 >= 0 && tri1 >= 0)
tri = (zt0.x < zt1.x) ? tri0 : tri1;
if (tri == tri1)
{
// Calculate with respect to neighbor pixel if we chose that triangle.
px += 1 - d;
py += d;
}
// Bail out if triangle index is corrupt.
if (tri < 0 || tri >= p.numTriangles)
continue;
// Fetch vertex indices.
int vi0 = p.tri[tri * 3 + 0];
int vi1 = p.tri[tri * 3 + 1];
int vi2 = p.tri[tri * 3 + 2];
// Bail out if vertex indices are corrupt.
if (vi0 < 0 || vi0 >= p.numVertices ||
vi1 < 0 || vi1 >= p.numVertices ||
vi2 < 0 || vi2 >= p.numVertices)
continue;
// Fetch opposite vertex indices. Use vertex itself (always silhouette) if no opposite vertex exists.
int op0 = evhash_find_vertex(p, vi2, vi1, vi0);
int op1 = evhash_find_vertex(p, vi0, vi2, vi1);
int op2 = evhash_find_vertex(p, vi1, vi0, vi2);
// Instance mode: Adjust vertex indices based on minibatch index.
if (p.instance_mode)
{
int vbase = pz * p.numVertices;
vi0 += vbase;
vi1 += vbase;
vi2 += vbase;
if (op0 >= 0) op0 += vbase;
if (op1 >= 0) op1 += vbase;
if (op2 >= 0) op2 += vbase;
}
// Fetch vertex positions.
float4 p0 = ((float4*)p.pos)[vi0];
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
float4 o0 = (op0 < 0) ? p0 : ((float4*)p.pos)[op0];
float4 o1 = (op1 < 0) ? p1 : ((float4*)p.pos)[op1];
float4 o2 = (op2 < 0) ? p2 : ((float4*)p.pos)[op2];
// Project vertices to pixel space.
float w0 = 1.f / p0.w;
float w1 = 1.f / p1.w;
float w2 = 1.f / p2.w;
float ow0 = 1.f / o0.w;
float ow1 = 1.f / o1.w;
float ow2 = 1.f / o2.w;
float fx = (float)px + .5f - p.xh;
float fy = (float)py + .5f - p.yh;
float x0 = p0.x * w0 * p.xh - fx;
float y0 = p0.y * w0 * p.yh - fy;
float x1 = p1.x * w1 * p.xh - fx;
float y1 = p1.y * w1 * p.yh - fy;
float x2 = p2.x * w2 * p.xh - fx;
float y2 = p2.y * w2 * p.yh - fy;
float ox0 = o0.x * ow0 * p.xh - fx;
float oy0 = o0.y * ow0 * p.yh - fy;
float ox1 = o1.x * ow1 * p.xh - fx;
float oy1 = o1.y * ow1 * p.yh - fy;
float ox2 = o2.x * ow2 * p.xh - fx;
float oy2 = o2.y * ow2 * p.yh - fy;
// Signs to kill non-silhouette edges.
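// Each wing area a_i rebuilds the triangle with one vertex replaced by the
// opposite vertex across edge i. A sign matching the triangle's own area bb
// means the neighboring triangle folds onto the same side (or does not exist,
// since o_i falls back to the vertex itself), marking edge i as a silhouette.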
float bb = (x1-x0)*(y2-y0) - (x2-x0)*(y1-y0); // Triangle itself.
float a0 = (x1-ox0)*(y2-oy0) - (x2-ox0)*(y1-oy0); // Wings.
float a1 = (x2-ox1)*(y0-oy1) - (x0-ox1)*(y2-oy1);
float a2 = (x0-ox2)*(y1-oy2) - (x1-ox2)*(y0-oy2);
// If no matching signs anywhere, skip the rest.
if (same_sign(a0, bb) || same_sign(a1, bb) || same_sign(a2, bb))
{
// XY flip for horizontal edges.
if (d)
{
swap(x0, y0);
swap(x1, y1);
swap(x2, y2);
}
float dx0 = x2 - x1;
float dx1 = x0 - x2;
float dx2 = x1 - x0;
float dy0 = y2 - y1;
float dy1 = y0 - y2;
float dy2 = y1 - y0;
// Check if an edge crosses between us and the neighbor pixel.
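// d_i / dy_i is the parametric point where edge i crosses the segment
// between the two pixel centers; that segment lies on the local x axis
// (after the XY flip above for vertical pairs). Edges whose endpoints lie
// on the same side of the axis cannot cross it and are disqualified below
// with -F32_MAX.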
float dc = -F32_MAX;
float ds = (tri == tri0) ? 1.f : -1.f;
float d0 = ds * (x1*dy0 - y1*dx0);
float d1 = ds * (x2*dy1 - y2*dx1);
float d2 = ds * (x0*dy2 - y0*dx2);
if (same_sign(y1, y2)) d0 = -F32_MAX, dy0 = 1.f;
if (same_sign(y2, y0)) d1 = -F32_MAX, dy1 = 1.f;
if (same_sign(y0, y1)) d2 = -F32_MAX, dy2 = 1.f;
int di = max_idx3(d0, d1, d2, dy0, dy1, dy2);
if (di == 0 && same_sign(a0, bb) && fabsf(dy0) >= fabsf(dx0)) dc = d0 / dy0;
if (di == 1 && same_sign(a1, bb) && fabsf(dy1) >= fabsf(dx1)) dc = d1 / dy1;
if (di == 2 && same_sign(a2, bb) && fabsf(dy2) >= fabsf(dx2)) dc = d2 / dy2;
float eps = .0625f; // Expect no more than 1/16 pixel inaccuracy.
// Adjust output image if a suitable edge was found.
if (dc > -eps && dc < 1.f + eps)
{
dc = fminf(fmaxf(dc, 0.f), 1.f);
float alpha = ds * (.5f - dc);
const float* pColor0 = p.color + pixel0 * p.channels;
const float* pColor1 = p.color + pixel1 * p.channels;
float* pOutput = p.output + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
for (int i=0; i < p.channels; i++)
atomicAdd(&pOutput[i], alpha * (pColor1[i] - pColor0[i]));
// Rewrite the work item's flags and alpha. Keep original px, py.
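// The gradient kernel reads these fields back; items it should skip keep
// alpha == 0 (stored in .w) from the discontinuity kernel.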
unsigned int flags = pz << 16;
flags |= di;
flags |= d << AAWorkItem::FLAG_DOWN_BIT;
flags |= (__float_as_uint(ds) >> 31) << AAWorkItem::FLAG_TRI1_BIT;
((int2*)pItem)[1] = make_int2(flags, __float_as_int(alpha));
}
}
}
}
//------------------------------------------------------------------------
// Gradient kernel.
__global__ void AntialiasGradKernel(const AntialiasKernelParams p)
{
// Temporary space for coalesced atomics.
CA_DECLARE_TEMP(AA_GRAD_KERNEL_THREADS_PER_BLOCK);
__shared__ int s_base; // Work counter communication across entire CTA.
int workCount = p.workBuffer[0].x;
for(;;)
{
// Persistent threads work fetcher.
__syncthreads();
if (threadIdx.x == 0)
s_base = atomicAdd(&p.workBuffer[0].y, AA_GRAD_KERNEL_THREADS_PER_BLOCK);
__syncthreads();
int thread_idx = s_base + threadIdx.x;
if (thread_idx >= workCount)
return;
// Read work item filled out by forward kernel.
int4 item = p.workBuffer[thread_idx + 1];
unsigned int amask = __ballot_sync(0xffffffffu, item.w);
if (item.w == 0)
continue; // No effect.
// Unpack work item and replicate setup from forward analysis kernel.
int px = item.x;
int py = item.y;
int pz = (int)(((unsigned int)item.z) >> 16);
int d = (item.z >> AAWorkItem::FLAG_DOWN_BIT) & 1;
float alpha = __int_as_float(item.w);
int tri1 = (item.z >> AAWorkItem::FLAG_TRI1_BIT) & 1;
int di = item.z & AAWorkItem::EDGE_MASK;
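// Rebuild the forward pass's orientation sign ds = +/-1.f by OR'ing the
// tri1 flag into the float sign bit.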
float ds = __int_as_float(__float_as_int(1.f) | (tri1 << 31));
int pixel0 = px + p.width * (py + p.height * pz);
int pixel1 = pixel0 + (d ? p.width : 1);
int tri = float_to_triidx(p.rasterOut[((tri1 ? pixel1 : pixel0) << 2) + 3]) - 1;
if (tri1)
{
px += 1 - d;
py += d;
}
// Bail out if triangle index is corrupt.
bool triFail = (tri < 0 || tri >= p.numTriangles);
amask = __ballot_sync(amask, !triFail);
if (triFail)
continue;
// Outgoing color gradients.
float* pGrad0 = p.gradColor + pixel0 * p.channels;
float* pGrad1 = p.gradColor + pixel1 * p.channels;
// Incoming color gradients.
const float* pDy = p.dy + (alpha > 0.f ? pixel0 : pixel1) * p.channels;
// Position gradient weight based on colors and incoming gradients.
float dd = 0.f;
const float* pColor0 = p.color + pixel0 * p.channels;
const float* pColor1 = p.color + pixel1 * p.channels;
// Loop over channels and accumulate.
for (int i=0; i < p.channels; i++)
{
float dy = pDy[i];
if (dy != 0.f)
{
// Update position gradient weight.
dd += dy * (pColor1[i] - pColor0[i]);
// Update color gradients. No coalescing because all have different targets.
float v = alpha * dy;
atomicAdd(&pGrad0[i], -v);
atomicAdd(&pGrad1[i], v);
}
}
// If position weight is zero, skip the rest.
bool noGrad = (dd == 0.f);
amask = __ballot_sync(amask, !noGrad);
if (noGrad)
continue;
// Fetch vertex indices of the active edge and their positions.
int i1 = (di < 2) ? (di + 1) : 0;
int i2 = (i1 < 2) ? (i1 + 1) : 0;
int vi1 = p.tri[3 * tri + i1];
int vi2 = p.tri[3 * tri + i2];
// Bail out if vertex indices are corrupt.
bool vtxFail = (vi1 < 0 || vi1 >= p.numVertices || vi2 < 0 || vi2 >= p.numVertices);
amask = __ballot_sync(amask, !vtxFail);
if (vtxFail)
continue;
// Instance mode: Adjust vertex indices based on minibatch index.
if (p.instance_mode)
{
vi1 += pz * p.numVertices;
vi2 += pz * p.numVertices;
}
// Fetch vertex positions.
float4 p1 = ((float4*)p.pos)[vi1];
float4 p2 = ((float4*)p.pos)[vi2];
// Project vertices to pixel space.
float pxh = p.xh;
float pyh = p.yh;
float fx = (float)px + .5f - pxh;
float fy = (float)py + .5f - pyh;
// XY flip for horizontal edges.
if (d)
{
swap(p1.x, p1.y);
swap(p2.x, p2.y);
swap(pxh, pyh);
swap(fx, fy);
}
// Gradient calculation setup.
float w1 = 1.f / p1.w;
float w2 = 1.f / p2.w;
float x1 = p1.x * w1 * pxh - fx;
float y1 = p1.y * w1 * pyh - fy;
float x2 = p2.x * w2 * pxh - fx;
float y2 = p2.y * w2 * pyh - fy;
float dx = x2 - x1;
float dy = y2 - y1;
float db = x1*dy - y1*dx;
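// dd above accumulated dL/d(alpha). In the forward pass alpha = ds * (.5f - dc)
// with dc = ds * db / dy, so alpha depends on the edge only through -db / dy;
// the endpoint gradients below follow from the quotient rule on db / dy,
// chained through the perspective divide (hence the w1, w2 factors).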
// Compute inverse delta-y with epsilon.
float ep = copysignf(1e-3f, dy); // ~1/1000 pixel.
float iy = 1.f / (dy + ep);
// Compute position gradients.
float dby = db * iy;
float iw1 = -w1 * iy * dd;
float iw2 = w2 * iy * dd;
float gp1x = iw1 * pxh * y2;
float gp2x = iw2 * pxh * y1;
float gp1y = iw1 * pyh * (dby - x2);
float gp2y = iw2 * pyh * (dby - x1);
float gp1w = -(p1.x * gp1x + p1.y * gp1y) * w1;
float gp2w = -(p2.x * gp2x + p2.y * gp2y) * w2;
// XY flip the gradients.
if (d)
{
swap(gp1x, gp1y);
swap(gp2x, gp2y);
}
// Kill position gradients if alpha was saturated.
if (fabsf(alpha) >= 0.5f)
{
gp1x = gp1y = gp1w = 0.f;
gp2x = gp2y = gp2w = 0.f;
}
// Initialize coalesced atomics. Match both triangle ID and edge index.
// Also note that some threads may be inactive.
CA_SET_GROUP_MASK(tri ^ (di << 30), amask);
// Accumulate gradients.
caAtomicAdd3_xyw(p.gradPos + 4 * vi1, gp1x, gp1y, gp1w);
caAtomicAdd3_xyw(p.gradPos + 4 * vi2, gp2x, gp2y, gp2w);
}
}
//------------------------------------------------------------------------