| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| #version 460 |
| #extension GL_EXT_shader_8bit_storage : require |
| #extension GL_EXT_shader_16bit_storage : require |
| #extension GL_EXT_shader_explicit_arithmetic_types : require |
| #extension GL_EXT_shader_explicit_arithmetic_types_int8 : require |
| #extension GL_EXT_shader_explicit_arithmetic_types_float16 : require |
| #extension GL_EXT_shader_explicit_arithmetic_types_float32 : require |
| #extension GL_GOOGLE_include_directive : enable |
| #extension GL_ARM_tensors : require |
|
|
| |
| #include "typedefs.h" |
| #include "common.h" |
|
|
| |
|
|
| struct TensorElement /* twelve quantised int8 channels for one pixel, grouped as three int8_t4 vectors; filled in by WriteToTensor */ |
| { |
| int8_t4 wh_rgb_col_r; /* history.rgb followed by input_colour.r */ |
| int8_t4 col_gb_dm_fback_r; /* input_colour.gb, disocclusion_mask, temporal_feedback.r */ |
| int8_t4 fback_gba_ld; /* temporal_feedback.gba followed by luma_derivative */ |
| }; |
|
|
| |
| layout (set=0, binding=0) uniform mediump sampler2D _ColourTex; /* read by LoadColour via texelFetch */ |
| layout (set=0, binding=1) uniform highp sampler2D _DepthTex; /* read by FindNearestDepth via texelFetchOffset */ |
| layout (set=0, binding=2) uniform mediump sampler2D _MotionVectorTex; /* read by LoadMotion; .rg is scaled by the render size in main, so presumably UV-space motion - confirm */ |
| layout (set=0, binding=3) uniform mediump sampler2D _HistoryTex; /* sampled by WarpHistory at the reprojected UV */ |
| layout (set=0, binding=4) uniform lowp sampler2D _FeedbackTensor; /* sampled by WarpFeedback and dequantised with _FeedbackQuantParams */ |
| layout (set=0, binding=5) uniform highp sampler2D _DepthTm1Tex; /* gathered by GatherReconstructedPreviousDepthRQuad; "Tm1" presumably means previous frame - confirm */ |
| layout (set=0, binding=6) uniform lowp sampler2D _LumaDerivTm1Tex; /* sampled by WarpLumaDerivative; .x is read as the derivative and .y as the luma */ |
| layout (set=0, binding=7) uniform lowp sampler2D _NearestDepthCoordTm1Tex; /* .r holds an encoded offset, decoded in LoadDepthNearestDepthOffsetTm1 */ |
|
|
| |
| layout (set=1, binding=0) uniform writeonly tensorARM<int8_t, 4> _PreprocessTensor; /* rank-4 int8 tensor written by WriteToTensor */ |
| layout (set=1, binding=1, rg8) uniform writeonly lowp image2D _PreProcessLumaDerivOut; /* written by WriteLumaDerivative: (derivative, luma) */ |
| layout (set=1, binding=3, r8) uniform writeonly lowp image2D _NearestDepthCoordOut; /* written by WriteNearestDepthOffset. NOTE(review): binding 2 is unused in this set - confirm the gap matches the host-side layout */ |
|
|
| |
| layout(push_constant, std430) uniform PushConstants { |
| /* Every member has an explicit byte offset; the four float4 members come first, 16 bytes apart. */ |
| layout(offset = 0) float4 _DeviceToViewDepth; |
| layout(offset = 16) float4 _JitterOffset; |
| layout(offset = 32) float4 _JitterOffsetTm1; |
| layout(offset = 48) float4 _ScaleFactor; |
|
|
| /* Output/input dimensions, their reciprocals, and two packed half4 parameter vectors. */ |
| layout(offset = 64) int32_t2 _OutputDims; |
| layout(offset = 72) int32_t2 _InputDims; |
| layout(offset = 80) float2 _InvOutputDims; |
| layout(offset = 88) float2 _InvInputDims; |
| layout(offset = 96) half4 _QuantParams; |
| layout(offset = 104) half4 _MotionDisThreshPad; |
|
|
| /* Two half2 members; individual components are exposed through the accessor macros defined after this block. */ |
| layout(offset = 112) half2 _Exposure; |
| layout(offset = 116) half2 _HistoryPad; |
|
|
| /* Not referenced anywhere in the visible part of this file. */ |
| layout(offset = 120) int32_t2 _Padding; |
|
|
| |
| }; |
|
|
| |
| #define _Scale _ScaleFactor.xy /* accessor macros: each name selects components of a packed push-constant member */ |
| #define _InvScale _ScaleFactor.zw |
| #define _Exposure _Exposure.x /* self-referential: from here on every later use of _Exposure in this file expands to _Exposure.x */ |
| #define _InvExposure _Exposure.y /* NOTE(review): _Exposure is a macro at this point, so this presumably expands to _Exposure.x.y; unused in this file - confirm before relying on it */ |
| #define _JitterOffsetPix _JitterOffset.xy |
| #define _JitterOffsetUv _JitterOffset.zw |
| #define _JitterOffsetTm1Pix _JitterOffsetTm1.xy |
| #define _JitterOffsetTm1Uv _JitterOffsetTm1.zw |
| #define _MotionWarpThresh _MotionDisThreshPad.x |
| #define _MotionDisThresh _MotionDisThreshPad.y |
| #define _DisocclusionScale _MotionDisThreshPad.z |
| #define _NotHistoryReset _HistoryPad.x |
|
|
| |
| |
| |
|
|
| #ifndef _InputQuantParams |
| /* Default used by WriteToTensor when quantising: the first two components of _QuantParams. Define the macro earlier to override. */ |
| #define _InputQuantParams _QuantParams.xy |
| #endif |
| #ifndef _FeedbackQuantParams |
| /* Default used by WarpFeedback when dequantising: the last two components of _QuantParams. Define the macro earlier to override. */ |
| #define _FeedbackQuantParams _QuantParams.zw |
| #endif |
|
|
| |
|
|
| #ifdef INVERTED_DEPTH /* MAX_DEPTH is 0 when INVERTED_DEPTH is defined and 1 otherwise; it is not referenced in the visible part of this file */ |
| #define MAX_DEPTH 0.f |
| #else |
| #define MAX_DEPTH 1.f |
| #endif |
|
|
|
|
| |
|
|
| /* Returns true when every component of pos lies in [0, size), assuming size is non-negative. */ |
| /* Both operands are converted to unsigned first, so a negative coordinate becomes a very large value and fails the same "less than" test. */ |
| bool IsOnScreen(int32_t2 pos, int32_t2 size) |
| { |
|     const uint32_t2 unsigned_pos = uint32_t2(pos); |
|     const uint32_t2 unsigned_size = uint32_t2(size); |
|     return all(lessThan(unsigned_pos, unsigned_size)); |
| } |
|
|
|
|
| /* Fetches texel `pixel` of _MotionVectorTex at mip 0 and returns its .rg channels as half2. */ |
| half2 LoadMotion(int32_t2 pixel) |
| { |
|     const half2 motion_rg = half2(texelFetch(_MotionVectorTex, pixel, 0).rg); |
|     return motion_rg; |
| } |
|
|
|
|
| /* Fetches texel `pixel` of _ColourTex at mip 0, multiplies its rgb by the exposure, then applies SafeColour and Tonemap in that order. */ |
| half3 LoadColour(int32_t2 pixel) |
| { |
|     const half3 raw_rgb = half3(texelFetch(_ColourTex, pixel, 0).rgb); |
|     const half3 exposed_rgb = SafeColour(raw_rgb * _Exposure); |
|     return Tonemap(exposed_rgb); |
| } |
|
|
|
|
| /* Reads the encoded nearest-depth offset stored in _NearestDepthCoordTm1Tex for `pixel` and decodes it with DecodeNearestDepthCoord. */ |
| /* Returns (0, 0) when `pixel` lies outside [0, _InputDims); the fetch itself always uses a clamped coordinate. */ |
| int32_t2 LoadDepthNearestDepthOffsetTm1(int32_t2 pixel) |
| { |
|     /* 1 on both axes when the requested texel is on screen, 0 otherwise; evaluated before clamping. */ |
|     const int32_t2 on_screen = int32_t2(IsOnScreen(pixel, _InputDims)); |
|     const int32_t2 clamped_pixel = clamp(pixel, int32_t2(0), _InputDims - int32_t2(1)); |
| |
|     /* The sampled value is scaled by 255 and rounded to recover the stored integer code. */ |
|     const half encNorm = half(texelFetch(_NearestDepthCoordTm1Tex, clamped_pixel, 0).r); |
|     const int32_t code = int32_t(encNorm * 255.0 + 0.5); |
| |
|     return DecodeNearestDepthCoord(code) * on_screen; |
| } |
|
|
| /* Gathers four texels of _DepthTm1Tex (component 0) around fUV after shifting fUV by the stored nearest-depth offset of the texel containing it. */ |
| /* textureGather returns (i0j1, i1j1, i1j0, i0j0); the .wzxy swizzle reorders that to (0,0), (1,0), (0,1), (1,1). */ |
| void GatherReconstructedPreviousDepthRQuad(float2 fUV, inout float4 depthQuad) |
| { |
|     const int32_t2 texel = int32_t2(fUV * _InputDims); |
|     const int32_t2 texel_offset = LoadDepthNearestDepthOffsetTm1(texel); |
|     const float2 shifted_uv = fUV + float2(texel_offset) * _InvInputDims; |
|     depthQuad = textureGather(_DepthTm1Tex, shifted_uv, 0).wzxy; |
| } |
|
|
|
|
| /* Samples _HistoryTex at `uv` (LOD 0), multiplies rgb by the exposure, then applies SafeColour and Tonemap in that order. */ |
| half3 WarpHistory(float2 uv) |
| { |
|     const half3 history_rgb = half3(textureLod(_HistoryTex, uv, 0).rgb); |
|     const half3 exposed_rgb = SafeColour(history_rgb * _Exposure); |
|     return Tonemap(exposed_rgb); |
| } |
|
|
|
|
| /* Samples all four channels of _FeedbackTensor at `uv` (LOD 0) and dequantises them with _FeedbackQuantParams. */ |
| half4 WarpFeedback(float2 uv) |
| { |
|     const half4 quantised = half4(textureLod(_FeedbackTensor, uv, 0)); |
|     return Dequantize(quantised, _FeedbackQuantParams); |
| } |
|
|
|
|
| /* Samples the .rg channels of _LumaDerivTm1Tex at `uv` (LOD 0) and returns them as half2. */ |
| half2 WarpLumaDerivative(float2 uv) |
| { |
|     const half2 stored = half2(textureLod(_LumaDerivTm1Tex, uv, 0).rg); |
|     return stored; |
| } |
|
|
|
|
| /* Blends a shaped luma-change term into the value stored at reproj_uv in _LumaDerivTm1Tex. */ |
| /* Returns half2(blended derivative, current luma); the derivative is forced to 0 when disocclusion_mask exceeds DIS_THRESH. */ |
| half2 CalculateLumaDerivative(float2 reproj_uv, half3 jittered_colour, half disocclusion_mask) |
| { |
|     const half DIS_THRESH = 0.01HF; |
|     const half DERIV_MIN = 0.05HF; |
|     const half DERIV_MAX = 0.3HF; |
|     const half DERIV_POW = 1.5HF; |
|     const half DERIV_ALPHA = 0.1HF; |
|     const half DERIV_MAX_R = rcp(DERIV_MAX); |
|     const half DERIV_MAX_POW_R = rcp(pow(DERIV_MAX, DERIV_POW)); |
| |
|     /* Stored pair from the previous pass: .x is the derivative, .y the luma. */ |
|     const half2 stored = WarpLumaDerivative(reproj_uv); |
|     const half prev_derivative = stored.x; |
|     const half prev_luma = stored.y; |
| |
|     /* Absolute luma change between the stored value and the current colour. */ |
|     const half luma_t = Luminance(jittered_colour); |
|     const half luma_delta = abs(luma_t - prev_luma); |
| |
|     /* Cap at DERIV_MAX and zero out changes smaller than DERIV_MIN. */ |
|     half shaped = min(luma_delta, DERIV_MAX); |
|     shaped = shaped * step(DERIV_MIN, luma_delta); |
| |
|     /* x * sqrt(x) is x^1.5 (DERIV_POW), normalised by DERIV_MAX^1.5. */ |
|     shaped = shaped * sqrt(shaped) * DERIV_MAX_POW_R; |
| |
|     /* Blend factor falls from DERIV_ALPHA to a tenth of it as the stored derivative approaches DERIV_MAX. */ |
|     const half stored_ratio = clamp(prev_derivative, 0.HF, DERIV_MAX) * DERIV_MAX_R; |
|     const half blend = mix(DERIV_ALPHA, DERIV_ALPHA * 0.1HF, stored_ratio); |
| |
|     /* step(mask, DIS_THRESH) is 1 only while mask <= DIS_THRESH. */ |
|     const half keep = step(disocclusion_mask, DIS_THRESH); |
|     const half blended = mix(prev_derivative, shaped, blend) * keep; |
| |
|     return half2(blended, luma_t); |
| } |
|
|
|
|
| void FindNearestDepth(int32_t2 iPxPos, int32_t2 iPxSize, out float fNearestDepth, out int32_t2 fNearestDepthOffset) |
| { |
| /* Scans the 3x3 neighbourhood of iPxPos in _DepthTex and outputs the nearest depth plus the offset (relative to iPxPos) where it was found. */ |
| /* "Nearest" means the largest value when INVERTED_DEPTH is defined and the smallest value otherwise. */ |
| /* Neighbours outside [0, iPxSize) are ignored; the centre texel is always the initial candidate. */ |
| /* Each component of fNearestDepthOffset is -1, 0 or 1. */ |
| |
|
|
| int32_t iSampleIndex = 0; |
| const int32_t iSampleCount = 9; |
| /* Offset table. Entries must stay in the same order as the fetches into depth[] below, because both are indexed by iSampleIndex. */ |
| const int32_t2 iSampleOffsets[iSampleCount] = { |
| int32_t2(+0, +0).yx, |
| int32_t2(+1, +0).yx, |
| int32_t2(+0, +1).yx, |
| int32_t2(+0, -1).yx, |
| int32_t2(-1, +0).yx, |
| int32_t2(-1, +1).yx, |
| int32_t2(+1, +1).yx, |
| int32_t2(-1, -1).yx, |
| int32_t2(+1, -1).yx, |
| }; |
|
|
| /* All nine fetches are issued before any comparison; off-screen results are never read because of the IsOnScreen test in the loop. */ |
| float depth[9]; |
| depth[0] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+0, +0).yx).r); |
| depth[1] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+1, +0).yx).r); |
| depth[2] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+0, +1).yx).r); |
| depth[3] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+0, -1).yx).r); |
| depth[4] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(-1, +0).yx).r); |
| depth[5] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(-1, +1).yx).r); |
| depth[6] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+1, +1).yx).r); |
| depth[7] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(-1, -1).yx).r); |
| depth[8] = float(texelFetchOffset(_DepthTex, iPxPos, 0, int32_t2(+1, -1).yx).r); |
|
|
| /* Start from the centre sample; a neighbour replaces it only when strictly nearer, so ties keep the earlier table entry. */ |
| fNearestDepth = depth[0]; |
| fNearestDepthOffset = iSampleOffsets[0]; |
| #pragma unroll |
| for (iSampleIndex = 1; iSampleIndex < iSampleCount; ++iSampleIndex) { |
|
|
| int32_t2 iPos = iPxPos + iSampleOffsets[iSampleIndex]; |
| if (IsOnScreen(iPos, iPxSize)) { |
|
|
| float fNdDepth = depth[iSampleIndex]; |
| #ifdef INVERTED_DEPTH |
| if (fNdDepth > fNearestDepth) { |
| #else |
| if (fNdDepth < fNearestDepth) { |
| #endif |
| fNearestDepth = fNdDepth; |
| fNearestDepthOffset = iSampleOffsets[iSampleIndex]; |
| } |
| } |
| } |
| } |
|
|
|
|
| /* Returns the input (render) resolution taken from the _InputDims push constant. */ |
| int32_t2 RenderSize() |
| { |
|     const int32_t2 render_dims = int32_t2(_InputDims); |
|     return render_dims; |
| } |
|
|
|
|
| /* Maps a pixel position to the [-1, 1] range: the first returned component rises from -1 to +1, the second falls from +1 to -1. */ |
| /* Both the position and the size are swizzled with .yx first, so the returned x is derived from the input y and vice versa. */ |
| float2 ComputeNdc(float2 fPxPos, int32_t2 iSize) |
| { |
|     const float2 normalised = fPxPos.yx / float2(iSize.yx); |
|     const float2 ndc_scale = float2(2.0f, -2.0f); |
|     const float2 ndc_bias = float2(-1.0f, 1.0f); |
|     return normalised * ndc_scale + ndc_bias; |
| } |
|
|
|
|
| /* Converts a device depth value to view-space depth as _DeviceToViewDepth[1] / (fDeviceDepth - _DeviceToViewDepth[0]). */ |
| /* There is no guard for fDeviceDepth equal to _DeviceToViewDepth[0]. */ |
| float GetViewSpaceDepth(float fDeviceDepth) |
| { |
|     const float depth_offset = _DeviceToViewDepth[0]; |
|     const float depth_scale = _DeviceToViewDepth[1]; |
|     return (depth_scale / (fDeviceDepth - depth_offset)); |
| } |
|
|
|
|
| /* Reconstructs a view-space position from a viewport pixel and its device depth. */ |
| /* Z comes from GetViewSpaceDepth; X and Y are the ComputeNdc result scaled by _DeviceToViewDepth[2] / [3] and by Z. */ |
| float3 GetViewSpacePosition(int32_t2 iViewportPos, int32_t2 iViewportSize, float fDeviceDepth) |
| { |
|     const float view_z = GetViewSpaceDepth(fDeviceDepth); |
|     const float2 ndc = ComputeNdc(iViewportPos, iViewportSize); |
| |
|     const float view_x = _DeviceToViewDepth[2] * ndc.x * view_z; |
|     const float view_y = _DeviceToViewDepth[3] * ndc.y * view_z; |
| |
|     return float3(view_x, view_y, view_z); |
| } |
|
|
|
|
| struct BilinearSamplingData /* 2x2 bilinear footprint produced by GetBilinearSamplingData and consumed by ComputeDepthClip */ |
| { |
| int32_t2 iOffsets[4]; /* texel offsets added to iBasePos, one per sample */ |
| float fWeights[4]; /* bilinear weight of each sample, built from the fractional sample position */ |
| int32_t2 iBasePos; /* floor of (uv * size - 0.5) */ |
| float2 fQuadCenterUv; /* UV handed to the previous-depth gather in ComputeDepthClip */ |
| }; |
|
|
|
|
| /* Builds the 2x2 bilinear footprint for `fUv` in a texture of `iSize` texels. */ |
| /* */ |
| /* All per-sample arrays use the same index order: 0 = (0,0), 1 = (1,0), 2 = (0,1), 3 = (1,1). */ |
| /* That is the order of a textureGather result swizzled with .wzxy, which ComputeDepthClip indexes with the same sample index. */ |
| /* iOffsets[1] and iOffsets[2] were previously swapped relative to fWeights, so the on-screen test for the x+1 sample */ |
| /* checked the y+1 texel (and vice versa) at the right and bottom edges. */ |
| /* */ |
| /* Returns: iBasePos (top-left texel), iOffsets, fWeights (sum to 1) and fQuadCenterUv. */ |
| BilinearSamplingData GetBilinearSamplingData(float2 fUv, int32_t2 iSize) |
| { |
|     BilinearSamplingData data; |
| |
|     /* Shift by half a texel so that floor() yields the top-left texel of the footprint. */ |
|     float2 fPxSample = (fUv * iSize) - float2(0.5f, 0.5f); |
|     data.iBasePos = int32_t2(floor(fPxSample)); |
|     data.fQuadCenterUv = (fPxSample + 0.5f) / float2(iSize); |
|     float2 fPxFrac = fract(fPxSample); |
| |
|     /* Offsets in the same order as the weights below. */ |
|     data.iOffsets[0] = int32_t2(0, 0); |
|     data.iOffsets[1] = int32_t2(1, 0); |
|     data.iOffsets[2] = int32_t2(0, 1); |
|     data.iOffsets[3] = int32_t2(1, 1); |
| |
|     data.fWeights[0] = (1.f - fPxFrac.x) * (1.f - fPxFrac.y); |
|     data.fWeights[1] = (fPxFrac.x) * (1.f - fPxFrac.y); |
|     data.fWeights[2] = (1.f - fPxFrac.x) * (fPxFrac.y); |
|     data.fWeights[3] = (fPxFrac.x) * (fPxFrac.y); |
| |
|     return data; |
| } |
|
|
|
|
| float ComputeDepthClip(float2 fUvSample, float fCurrentDepthSample) |
| { |
| /* Returns a value in [0, 1] derived from the view-space depth gap between fCurrentDepthSample and up to four previous-depth samples around fUvSample. */ |
| /* On-screen samples with weight > 0.1 and a positive depth gap add pow(saturate(required separation / gap), power) * weight to fDepth and their weight to fWeightSum. */ |
| /* Off-screen samples add their weight to fWeightSum only, which pushes the result towards 1. */ |
| /* The result is saturate(1 - fDepth / fWeightSum), or 0 when no weight was accumulated. */ |
| |
|
|
| const float fReconstructedDepthBilinearWeightThreshold = 0.1f; |
| float fCurrentDepthViewSpace = GetViewSpaceDepth(fCurrentDepthSample); |
| BilinearSamplingData bilinearInfo = GetBilinearSamplingData(fUvSample, RenderSize()); |
|
|
| float fDepth = 0.0f; |
| float fWeightSum = 0.0f; |
|
|
| float4 fPrevDepthSamples; |
| GatherReconstructedPreviousDepthRQuad(bilinearInfo.fQuadCenterUv, fPrevDepthSamples); |
| /* One gather supplies all four previous-depth samples; the loop reads them with the same iSampleIndex used for iOffsets and fWeights. */ |
| |
|
|
| for (int32_t iSampleIndex = 0; iSampleIndex < 4; iSampleIndex++) |
| { |
| const int32_t2 iOffset = bilinearInfo.iOffsets[iSampleIndex]; |
| const int32_t2 iSamplePos = bilinearInfo.iBasePos + iOffset; |
|
|
| const float fWeight = bilinearInfo.fWeights[iSampleIndex]; |
| const bool onscreen = IsOnScreen(iSamplePos, RenderSize()); |
| fWeightSum += onscreen ? 0.f : fWeight; /* off-screen: weight counted with no matching fDepth contribution */ |
| if (onscreen) |
| { |
| if (fWeight > fReconstructedDepthBilinearWeightThreshold) |
| { |
| const float fPrevDepthSample = fPrevDepthSamples[iSampleIndex]; |
| const float fPrevNearestDepthViewSpace = GetViewSpaceDepth(fPrevDepthSample); |
| const float fDepthDiff = fCurrentDepthViewSpace - fPrevNearestDepthViewSpace; |
|
|
| if (fDepthDiff > 0.0f) { |
|
|
| #ifdef INVERTED_DEPTH |
| const float fPlaneDepth = min(fPrevDepthSample, fCurrentDepthSample); |
| #else |
| const float fPlaneDepth = max(fPrevDepthSample, fCurrentDepthSample); |
| #endif |
|
|
| const float3 fCenter = GetViewSpacePosition(int32_t2(RenderSize() * 0.5f), RenderSize(), fPlaneDepth); |
| const float3 fCorner = GetViewSpacePosition(int32_t2(0, 0), RenderSize(), fPlaneDepth); |
|
|
| const float fHalfViewportWidth = length(float2(RenderSize())); /* despite the name, this is the full diagonal length of the render size in pixels */ |
| const float fDepthThreshold = max(fCurrentDepthViewSpace, fPrevNearestDepthViewSpace); |
|
|
| const float Ksep = 1.37e-05f; |
| const float Kfov = length(fCorner) / length(fCenter); |
| const float fRequiredDepthSeparation = Ksep * Kfov * fHalfViewportWidth * fDepthThreshold; |
|
|
| const float fResolutionFactor = saturate(length(float2(RenderSize())) / length(float2(1920.0f, 1080.0f))); /* render diagonal relative to a 1920x1080 diagonal, clamped to [0, 1] */ |
| const float fPower = lerp(1.0f, 3.0f, fResolutionFactor); |
| fDepth += pow(saturate(float(fRequiredDepthSeparation / fDepthDiff)), fPower) * fWeight; |
| fWeightSum += fWeight; |
| } |
| } |
| } |
| } |
|
|
| return (fWeightSum > 0) ? saturate(1.0f - fDepth / fWeightSum) : 0.0f; |
| } |
|
|
|
|
| /* Stores `derivative` in the first two channels of _PreProcessLumaDerivOut at `pixel`; the remaining channels are written as 0 and 1. */ |
| void WriteLumaDerivative(int32_t2 pixel, half2 derivative) |
| { |
|     const half2 unused_channels = half2(0.f, 1.f); |
|     const half4 texel = half4(derivative, unused_channels); |
|     imageStore(_PreProcessLumaDerivOut, pixel, texel); |
| } |
|
|
|
|
| /* Stores the encoded offset, divided by 255, in the first channel of _NearestDepthCoordOut at `pixel`. */ |
| /* LoadDepthNearestDepthOffsetTm1 reverses this with "* 255 + 0.5" when it reads the previous-frame texture. */ |
| void WriteNearestDepthOffset(int32_t2 pixel, uint8_t offset) |
| { |
|     const half enc_norm = half(offset) / 255.HF; |
|     const half4 texel = half4(enc_norm, 0.HF, 0.HF, 1.HF); |
|     imageStore(_NearestDepthCoordOut, pixel, texel); |
| } |
|
|
|
|
| /* Quantises twelve per-pixel values with _InputQuantParams and writes them to _PreprocessTensor at coordinate (0, outputPixel.y, outputPixel.x, 0). */ |
| /* Channel order: history.rgb, input_colour.rgb, disocclusion_mask, temporal_feedback.rgba, luma_derivative. */ |
| void WriteToTensor(int32_t2 outputPixel, half3 input_colour, half3 history, half disocclusion_mask, half luma_derivative, half4 temporal_feedback) |
| { |
|     /* Same grouping as the three TensorElement members. */ |
|     TensorElement packed; |
|     packed.wh_rgb_col_r = Quantize(half4(history.rgb, input_colour.r), _InputQuantParams); |
|     packed.col_gb_dm_fback_r = Quantize(half4(input_colour.gb, disocclusion_mask, temporal_feedback.r), _InputQuantParams); |
|     packed.fback_gba_ld = Quantize(half4(temporal_feedback.gba, luma_derivative), _InputQuantParams); |
| |
|     const int8_t4 q0 = packed.wh_rgb_col_r; |
|     const int8_t4 q1 = packed.col_gb_dm_fback_r; |
|     const int8_t4 q2 = packed.fback_gba_ld; |
| |
|     /* Flatten to the 12-element array handed to the tensor write. */ |
|     int8_t channels[12] = |
|     { |
|         q0.x, q0.y, q0.z, q0.w, |
|         q1.x, q1.y, q1.z, q1.w, |
|         q2.x, q2.y, q2.z, q2.w |
|     }; |
|     tensorWriteARM(_PreprocessTensor, uint[](0, outputPixel.y, outputPixel.x, 0), channels); |
| } |
|
|
|
|
| |
| layout(local_size_x = 16, local_size_y = 16) in; /* 16x16 workgroups; each invocation handles the input pixel given by gl_GlobalInvocationID.xy */ |
| void main() |
| { |
| int32_t2 input_pixel = int32_t2(gl_GlobalInvocationID.xy); |
| if (any(greaterThanEqual(input_pixel, _InputDims))) return; /* invocations outside the input image do nothing */ |
|
|
| float2 uv = (float2(input_pixel) + 0.5f) * _InvInputDims; /* UV of the pixel centre */ |
|
|
| /* Depth dilation: nearest depth in the 3x3 neighbourhood and the offset where it was found. */ |
| |
| |
| float depth_dilated = float(0.f); |
| int32_t2 nearest_pixel_offset = int32_t2(0); |
| FindNearestDepth(input_pixel, RenderSize(), depth_dilated, nearest_pixel_offset); |
|
|
| /* Motion is read at the nearest-depth texel rather than at input_pixel itself. */ |
| |
| |
| half2 motion = LoadMotion(input_pixel + nearest_pixel_offset); |
| /* motion_pix is the motion scaled by the render size. */ |
| /* The motion is zeroed unless the squared length of motion_pix exceeds _MotionWarpThresh. */ |
| half2 motion_pix = motion * half2(RenderSize()); |
| motion *= half(dot(motion_pix, motion_pix) > _MotionWarpThresh); |
|
|
| /* Reprojected UV; the Tm1 jitter offset is subtracted only for the depth-clip lookup below. */ |
| float2 reproj_uv = uv - float2(motion); |
| float2 unjitter_tm1_uv = reproj_uv - _JitterOffsetTm1Uv; |
|
|
| /* Disocclusion estimate from ComputeDepthClip, using the dilated depth. */ |
| |
| |
| half disocclusion_mask = half(ComputeDepthClip(unjitter_tm1_uv, depth_dilated)); |
|
|
| /* The mask is multiplied by _DisocclusionScale unless the squared pixel motion exceeds _MotionDisThresh. */ |
| /* motion_pix was computed before the motion was zeroed, so this test uses the unthresholded motion. */ |
| half dm_scale = dot(motion_pix, motion_pix) > _MotionDisThresh ? half(1.0f) : _DisocclusionScale; |
| disocclusion_mask = disocclusion_mask * dm_scale; |
|
|
| /* History colour sampled at the reprojected UV. */ |
| |
| |
| half3 warped_history = WarpHistory(reproj_uv); |
|
|
| /* Current colour fetched at this pixel. */ |
| |
| |
| half3 jittered_colour = LoadColour(input_pixel); |
|
|
| /* luma_derivative.x is the blended derivative and .y the current luma. */ |
| |
| |
| |
| half2 luma_derivative = CalculateLumaDerivative(reproj_uv, jittered_colour, disocclusion_mask); |
|
|
| /* Feedback tensor sampled at the reprojected UV and dequantised. */ |
| |
| |
| half4 temporal_feedback = WarpFeedback(reproj_uv); |
|
|
| /* Encode the nearest-depth offset as a single byte for the r8 output image. */ |
| |
| |
| uint8_t enc_depth_offset = EncodeNearestDepthCoord(nearest_pixel_offset); |
|
|
| /* Outputs: the tensor receives only luma_derivative.x; the full (derivative, luma) pair goes to the image written last. */ |
| |
| |
| |
| WriteToTensor( |
| input_pixel, |
| jittered_colour, |
| warped_history, |
| disocclusion_mask, |
| luma_derivative.x, |
| temporal_feedback |
| ); |
| /* Encoded nearest-depth offset for this pixel. */ |
| |
| WriteNearestDepthOffset(input_pixel, enc_depth_offset); |
| /* Luma derivative and luma for this pixel. */ |
| |
| WriteLumaDerivative(input_pixel, luma_derivative); |
| } |
|
|