diff --git a/Assets/Settings/Mobile/Mobile_High.asset b/Assets/Settings/Mobile/Mobile_High.asset index f896eca..8e18294 100644 --- a/Assets/Settings/Mobile/Mobile_High.asset +++ b/Assets/Settings/Mobile/Mobile_High.asset @@ -28,7 +28,7 @@ MonoBehaviour: m_SupportsHDR: 1 m_HDRColorBufferPrecision: 0 m_MSAA: 1 - m_RenderScale: 0.6667969 + m_RenderScale: 1 m_UpscalingFilter: 1 m_FsrOverrideSharpness: 1 m_FsrSharpness: 1 @@ -73,7 +73,7 @@ MonoBehaviour: m_UseAdaptivePerformance: 1 m_ColorGradingMode: 0 m_ColorGradingLutSize: 32 - m_UseFastSRGBLinearConversion: 1 + m_UseFastSRGBLinearConversion: 0 m_SupportDataDrivenLensFlare: 0 m_ShadowType: 1 m_LocalShadowsSupported: 0 @@ -114,5 +114,5 @@ MonoBehaviour: m_PrefilterNativeRenderPass: 1 m_ShaderVariantLogLevel: 0 m_ShadowCascades: 0 - superResolution: 16 + superResolution: 3 vrsRate: 0 diff --git a/Assets/Settings/Mobile/Mobile_High_Renderer.asset b/Assets/Settings/Mobile/Mobile_High_Renderer.asset index 37f656a..0cdc1ce 100644 --- a/Assets/Settings/Mobile/Mobile_High_Renderer.asset +++ b/Assets/Settings/Mobile/Mobile_High_Renderer.asset @@ -125,6 +125,27 @@ MonoBehaviour: superResolution: 4 spatialUpScaleSettings: sharpness: 0.35 +--- !u!114 &-4150689035588204578 +MonoBehaviour: + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_GameObject: {fileID: 0} + m_Enabled: 1 + m_EditorHideFlags: 0 + m_Script: {fileID: 11500000, guid: a0b8751ae563c82438124306d98fffd5, type: 3} + m_Name: HziCullingFeature + m_EditorClassIdentifier: + m_Active: 1 + settings: + UseTowCullPass: 0 + UseTextureAABB: 1 + UseCompute: 0 + CullMat: {fileID: 2100000, guid: ce13e27ede10980489261083a699b449, type: 2} + CullShader: {fileID: 7200000, guid: 029fde5b4200eeb4e919fe924f26fdd8, type: 3} + CullTextureSize: 64 + UseThreeFrameReadback: 1 --- !u!114 &-2621301742936824463 MonoBehaviour: m_ObjectHideFlags: 0 @@ -203,9 +224,10 @@ MonoBehaviour: settings: PyramidFunc: 0 
SkipThreeMip: 0 - ComputeShader: {fileID: 0} - Spd: {fileID: 0} - CopyDepth: {fileID: 0} + UseThreeFrameReadback: 0 + ComputeShader: {fileID: 7200000, guid: b10c09f63c5a5864ca12e07b1b361d56, type: 3} + Spd: {fileID: 7200000, guid: 2c0519b45f80f5c47b4ee3edb2e7931b, type: 3} + CopyDepth: {fileID: 2100000, guid: 5f8ec7ea87b60a1448091fade6a0e68f, type: 2} --- !u!114 &-1133205096357012623 MonoBehaviour: m_ObjectHideFlags: 0 @@ -271,7 +293,8 @@ MonoBehaviour: - {fileID: 4071882023117421450} - {fileID: -7284859345190182597} - {fileID: -1629415145513658388} - m_RendererFeatureMap: bc3f630842f2e70dd6a559c442a94bfd4529d15534f2d3de228858dca8d12222716523fbf3439fdb7a327b7bff4bdd446ac59dfa966ffa88ca6373cd5da9013d6cff55ca297e5e908a7b3653203b82383b2141bb05fbe69aec5704e48e2763e9 + - {fileID: -4150689035588204578} + m_RendererFeatureMap: bc3f630842f2e70dd6a559c442a94bfd4529d15534f2d3de228858dca8d12222716523fbf3439fdb7a327b7bff4bdd446ac59dfa966ffa88ca6373cd5da9013d6cff55ca297e5e908a7b3653203b82383b2141bb05fbe69aec5704e48e2763e9de07a83c50ca65c6 m_UseNativeRenderPass: 0 postProcessData: {fileID: 11400000, guid: 41439944d30ece34e96484bdb6645b55, type: 2} shaders: @@ -421,7 +444,7 @@ MonoBehaviour: m_Name: XESS2 m_EditorClassIdentifier: m_Active: 0 - Quality: 104 + Quality: 106 --- !u!114 &4395980931634258890 MonoBehaviour: m_ObjectHideFlags: 0 diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/CullDebug.cs b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/CullDebug.cs new file mode 100644 index 0000000..72b5e6c --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/CullDebug.cs @@ -0,0 +1,17 @@ +using UnityEngine; + +public class CullDebug : MonoBehaviour +{ + public int Index; + public Vector3 Position; + public Vector3 Size; + public bool IsCulled = false; + + private void OnDrawGizmos() + { + var col = Gizmos.color; + Gizmos.color = Color.green; + 
Gizmos.DrawWireCube(Position, Size); + Gizmos.color = col; + } +} diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/CullDebug.cs.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/CullDebug.cs.meta new file mode 100644 index 0000000..166d53a --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/CullDebug.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: dab00d3922eb1ec4494d2db2fbb0c6c4 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HierarchicalZFeature.cs b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HierarchicalZFeature.cs index 0926496..81e5790 100644 --- a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HierarchicalZFeature.cs +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HierarchicalZFeature.cs @@ -1,7 +1,6 @@ -using Codice.Client.GameUI.Update; using System; -using Unity.Burst.CompilerServices; using UnityEngine; +using UnityEngine.Experimental.Rendering; using UnityEngine.Rendering; using UnityEngine.Rendering.Universal; @@ -21,6 +20,7 @@ namespace X.Rendering.Feature { public EDepthPyramidFunc PyramidFunc = EDepthPyramidFunc.CopyDepth; public bool SkipThreeMip; + // 留历史 depth,做 tow pass cull public bool UseThreeFrameReadback; public ComputeShader ComputeShader; public ComputeShader Spd; @@ -56,11 +56,6 @@ namespace X.Rendering.Feature internal class HizPass : ScriptableRenderPass, IDisposable { - private static readonly int depthTextureId = Shader.PropertyToID("_CameraDepthTexture"); - private static readonly int depthInputId = Shader.PropertyToID("_InputDepth"); - private static readonly int depthPyramidTexId 
= Shader.PropertyToID("_DepthPyramidTexture"); - private static readonly int inputScaleAndMaxIndexId = Shader.PropertyToID("_InputScaleAndMaxIndex"); - private readonly Settings settings; private ProfilingSampler profiler; @@ -71,18 +66,22 @@ namespace X.Rendering.Feature private int mipLevelCount; private Vector2Int[] mipLevelOffsets; private Vector2Int[] mipLevelSizes; - private Vector4[] mipOffsetAndSizes = new Vector4[32]; + private Vector4[] mipOffsetAndSizes = new Vector4[16]; private RTHandle[] depthPyramidTexs = new RTHandle[3]; private static string[] depthPyramidNames = new string[3] { "_DepthPyramidTextureA", "_DepthPyramidTextureB", "_DepthPyramidTextureC" }; + RenderTexture SpdAtomicCounter; + public HizPass(Settings settings) { renderPassEvent = RenderPassEvent.BeforeRenderingPostProcessing; profiler = new("DepthPyramid"); this.settings = settings; - mipLevelOffsets = new Vector2Int[32]; - mipLevelSizes = new Vector2Int[32]; + mipLevelOffsets = new Vector2Int[16]; + mipLevelSizes = new Vector2Int[16]; + SpdAtomicCounter = new RenderTexture(1, 1, 0, GraphicsFormat.R32_UInt) { name = "FSR2_SpdAtomicCounter", enableRandomWrite = true }; + SpdAtomicCounter.Create(); } public static int HizIndex { get; private set; } @@ -112,8 +111,9 @@ namespace X.Rendering.Feature cachedSkipThreeMip = settings.SkipThreeMip; mip0SizeNOP = viewportSize; - int resizeX = Mathf.IsPowerOfTwo(viewportSize.x) ? viewportSize.x : Mathf.NextPowerOfTwo(viewportSize.x); - int resizeY = Mathf.IsPowerOfTwo(viewportSize.y) ? viewportSize.y : Mathf.NextPowerOfTwo(viewportSize.y); + // PowerOfTwo 不会留缝隙 + int resizeX = settings.PyramidFunc == EDepthPyramidFunc.SPD || Mathf.IsPowerOfTwo(viewportSize.x) ? viewportSize.x : Mathf.NextPowerOfTwo(viewportSize.x); + int resizeY = settings.PyramidFunc == EDepthPyramidFunc.SPD || Mathf.IsPowerOfTwo(viewportSize.y) ? 
viewportSize.y : Mathf.NextPowerOfTwo(viewportSize.y); Vector2Int hardwareTextureSize = new Vector2Int(resizeX, resizeY); mipLevelOffsets[0] = Vector2Int.zero; mipLevelSizes[0] = Vector2Int.zero; @@ -179,17 +179,27 @@ namespace X.Rendering.Feature var sampleName = "DepthDownSample"; cmd.BeginSample(sampleName); cmd.SetViewProjectionMatrices(Matrix4x4.identity, Matrix4x4.identity); - cmd.SetGlobalTexture(depthInputId, depthTex); + cmd.SetGlobalTexture(HizShaderIds.DepthInputId, depthTex); for (int i = 1; i < mipLevelCount; i++) { + var index = i; + if (settings.SkipThreeMip && i < 3 + 1) + { + continue; + } + else if (settings.SkipThreeMip && i == 3 + 1) + { + index = 1; + } + var mipSize = mipLevelSizes[i]; - var inputMipSize = i == 1 ? mip0SizeNOP : mipLevelSizes[i - 1]; + var inputMipSize = index == 1 ? mip0SizeNOP : mipLevelSizes[i - 1]; var texId = depthMipId[i]; - cmd.SetGlobalVector(inputScaleAndMaxIndexId, new Vector4(inputMipSize.x / (float)mipSize.x, inputMipSize.y / (float)mipSize.y, inputMipSize.x - 1, inputMipSize.y - 1)); + cmd.SetGlobalVector(HizShaderIds.InputScaleAndMaxIndexId, new Vector4(inputMipSize.x / (float)mipSize.x, inputMipSize.y / (float)mipSize.y, inputMipSize.x - 1, inputMipSize.y - 1)); cmd.GetTemporaryRT(texId, mipSize.x, mipSize.y, 0, FilterMode.Point, RenderTextureFormat.RFloat); cmd.SetRenderTarget(texId, RenderBufferLoadAction.DontCare, RenderBufferStoreAction.Store); cmd.DrawMesh(RenderingUtils.fullscreenMesh, Matrix4x4.identity, settings.CopyDepth, 0, 0); - cmd.SetGlobalTexture(depthInputId, texId); + cmd.SetGlobalTexture(HizShaderIds.DepthInputId, texId); } cmd.EndSample(sampleName); @@ -212,8 +222,13 @@ namespace X.Rendering.Feature cmd.SetRenderTarget(hizRt, RenderBufferLoadAction.DontCare, RenderBufferStoreAction.Store); for (int i = 1; i < mipLevelCount; i++) { + if (settings.SkipThreeMip && i < 3 + 1) + { + continue; + } + var texId = depthMipId[i]; - cmd.SetGlobalTexture(depthInputId, texId); + 
cmd.SetGlobalTexture(HizShaderIds.DepthInputId, texId); var mipSize = mipLevelSizes[i]; var mipOffset = mipLevelOffsets[i]; mipOffsetAndSizes[i] = new (mipOffset.x, mipOffset.y, mipSize.x, mipSize.y); @@ -221,7 +236,7 @@ namespace X.Rendering.Feature cmd.DrawMesh(RenderingUtils.fullscreenMesh, Matrix4x4.identity, settings.CopyDepth, 0, 1); } - cmd.SetGlobalTexture(depthPyramidTexId, hizRt); + cmd.SetGlobalTexture(HizShaderIds.DepthPyramidTexId, hizRt); cmd.EndSample(sampleName); for (int i = 0; i < depthMipId.Length; i++) @@ -273,7 +288,7 @@ namespace X.Rendering.Feature } } - cmd.SetComputeTextureParam(settings.ComputeShader, kernelId, depthInputId, i == 0 ? depthTex : hizBuffer); + cmd.SetComputeTextureParam(settings.ComputeShader, kernelId, HizShaderIds.DepthInputId, i == 0 ? depthTex : hizBuffer); cmd.SetComputeTextureParam(settings.ComputeShader, kernelId, "_DepthMipChain", hizBuffer); int inputMipIndex = startMip; @@ -303,19 +318,51 @@ namespace X.Rendering.Feature cmd.DispatchCompute(settings.ComputeShader, kernelId, Mathf.CeilToInt(outputMipSize.x / 8f), Mathf.CeilToInt(outputMipSize.y / 8f), 1); mipCnt = mipCnt - 4; } - cmd.SetGlobalTexture(depthPyramidTexId, hizBuffer); + cmd.SetGlobalTexture(HizShaderIds.DepthPyramidTexId, hizBuffer); } private void DoSpdDepth(CommandBuffer cmd, Texture depthTex) { + var hizIndex = GetHizIndex(); + RTHandle hizBuffer = depthPyramidTexs[hizIndex]; + RenderingUtils.ReAllocateIfNeeded(ref hizBuffer, new RenderTextureDescriptor() + { + width = depthPyramidTextureSize.x, + height = depthPyramidTextureSize.y, + dimension = TextureDimension.Tex2D, + colorFormat = RenderTextureFormat.RFloat, + msaaSamples = 1, + enableRandomWrite = true, + }, filterMode: FilterMode.Point, name: depthPyramidNames[hizIndex]); + depthPyramidTexs[hizIndex] = hizBuffer; + + var dispatchX = Mathf.CeilToInt(mip0SizeNOP.x / 64f); + var dispatchY = Mathf.CeilToInt(mip0SizeNOP.y / 64f); + cmd.SetComputeIntParam(settings.Spd, "mips", mipLevelCount); + 
cmd.SetComputeIntParam(settings.Spd, "numWorkGroups", dispatchX); + + for (int i = 0; i < mipLevelCount; i++) + { + var mipSize = mipLevelSizes[i]; + var mipOffset = mipLevelOffsets[i]; + mipOffsetAndSizes[i] = new(mipOffset.x, mipOffset.y, mipSize.x, mipSize.y); + } + cmd.SetComputeVectorArrayParam(settings.Spd, "_MipOffsetAndSizeArray", mipOffsetAndSizes); + + cmd.SetComputeTextureParam(settings.Spd, 0, "_InputDepth", depthTex); + cmd.SetComputeTextureParam(settings.Spd, 0, "_OutDepth", hizBuffer); + cmd.SetComputeTextureParam(settings.Spd, 0, "rw_internal_global_atomic", SpdAtomicCounter); + + cmd.DispatchCompute(settings.Spd, 0, dispatchX, dispatchY, 1); + cmd.SetGlobalTexture(HizShaderIds.DepthPyramidTexId, hizBuffer); } public override void Execute(ScriptableRenderContext context, ref RenderingData renderingData) { var cmd = renderingData.commandBuffer; using var soc = new ProfilingScope(cmd, profiler); - Texture depthTex = Shader.GetGlobalTexture(depthTextureId); + Texture depthTex = Shader.GetGlobalTexture(HizShaderIds.CameraDepthTextureId); if(depthTex == null) { return; @@ -337,10 +384,20 @@ namespace X.Rendering.Feature default: break; } + + cmd.SetGlobalVector(HizShaderIds.Mip0SizeId, new Vector4(mip0SizeNOP.x, mip0SizeNOP.y, 0, 0)); + cmd.SetGlobalVector(HizShaderIds.MipmapLevelMinMaxIndexId, new Vector4(1, mipLevelCount, 0, 0)); + cmd.SetGlobalVectorArray(HizShaderIds.MipOffsetAndSizeArrayId, mipOffsetAndSizes); } public void Dispose() { + if (SpdAtomicCounter != null) + { + SpdAtomicCounter.Release(); + SpdAtomicCounter = null; + } + for (int i = 0; i < depthPyramidTexs.Length; i++) { var rt = depthPyramidTexs[i]; diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HizShaderIds.cs b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HizShaderIds.cs new file mode 100644 index 0000000..d854d29 --- /dev/null +++ 
b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HizShaderIds.cs @@ -0,0 +1,21 @@ +using UnityEngine; + +namespace X.Rendering.Feature +{ + internal static class HizShaderIds + { + internal static readonly int CameraDepthTextureId = Shader.PropertyToID("_CameraDepthTexture"); + internal static readonly int DepthInputId = Shader.PropertyToID("_InputDepth"); + internal static readonly int DepthPyramidTexId = Shader.PropertyToID("_DepthPyramidTexture"); + internal static readonly int InputScaleAndMaxIndexId = Shader.PropertyToID("_InputScaleAndMaxIndex"); + + internal static readonly int ObjectAABBCenterId = Shader.PropertyToID("_ObjectAABBTexture0"); + internal static readonly int ObjectAABBSizeId = Shader.PropertyToID("_ObjectAABBTexture1"); + internal static readonly int GPUCullingVPId = Shader.PropertyToID("_GPUCullingVP"); + internal static readonly int MipmapLevelMinMaxIndexId = Shader.PropertyToID("_MipmapLevelMinMaxIndex"); + internal static readonly int Mip0SizeId = Shader.PropertyToID("_Mip0Size"); + internal static readonly int MipOffsetAndSizeArrayId = Shader.PropertyToID("_MipOffsetAndSize"); + + + } +} diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HizShaderIds.cs.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HizShaderIds.cs.meta new file mode 100644 index 0000000..d0c970e --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HizShaderIds.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: f821bd9bdc49933439e3e92efe04bee3 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HziCullingFeature.cs 
b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HziCullingFeature.cs index d3b147f..398aa04 100644 --- a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HziCullingFeature.cs +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/HziCullingFeature.cs @@ -1,10 +1,146 @@ using System; +using System.Collections.Generic; +using Unity.Collections; +using Unity.Mathematics; using UnityEngine; +using UnityEngine.Experimental.Rendering; using UnityEngine.Rendering; using UnityEngine.Rendering.Universal; +using static X.Rendering.Feature.HziCullingFeature; namespace X.Rendering.Feature { + public class SceneRenderObjects + { + private static SceneRenderObjects intstance; + public static SceneRenderObjects Instance => intstance; + + private float4[] centers; + private float4[] sizes; + + private float3x2[] aabbs; + + List renderers = new(); + + static SceneRenderObjects() + { + intstance = new SceneRenderObjects(); + } + + public void Register(Renderer renderer) + { + if(!renderers.Contains(renderer)) + { + renderers.Add(renderer); + } + } + + public void UnRegister(Renderer renderer) + { + renderers.Remove(renderer); + } + + public void UpdateAABB(ComputeBuffer aabbBuffer) + { + if(aabbBuffer == null || aabbs.Length != renderers.Count) + { + aabbs = new float3x2[renderers.Count]; + } + for (int i = 0; i < renderers.Count; i++) + { + var rdr = renderers[i]; + if (rdr != null) + { + Bounds bounds = rdr.bounds; + aabbs[i] = new float3x2(bounds.center,bounds.size); + } + else + { + aabbs[i] = float3x2.zero; + } + } + + aabbBuffer.SetData(aabbs); + } + + public void UpdateAABB(Texture2D centerTex, Texture2D sizeTex) + { + if (centers == null || centers.Length != centerTex.width * centerTex.height) + { + centers = new float4[centerTex.width * centerTex.height]; + sizes = new float4[centerTex.width * centerTex.height]; + } + + if (renderers.Count == 0) + { + foreach (var item in 
GameObject.FindObjectsOfType()) + { + Register(item); + } + } + + int nullCnt = 0; + for (int i = 0; i < renderers.Count; i++) + { + var rdr = renderers[i]; + if (rdr != null) + { + Bounds bounds = rdr.bounds; + centers[i] = new float4(bounds.center, 1); + sizes[i] = new float4(bounds.size, 1); + } + else + { + nullCnt++; + centers[i] = float4.zero; + sizes[i] = float4.zero; + } + } + + if (nullCnt == renderers.Count) + { + renderers.Clear(); + } + + centerTex.SetPixelData(centers, 0); + centerTex.Apply(); + sizeTex.SetPixelData(sizes, 0); + sizeTex.Apply(); + } + + internal void ApplyCull(CullResult cullResult) + { + if (cullResult.ReadDone) + { + for (var i = 0; i < renderers.Count; i++) + { + var rdr = renderers[i]; + if (rdr) + { +#if UNITY_EDITOR + if(rdr.GetComponent() is CullDebug cullDebug && cullDebug) + { + cullDebug.Index = i; + cullDebug.IsCulled = cullResult.ResultArray[i] > 0; + cullDebug.Position = centers[i].xyz; + cullDebug.Size = sizes[i].xyz; + } + +#endif + //if (cullResult.ResultArray[i] > 0) + //{ + // rdr.renderingLayerMask = 0; + //} + //else + { + rdr.renderingLayerMask = 1; + } + } + } + } + } + } + public class HziCullingFeature : ScriptableRendererFeature { [Serializable] @@ -13,6 +149,13 @@ namespace X.Rendering.Feature //TODO: 通过使用上一帧的摄像机位置(包括矩阵)和上一帧的深度图做剔除,储存已经被剔除的物体和未被剔除的物体,然后绘制未被剔除的物体到GBuffer(包含深度图), //再二次生成HiZ DepthTexture,并对已经被剔除的物体使用一遍新的深度和当前摄像机的位置做一次剔除判断 public bool UseTowCullPass; + public bool UseTextureAABB = true; + public bool UseCompute; + public Material CullMat; + public ComputeShader CullShader; + [Range(32, 128)] + public int CullTextureSize = 64; + public bool UseThreeFrameReadback = true; } [SerializeField] @@ -32,23 +175,160 @@ namespace X.Rendering.Feature cullPass = new(settings); } - internal class CullPass : ScriptableRenderPass + protected override void Dispose(bool disposing) + { + base.Dispose(disposing); + cullPass.Dispose(); + } + + internal class CullResult : IDisposable + { + public RenderTexture 
ResultTex; + public NativeArray ResultArray; + public bool ReadDone = false; + public Action ReadBackAction; + + private void ReadBack(AsyncGPUReadbackRequest readback) + { + if (readback.done && !readback.hasError) + { + var data = readback.GetData(); + if (ResultArray.IsCreated) + { + ResultArray.CopyFrom(data); + } + ReadDone = true; + } + else + { + ReadDone = false; + } + } + + public CullResult CreateResources(int texSize) + { + ResultArray.Dispose(); + ResultArray = new NativeArray(texSize * texSize, Allocator.Persistent); + if(ResultTex != null) + { + ResultTex.Release(); + } + + ResultTex = new(texSize, texSize, 0, GraphicsFormat.R16_SFloat); + ResultTex.filterMode = FilterMode.Point; + ReadBackAction = ReadBack; + return this; + } + + public void Dispose() + { + if (ResultTex != null) + { + ResultTex.Release(); + } + ResultTex = null; + ResultArray.Dispose(); + } + } + + internal class CullPass : ScriptableRenderPass, IDisposable { private Settings settings; private ProfilingSampler profiler; - + Texture2D centerTex; + Texture2D sizeTex; + CullResult[] cullResults; public CullPass(Settings settings) { profiler = new("Hi-Z Culling"); this.settings = settings; + int texSize = settings.CullTextureSize; + centerTex = new Texture2D(texSize, texSize, GraphicsFormat.R32G32B32A32_SFloat, TextureCreationFlags.None); + centerTex.filterMode = FilterMode.Point; + sizeTex = new Texture2D(texSize, texSize, GraphicsFormat.R32G32B32A32_SFloat, TextureCreationFlags.None); + sizeTex.filterMode = FilterMode.Point; + cullResults = new CullResult[3] + { + new CullResult().CreateResources(texSize), + new CullResult().CreateResources(texSize), + new CullResult().CreateResources(texSize), + }; + + renderPassEvent = RenderPassEvent.AfterRendering; + } + + public void Dispose() + { + Texture.DestroyImmediate(centerTex); + Texture.DestroyImmediate(sizeTex); + for (int i = 0; i < cullResults.Length; i++) + { + cullResults[i]?.Dispose(); + } + } + + private int GetHizIndex() + { + if 
(settings.UseThreeFrameReadback) + { + return Time.frameCount % 3; + } + + return 0; + } + + private int GetLastCullIndex(int hizIndex) + { + switch (hizIndex) + { + case 0: + return 1; + case 1: + return 2; + case 2: + return 0; + default: + throw new ArgumentException("参数错误 hizIndex:" + hizIndex); + } } public override void Execute(ScriptableRenderContext context, ref RenderingData renderingData) { - var cmd = renderingData.commandBuffer; + var cmd = CommandBufferPool.Get("HizBuf"); using var soc = new ProfilingScope(cmd, profiler); + cmd.Clear(); + if (settings.UseTextureAABB) + { + SceneRenderObjects.Instance.UpdateAABB(centerTex, sizeTex); + } + var hizIndex = GetHizIndex(); + var cullResult = cullResults[hizIndex]; + SceneRenderObjects.Instance.ApplyCull(cullResults[GetLastCullIndex(hizIndex)]); + cullResult.ReadDone = false; + var world2Project = renderingData.cameraData.GetGPUProjectionMatrix() * renderingData.cameraData.GetViewMatrix(); + if(settings.UseCompute) + { + } + else + { + world2Project.m11 *= -1; + world2Project.m13 *= -1; + + cmd.SetGlobalMatrix(HizShaderIds.GPUCullingVPId, world2Project); + cmd.SetGlobalTexture(HizShaderIds.ObjectAABBCenterId, centerTex); + cmd.SetGlobalTexture(HizShaderIds.ObjectAABBSizeId, sizeTex); + cmd.SetRenderTarget(cullResult.ResultTex, RenderBufferLoadAction.DontCare, RenderBufferStoreAction.Store); + cmd.DrawMesh(RenderingUtils.fullscreenMesh, Matrix4x4.identity, settings.CullMat, 0, 0); + var temp = new NativeArray(centerTex.width * sizeTex.height, Allocator.Persistent); + cmd.RequestAsyncReadbackIntoNativeArray(ref temp, cullResult.ResultTex, 0, cullResult.ReadBackAction); + //cmd.RequestAsyncReadbackIntoNativeArray(ref cullResult.ResultArray, cullResult.ResultTex, 0, cullResult.ReadBackAction); + } + context.ExecuteCommandBuffer(cmd); + CommandBufferPool.Release(cmd); } + + } } } diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/Hidden_HizCull.mat 
b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/Hidden_HizCull.mat new file mode 100644 index 0000000..28de900 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/Hidden_HizCull.mat @@ -0,0 +1,29 @@ +%YAML 1.1 +%TAG !u! tag:unity3d.com,2011: +--- !u!21 &2100000 +Material: + serializedVersion: 8 + m_ObjectHideFlags: 0 + m_CorrespondingSourceObject: {fileID: 0} + m_PrefabInstance: {fileID: 0} + m_PrefabAsset: {fileID: 0} + m_Name: Hidden_HizCull + m_Shader: {fileID: 4800000, guid: 1cd43651e6c755046991faf6246110fa, type: 3} + m_Parent: {fileID: 0} + m_ModifiedSerializedProperties: 0 + m_ValidKeywords: [] + m_InvalidKeywords: [] + m_LightmapFlags: 4 + m_EnableInstancingVariants: 0 + m_DoubleSidedGI: 0 + m_CustomRenderQueue: -1 + stringTagMap: {} + disabledShaderPasses: [] + m_LockedProperties: + m_SavedProperties: + serializedVersion: 3 + m_TexEnvs: [] + m_Ints: [] + m_Floats: [] + m_Colors: [] + m_BuildTextureStacks: [] diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/Hidden_HizCull.mat.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/Hidden_HizCull.mat.meta new file mode 100644 index 0000000..d90cc9b --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/Hidden_HizCull.mat.meta @@ -0,0 +1,8 @@ +fileFormatVersion: 2 +guid: ce13e27ede10980489261083a699b449 +NativeFormatImporter: + externalObjects: {} + mainObjectFileID: 2100000 + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.compute b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.compute new file mode 100644 index 0000000..ad8fcb5 --- /dev/null +++ 
b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.compute @@ -0,0 +1,14 @@ +// Each #kernel tells which function to compile; you can have many kernels +#pragma kernel CSMain + +// Create a RenderTexture with enableRandomWrite flag and set it +// with cs.SetTexture +RWTexture2D Result; + +[numthreads(8,8,1)] +void CSMain (uint3 id : SV_DispatchThreadID) +{ + // TODO: insert actual code here! + + Result[id.xy] = float4(id.x & id.y, (id.x & 15)/15.0, (id.y & 15)/15.0, 0.0); +} diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.compute.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.compute.meta new file mode 100644 index 0000000..142db07 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.compute.meta @@ -0,0 +1,7 @@ +fileFormatVersion: 2 +guid: 029fde5b4200eeb4e919fe924f26fdd8 +ComputeShaderImporter: + externalObjects: {} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.shader b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.shader new file mode 100644 index 0000000..fe1cfe2 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.shader @@ -0,0 +1,135 @@ +Shader "Hidden/HizCull" +{ + SubShader + { + Pass + { + HLSLPROGRAM + #pragma vertex Vertex + #pragma fragment CullFrag + #pragma enable_d3d11_debug_symbols + + #include "Packages/com.unity.render-pipelines.universal/ShaderLibrary/Core.hlsl" + + struct Attributes + { + float4 positionOS : POSITION; + float2 texcoord : TEXCOORD0; + }; + + struct Varyings + { + float2 uv : TEXCOORD0; + float4 positionCS : SV_POSITION; + }; + + Varyings Vertex(Attributes input) + { + Varyings output = 
(Varyings)0; + output.positionCS = TransformObjectToHClip(input.positionOS.xyz); + output.uv = input.texcoord; + return output; + } + + TEXTURE2D(_ObjectAABBTexture0); + SAMPLER(sampler_ObjectAABBTexture0); + TEXTURE2D(_ObjectAABBTexture1); + SAMPLER(sampler_ObjectAABBTexture1); + + + TEXTURE2D(_DepthPyramidTexture); + SAMPLER(sampler_DepthPyramidTexture); + float4x4 _GPUCullingVP; + float2 _MipmapLevelMinMaxIndex; + float2 _Mip0Size; + float4 _MipOffsetAndSize[16]; + + static const float3 aggressiveExtentArray[8] = + { + float3(1, 1, 1), + float3(1, 1, -1), + float3(1, -1, 1), + float3(1, -1, -1), + float3(-1, 1, 1), + float3(-1, 1, -1), + float3(-1, -1, 1), + float3(-1, -1, -1) + }; + + half CullFrag(Varyings input) : SV_Target + { + float2 uv = input.uv; + float4 aabbCenter = SAMPLE_TEXTURE2D_LOD(_ObjectAABBTexture0, sampler_ObjectAABBTexture0, uv,0.0); + float4 aabbSize = SAMPLE_TEXTURE2D_LOD(_ObjectAABBTexture1, sampler_ObjectAABBTexture1, uv, 0.0); + float3 aabbExtent = aabbSize.xyz * 0.5;//贴图可以直接存extent + UNITY_BRANCH + if (aabbCenter.a == 0.0) + { + return 1; + } + + #ifdef UNITY_REVERSED_Z + float minZ = 0; + #else + float minZ = 1; + #endif + + float2 maxXY = 0; + float2 minXY = 1; + for(uint i = 0; i < 8; ++i) + { + float3 boxCorner = aabbCenter + aabbExtent * aggressiveExtentArray[i]; + float4 clipPos = mul(_GPUCullingVP, float4(boxCorner, 1)); + clipPos /= clipPos.w; + minXY = min(clipPos.xy, minXY); + maxXY = max(clipPos.xy, maxXY); + #ifdef UNITY_REVERSED_Z + minZ = max(minZ, clipPos.z); + #else + minZ = min(minZ, clipPos.z); + #endif + } + + float4 boxUVs = float4(minXY, maxXY); + boxUVs = saturate(boxUVs * 0.5 + 0.5); + float2 size = (boxUVs.zw - boxUVs.xy) * _Mip0Size.xy; + float mip = (log2(max(size.x, size.y) / 2.0f)); + + mip = ceil(mip); + mip = clamp(mip, _MipmapLevelMinMaxIndex.x, _MipmapLevelMinMaxIndex.y); + + // float level_lower = max(mip - 1, 0); + // float2 scale = exp2(-level_lower) / _Mip0Size.xy; + // float2 a = 
floor(boxUVs.xy*scale); + // float2 b = ceil(boxUVs.zw*scale); + // float2 dims = b - a; + // + // // Use the lower level if we only touch <= 2 texels in both dimensions + // if (dims.x <= 2 && dims.y <= 2) + // mip = level_lower; + + float4 offsetAndSize = _MipOffsetAndSize[mip]; + int4 pxMinMax = boxUVs * offsetAndSize.zwzw + offsetAndSize.xyxy; + + float4 depth = float4(LOAD_TEXTURE2D_LOD(_DepthPyramidTexture, pxMinMax.xy,0).r, + LOAD_TEXTURE2D_LOD(_DepthPyramidTexture, pxMinMax.zy,0).r, + LOAD_TEXTURE2D_LOD(_DepthPyramidTexture, pxMinMax.xw,0).r, + LOAD_TEXTURE2D_LOD(_DepthPyramidTexture, pxMinMax.zw,0).r + ); + + #ifdef UNITY_REVERSED_Z + depth.xy = min(depth.xy, depth.zw); + depth.x = min(depth.x, depth.y); + return minZ >= depth.x ? 0 : 1; + #else + depth.xy = max(depth.xy, depth.zw); + depth.x = max(depth.x, depth.y); + return minZ <= depth.x ? 1 : 0; + #endif + } + + ENDHLSL + } + + } +} diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.shader.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.shader.meta new file mode 100644 index 0000000..e228c2d --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/HizCull.shader.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 1cd43651e6c755046991faf6246110fa +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/SPD.compute b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/SPD.compute index ad8fcb5..1e792e0 100644 --- a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/SPD.compute +++ 
b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/SPD.compute @@ -1,14 +1,126 @@ // Each #kernel tells which function to compile; you can have many kernels #pragma kernel CSMain +#define FFX_SPD_NO_WAVE_OPERATIONS 1 +// #pragma multi_compile FFX_SPD_NO_WAVE_OPERATIONS _ -// Create a RenderTexture with enableRandomWrite flag and set it -// with cs.SetTexture -RWTexture2D Result; +#define FFX_GPU 1 +#define FFX_HLSL 1 +#define FFX_HALF 1 -[numthreads(8,8,1)] -void CSMain (uint3 id : SV_DispatchThreadID) -{ - // TODO: insert actual code here! +#define FFXM_GPU 1 +#define FFXM_HALF 1 +#define FFXM_HLSL 1 - Result[id.xy] = float4(id.x & id.y, (id.x & 15)/15.0, (id.y & 15)/15.0, 0.0); +#include "ffx/ffx_core.h" +#include "ffx/ffx_common_types.h" + +FFX_GROUPSHARED FfxFloat32 spdIntermediate[16][16]; +void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value) { + spdIntermediate[x][y] = value.x; } + +FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y) { + FfxFloat32 f = spdIntermediate[x][y]; + return FfxFloat32x4(f.x, f.x, f.x, f.x); +} + +FFX_GROUPSHARED FfxFloat32 spdIntermediateH[16][16]; +void SpdStoreIntermediateH(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value) { + spdIntermediateH[x][y] = value.x; +} + +FfxFloat32x4 SpdLoadIntermediateH(FfxUInt32 x, FfxUInt32 y) { + FfxFloat32 f = spdIntermediateH[x][y]; + return FfxFloat32x4(f.x, f.x, f.x, f.x); +} + +FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3) { + #if FFX_SSSR_OPTION_INVERTED_DEPTH + return max(max(v0, v1), max(v2, v3)); + #else + return min(min(v0, v1), min(v2, v3)); + #endif +} + +FfxFloat16x4 SpdReduce4H(FfxFloat16x4 v0, FfxFloat16x4 v1, FfxFloat16x4 v2, FfxFloat16x4 v3) { + #if FFX_SSSR_OPTION_INVERTED_DEPTH + return max(max(v0, v1), max(v2, v3)); + #else + return min(min(v0, v1), min(v2, v3)); + #endif +} +Texture2D _InputDepth; +RWTexture2D _OutDepth; +float4 _MipOffsetAndSizeArray[16]; + +FfxFloat32x4 
SpdLoad(FfxInt32x2 coordinate, FfxUInt32 slice) +{ + return _InputDepth[coordinate].xxxx; // 5 -> 6 as we store a copy of the depth buffer at index 0 +} + +void SpdStore(FfxInt32x2 pix, FfxFloat32x4 outValue, FfxUInt32 coordinate, FfxUInt32 slice) +{ + uint4 cur = _MipOffsetAndSizeArray[coordinate + 1]; + _OutDepth[pix + cur.xy] = outValue.x; // + 1 as we store a copy of the depth buffer at index 0 +} + +FfxFloat32x4 SpdLoadH(FfxInt32x2 coordinate, FfxUInt32 slice) +{ + return _InputDepth[coordinate].xxxx; // 5 -> 6 as we store a copy of the depth buffer at index 0 +} + +void SpdStoreH(FfxInt32x2 pix, FfxFloat32x4 outValue, FfxUInt32 coordinate, FfxUInt32 slice) +{ + uint4 cur = _MipOffsetAndSizeArray[coordinate + 1]; + _OutDepth[pix + cur.xy] = outValue.x; // + 1 as we store a copy of the depth buffer at index 0 +} + +FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 coordinate, FfxUInt32 slice) +{ + return _InputDepth[coordinate].xxxx; +} + +FfxFloat32x4 SpdLoadSourceImageH(FfxInt32x2 coordinate, FfxUInt32 slice) +{ + return _InputDepth[coordinate].xxxx; +} + +FfxUInt32 mips; +FfxUInt32 numWorkGroups; +FFX_GROUPSHARED FfxUInt32 spdCounter; +RWTexture2D rw_internal_global_atomic; + +void IncreaseAtomicCounter(FFX_PARAMETER_IN FfxUInt32 slice, FFX_PARAMETER_INOUT FfxUInt32 counter) +{ + InterlockedAdd(rw_internal_global_atomic[FfxInt32x2(0, 0)] , 1, counter); +} + +void ResetAtomicCounter(FFX_PARAMETER_IN FfxUInt32 slice) +{ + rw_internal_global_atomic[FfxInt32x2(0, 0)] = 0; +} + +FfxUInt32 SpdGetAtomicCounter() +{ + return spdCounter; +} + +void SpdResetAtomicCounter(FfxUInt32 slice) +{ + ResetAtomicCounter(slice); +} + +void SpdIncreaseAtomicCounter(FfxUInt32 slice) +{ + IncreaseAtomicCounter(slice, spdCounter); +} +#include "ffx/ffx_spd.h" + +[numthreads(256,1,1)] +void CSMain (uint LocalThreadIndex : SV_GroupIndex, uint3 WorkGroupId : SV_GroupID) +{ + SpdDownsampleH(WorkGroupId.xy, LocalThreadIndex, mips, numWorkGroups, WorkGroupId.z, 0); +} + + + diff --git 
a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx.meta new file mode 100644 index 0000000..16c6d26 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx.meta @@ -0,0 +1,8 @@ +fileFormatVersion: 2 +guid: 7ffb36fd571dced4d9a653922e758cea +folderAsset: yes +DefaultImporter: + externalObjects: {} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_common_types.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_common_types.h new file mode 100644 index 0000000..2c4f0ba --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_common_types.h @@ -0,0 +1,558 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (C) 2024 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#ifndef FFX_COMMON_TYPES_H +#define FFX_COMMON_TYPES_H + +#if defined(FFX_CPU) +#define FFX_PARAMETER_IN +#define FFX_PARAMETER_OUT +#define FFX_PARAMETER_INOUT +#define FFX_PARAMETER_UNIFORM +#elif defined(FFX_HLSL) +#define FFX_PARAMETER_IN in +#define FFX_PARAMETER_OUT out +#define FFX_PARAMETER_INOUT inout +#define FFX_PARAMETER_UNIFORM uniform +#elif defined(FFX_GLSL) +#define FFX_PARAMETER_IN in +#define FFX_PARAMETER_OUT out +#define FFX_PARAMETER_INOUT inout +#define FFX_PARAMETER_UNIFORM const //[cacao_placeholder] until a better fit is found! +#endif // #if defined(FFX_CPU) + +#if defined(FFX_CPU) +/// A typedef for a boolean value. +/// +/// @ingroup CPUTypes +typedef bool FfxBoolean; + +/// A typedef for a unsigned 8bit integer. +/// +/// @ingroup CPUTypes +typedef uint8_t FfxUInt8; + +/// A typedef for a unsigned 16bit integer. +/// +/// @ingroup CPUTypes +typedef uint16_t FfxUInt16; + +/// A typedef for a unsigned 32bit integer. +/// +/// @ingroup CPUTypes +typedef uint32_t FfxUInt32; + +/// A typedef for a unsigned 64bit integer. +/// +/// @ingroup CPUTypes +typedef uint64_t FfxUInt64; + +/// A typedef for a signed 8bit integer. +/// +/// @ingroup CPUTypes +typedef int8_t FfxInt8; + +/// A typedef for a signed 16bit integer. +/// +/// @ingroup CPUTypes +typedef int16_t FfxInt16; + +/// A typedef for a signed 32bit integer. +/// +/// @ingroup CPUTypes +typedef int32_t FfxInt32; + +/// A typedef for a signed 64bit integer. +/// +/// @ingroup CPUTypes +typedef int64_t FfxInt64; + +/// A typedef for a floating point value. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32; + +/// A typedef for a 2-dimensional floating point value. 
+/// +/// @ingroup CPUTypes +typedef float FfxFloat32x2[2]; + +/// A typedef for a 3-dimensional floating point value. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32x3[3]; + +/// A typedef for a 4-dimensional floating point value. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32x4[4]; + +/// A typedef for a 2x2 floating point matrix. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32x2x2[4]; + +/// A typedef for a 3x3 floating point matrix. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32x3x3[9]; + +/// A typedef for a 3x4 floating point matrix. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32x3x4[12]; + +/// A typedef for a 4x4 floating point matrix. +/// +/// @ingroup CPUTypes +typedef float FfxFloat32x4x4[16]; + +/// A typedef for a 2-dimensional 32bit signed integer. +/// +/// @ingroup CPUTypes +typedef int32_t FfxInt32x2[2]; + +/// A typedef for a 3-dimensional 32bit signed integer. +/// +/// @ingroup CPUTypes +typedef int32_t FfxInt32x3[3]; + +/// A typedef for a 4-dimensional 32bit signed integer. +/// +/// @ingroup CPUTypes +typedef int32_t FfxInt32x4[4]; + +/// A typedef for a 2-dimensional 32bit unsigned integer. +/// +/// @ingroup CPUTypes +typedef uint32_t FfxUInt32x2[2]; + +/// A typedef for a 3-dimensional 32bit unsigned integer. +/// +/// @ingroup CPUTypes +typedef uint32_t FfxUInt32x3[3]; + +/// A typedef for a 4-dimensional 32bit unsigned integer. +/// +/// @ingroup CPUTypes +typedef uint32_t FfxUInt32x4[4]; +#endif // #if defined(FFX_CPU) + +#if defined(FFX_HLSL) + +#define FfxFloat32Mat4 matrix +#define FfxFloat32Mat3 matrix + +/// A typedef for a boolean value. +/// +/// @ingroup HLSLTypes +typedef bool FfxBoolean; + +#if FFX_HLSL_SM>=62 + +/// @defgroup HLSL62Types HLSL 6.2 And Above Types +/// HLSL 6.2 and above type defines for all commonly used variables +/// +/// @ingroup HLSLTypes + +/// A typedef for a floating point value. 
+/// +/// @ingroup HLSL62Types +typedef float32_t FfxFloat32; + +/// A typedef for a 2-dimensional floating point value. +/// +/// @ingroup HLSL62Types +typedef float32_t2 FfxFloat32x2; + +/// A typedef for a 3-dimensional floating point value. +/// +/// @ingroup HLSL62Types +typedef float32_t3 FfxFloat32x3; + +/// A typedef for a 4-dimensional floating point value. +/// +/// @ingroup HLSL62Types +typedef float32_t4 FfxFloat32x4; + +/// A [cacao_placeholder] typedef for matrix type until confirmed. +typedef float4x4 FfxFloat32x4x4; +typedef float3x4 FfxFloat32x3x4; +typedef float3x3 FfxFloat32x3x3; +typedef float2x2 FfxFloat32x2x2; + +/// A typedef for a unsigned 32bit integer. +/// +/// @ingroup HLSL62Types +typedef uint32_t FfxUInt32; + +/// A typedef for a 2-dimensional 32bit unsigned integer. +/// +/// @ingroup HLSL62Types +typedef uint32_t2 FfxUInt32x2; + +/// A typedef for a 3-dimensional 32bit unsigned integer. +/// +/// @ingroup HLSL62Types +typedef uint32_t3 FfxUInt32x3; + +/// A typedef for a 4-dimensional 32bit unsigned integer. +/// +/// @ingroup HLSL62Types +typedef uint32_t4 FfxUInt32x4; + +/// A typedef for a signed 32bit integer. +/// +/// @ingroup HLSL62Types +typedef int32_t FfxInt32; + +/// A typedef for a 2-dimensional signed 32bit integer. +/// +/// @ingroup HLSL62Types +typedef int32_t2 FfxInt32x2; + +/// A typedef for a 3-dimensional signed 32bit integer. +/// +/// @ingroup HLSL62Types +typedef int32_t3 FfxInt32x3; + +/// A typedef for a 4-dimensional signed 32bit integer. +/// +/// @ingroup HLSL62Types +typedef int32_t4 FfxInt32x4; + +#else // #if FFX_HLSL_SM>=62 + +/// @defgroup HLSLBaseTypes HLSL 6.1 And Below Types +/// HLSL 6.1 and below type defines for all commonly used variables +/// +/// @ingroup HLSLTypes + +#define FfxFloat32 float +#define FfxFloat32x2 float2 +#define FfxFloat32x3 float3 +#define FfxFloat32x4 float4 + +/// A [cacao_placeholder] typedef for matrix type until confirmed. 
+#define FfxFloat32x4x4 float4x4 +#define FfxFloat32x3x4 float3x4 +#define FfxFloat32x3x3 float3x3 +#define FfxFloat32x2x2 float2x2 + +/// A typedef for a unsigned 32bit integer. +/// +/// @ingroup GPU +typedef uint FfxUInt32; +typedef uint2 FfxUInt32x2; +typedef uint3 FfxUInt32x3; +typedef uint4 FfxUInt32x4; + +typedef int FfxInt32; +typedef int2 FfxInt32x2; +typedef int3 FfxInt32x3; +typedef int4 FfxInt32x4; + +#endif // #if FFX_HLSL_SM>=62 + +#if FFX_HALF + +#if FFX_HLSL_SM >= 62 + +typedef float16_t FfxFloat16; +typedef float16_t2 FfxFloat16x2; +typedef float16_t3 FfxFloat16x3; +typedef float16_t4 FfxFloat16x4; + +/// A typedef for an unsigned 16bit integer. +/// +/// @ingroup HLSLTypes +typedef uint16_t FfxUInt16; +typedef uint16_t2 FfxUInt16x2; +typedef uint16_t3 FfxUInt16x3; +typedef uint16_t4 FfxUInt16x4; + +/// A typedef for a signed 16bit integer. +/// +/// @ingroup HLSLTypes +typedef int16_t FfxInt16; +typedef int16_t2 FfxInt16x2; +typedef int16_t3 FfxInt16x3; +typedef int16_t4 FfxInt16x4; +#else // #if FFX_HLSL_SM>=62 +typedef min16float FfxFloat16; +typedef min16float2 FfxFloat16x2; +typedef min16float3 FfxFloat16x3; +typedef min16float4 FfxFloat16x4; + +/// A typedef for an unsigned 16bit integer. +/// +/// @ingroup HLSLTypes +typedef min16uint FfxUInt16; +typedef min16uint2 FfxUInt16x2; +typedef min16uint3 FfxUInt16x3; +typedef min16uint4 FfxUInt16x4; + +/// A typedef for a signed 16bit integer. +/// +/// @ingroup HLSLTypes +typedef min16int FfxInt16; +typedef min16int2 FfxInt16x2; +typedef min16int3 FfxInt16x3; +typedef min16int4 FfxInt16x4; +#endif // #if FFX_HLSL_SM>=62 + +#endif // FFX_HALF + +#endif // #if defined(FFX_HLSL) + +#if defined(FFX_GLSL) + +#define FfxFloat32Mat4 mat4 +#define FfxFloat32Mat3 mat3 + +/// A typedef for a boolean value. 
+/// +/// @ingroup GLSLTypes +#define FfxBoolean bool +#define FfxFloat32 float +#define FfxFloat32x2 vec2 +#define FfxFloat32x3 vec3 +#define FfxFloat32x4 vec4 +#define FfxUInt32 uint +#define FfxUInt32x2 uvec2 +#define FfxUInt32x3 uvec3 +#define FfxUInt32x4 uvec4 +#define FfxInt32 int +#define FfxInt32x2 ivec2 +#define FfxInt32x3 ivec3 +#define FfxInt32x4 ivec4 + +/// A [cacao_placeholder] typedef for matrix type until confirmed. +#define FfxFloat32x4x4 mat4 +#define FfxFloat32x3x4 mat4x3 +#define FfxFloat32x3x3 mat3 +#define FfxFloat32x2x2 mat2 + +#if FFX_HALF +#define FfxFloat16 float16_t +#define FfxFloat16x2 f16vec2 +#define FfxFloat16x3 f16vec3 +#define FfxFloat16x4 f16vec4 +#define FfxUInt16 uint16_t +#define FfxUInt16x2 u16vec2 +#define FfxUInt16x3 u16vec3 +#define FfxUInt16x4 u16vec4 +#define FfxInt16 int16_t +#define FfxInt16x2 i16vec2 +#define FfxInt16x3 i16vec3 +#define FfxInt16x4 i16vec4 +#endif // FFX_HALF +#endif // #if defined(FFX_GLSL) + +// Global toggles: +// #define FFX_HALF (1) +// #define FFX_HLSL_SM (62) + +#if FFX_HALF + +#if FFX_HLSL_SM >= 62 + +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType##16_t TypeName; +#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#else //FFX_HLSL_SM>=62 + +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef min16##BaseComponentType TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#define FFX_16BIT_SCALAR( TypeName, 
BaseComponentType ) FFX_MIN16_SCALAR( TypeName, BaseComponentType ); +#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ); +#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ); + +#endif //FFX_HLSL_SM>=62 + +#else //FFX_HALF + +#define FFX_MIN16_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType TypeName; +#define FFX_MIN16_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_MIN16_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#define FFX_16BIT_SCALAR( TypeName, BaseComponentType ) typedef BaseComponentType TypeName; +#define FFX_16BIT_VECTOR( TypeName, BaseComponentType, COL ) typedef vector TypeName; +#define FFX_16BIT_MATRIX( TypeName, BaseComponentType, ROW, COL ) typedef matrix TypeName; + +#endif //FFX_HALF + +#if defined(FFX_GPU) +// Common typedefs: +#if defined(FFX_HLSL) +FFX_MIN16_SCALAR( FFX_MIN16_F , float ); +FFX_MIN16_VECTOR( FFX_MIN16_F2, float, 2 ); +FFX_MIN16_VECTOR( FFX_MIN16_F3, float, 3 ); +FFX_MIN16_VECTOR( FFX_MIN16_F4, float, 4 ); + +FFX_MIN16_SCALAR( FFX_MIN16_I, int ); +FFX_MIN16_VECTOR( FFX_MIN16_I2, int, 2 ); +FFX_MIN16_VECTOR( FFX_MIN16_I3, int, 3 ); +FFX_MIN16_VECTOR( FFX_MIN16_I4, int, 4 ); + +FFX_MIN16_SCALAR( FFX_MIN16_U, uint ); +FFX_MIN16_VECTOR( FFX_MIN16_U2, uint, 2 ); +FFX_MIN16_VECTOR( FFX_MIN16_U3, uint, 3 ); +FFX_MIN16_VECTOR( FFX_MIN16_U4, uint, 4 ); + +FFX_16BIT_SCALAR( FFX_F16_t , float ); +FFX_16BIT_VECTOR( FFX_F16_t2, float, 2 ); +FFX_16BIT_VECTOR( FFX_F16_t3, float, 3 ); +FFX_16BIT_VECTOR( FFX_F16_t4, float, 4 ); + +FFX_16BIT_SCALAR( FFX_I16_t, int ); +FFX_16BIT_VECTOR( FFX_I16_t2, int, 2 ); +FFX_16BIT_VECTOR( FFX_I16_t3, int, 3 ); +FFX_16BIT_VECTOR( FFX_I16_t4, int, 4 ); + +FFX_16BIT_SCALAR( FFX_U16_t, uint ); +FFX_16BIT_VECTOR( FFX_U16_t2, uint, 2 ); +FFX_16BIT_VECTOR( FFX_U16_t3, uint, 3 ); +FFX_16BIT_VECTOR( 
FFX_U16_t4, uint, 4 ); + +#define TYPEDEF_MIN16_TYPES(Prefix) \ +typedef FFX_MIN16_F Prefix##_F; \ +typedef FFX_MIN16_F2 Prefix##_F2; \ +typedef FFX_MIN16_F3 Prefix##_F3; \ +typedef FFX_MIN16_F4 Prefix##_F4; \ +typedef FFX_MIN16_I Prefix##_I; \ +typedef FFX_MIN16_I2 Prefix##_I2; \ +typedef FFX_MIN16_I3 Prefix##_I3; \ +typedef FFX_MIN16_I4 Prefix##_I4; \ +typedef FFX_MIN16_U Prefix##_U; \ +typedef FFX_MIN16_U2 Prefix##_U2; \ +typedef FFX_MIN16_U3 Prefix##_U3; \ +typedef FFX_MIN16_U4 Prefix##_U4; + +#define TYPEDEF_16BIT_TYPES(Prefix) \ +typedef FFX_16BIT_F Prefix##_F; \ +typedef FFX_16BIT_F2 Prefix##_F2; \ +typedef FFX_16BIT_F3 Prefix##_F3; \ +typedef FFX_16BIT_F4 Prefix##_F4; \ +typedef FFX_16BIT_I Prefix##_I; \ +typedef FFX_16BIT_I2 Prefix##_I2; \ +typedef FFX_16BIT_I3 Prefix##_I3; \ +typedef FFX_16BIT_I4 Prefix##_I4; \ +typedef FFX_16BIT_U Prefix##_U; \ +typedef FFX_16BIT_U2 Prefix##_U2; \ +typedef FFX_16BIT_U3 Prefix##_U3; \ +typedef FFX_16BIT_U4 Prefix##_U4; + +#define TYPEDEF_FULL_PRECISION_TYPES(Prefix) \ +typedef FfxFloat32 Prefix##_F; \ +typedef FfxFloat32x2 Prefix##_F2; \ +typedef FfxFloat32x3 Prefix##_F3; \ +typedef FfxFloat32x4 Prefix##_F4; \ +typedef FfxInt32 Prefix##_I; \ +typedef FfxInt32x2 Prefix##_I2; \ +typedef FfxInt32x3 Prefix##_I3; \ +typedef FfxInt32x4 Prefix##_I4; \ +typedef FfxUInt32 Prefix##_U; \ +typedef FfxUInt32x2 Prefix##_U2; \ +typedef FfxUInt32x3 Prefix##_U3; \ +typedef FfxUInt32x4 Prefix##_U4; +#endif // #if defined(FFX_HLSL) + +#if defined(FFX_GLSL) + +#if FFX_HALF + +#define FFX_MIN16_F float16_t +#define FFX_MIN16_F2 f16vec2 +#define FFX_MIN16_F3 f16vec3 +#define FFX_MIN16_F4 f16vec4 + +#define FFX_MIN16_I int16_t +#define FFX_MIN16_I2 i16vec2 +#define FFX_MIN16_I3 i16vec3 +#define FFX_MIN16_I4 i16vec4 + +#define FFX_MIN16_U uint16_t +#define FFX_MIN16_U2 u16vec2 +#define FFX_MIN16_U3 u16vec3 +#define FFX_MIN16_U4 u16vec4 + +#define FFX_16BIT_F float16_t +#define FFX_16BIT_F2 f16vec2 +#define FFX_16BIT_F3 f16vec3 +#define 
FFX_16BIT_F4 f16vec4 + +#define FFX_16BIT_I int16_t +#define FFX_16BIT_I2 i16vec2 +#define FFX_16BIT_I3 i16vec3 +#define FFX_16BIT_I4 i16vec4 + +#define FFX_16BIT_U uint16_t +#define FFX_16BIT_U2 u16vec2 +#define FFX_16BIT_U3 u16vec3 +#define FFX_16BIT_U4 u16vec4 + +#else // FFX_HALF + +#define FFX_MIN16_F float +#define FFX_MIN16_F2 vec2 +#define FFX_MIN16_F3 vec3 +#define FFX_MIN16_F4 vec4 + +#define FFX_MIN16_I int +#define FFX_MIN16_I2 ivec2 +#define FFX_MIN16_I3 ivec3 +#define FFX_MIN16_I4 ivec4 + +#define FFX_MIN16_U uint +#define FFX_MIN16_U2 uvec2 +#define FFX_MIN16_U3 uvec3 +#define FFX_MIN16_U4 uvec4 + +#define FFX_16BIT_F float +#define FFX_16BIT_F2 vec2 +#define FFX_16BIT_F3 vec3 +#define FFX_16BIT_F4 vec4 + +#define FFX_16BIT_I int +#define FFX_16BIT_I2 ivec2 +#define FFX_16BIT_I3 ivec3 +#define FFX_16BIT_I4 ivec4 + +#define FFX_16BIT_U uint +#define FFX_16BIT_U2 uvec2 +#define FFX_16BIT_U3 uvec3 +#define FFX_16BIT_U4 uvec4 + +#endif // FFX_HALF + +#endif // #if defined(FFX_GLSL) + +#endif // #if defined(FFX_GPU) +#endif // #ifndef FFX_COMMON_TYPES_H diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_common_types.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_common_types.h.meta new file mode 100644 index 0000000..0668405 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_common_types.h.meta @@ -0,0 +1,27 @@ +fileFormatVersion: 2 +guid: 7db508df1f69cd949909e1597a58f09b +PluginImporter: + externalObjects: {} + serializedVersion: 2 + iconMap: {} + executionOrder: {} + defineConstraints: [] + isPreloaded: 0 + isOverridable: 1 + isExplicitlyReferenced: 0 + validateReferences: 1 + platformData: + - first: + Any: + second: + enabled: 1 + settings: {} + - first: + Editor: Editor + second: + enabled: 0 + settings: + DefaultValueInitialized: true + userData: + assetBundleName: + 
assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core.h new file mode 100644 index 0000000..d1ed144 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core.h @@ -0,0 +1,80 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (C) 2024 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +/// @defgroup FfxGPU GPU +/// The FidelityFX SDK GPU References +/// +/// @ingroup ffxSDK + +/// @defgroup FfxHLSL HLSL References +/// FidelityFX SDK HLSL GPU References +/// +/// @ingroup FfxGPU + +/// @defgroup FfxGLSL GLSL References +/// FidelityFX SDK GLSL GPU References +/// +/// @ingroup FfxGPU + +/// @defgroup FfxGPUEffects FidelityFX GPU References +/// FidelityFX Effect GPU Reference Documentation +/// +/// @ingroup FfxGPU + +/// @defgroup GPUCore GPU Core +/// GPU defines and functions +/// +/// @ingroup FfxGPU + +#if !defined(FFX_CORE_H) +#define FFX_CORE_H + +#ifdef __hlsl_dx_compiler +#pragma dxc diagnostic push +#pragma dxc diagnostic ignored "-Wambig-lit-shift" +#endif //__hlsl_dx_compiler + +#include "ffx_common_types.h" + +#if defined(FFX_CPU) + #include "ffx_core_cpu.h" +#endif // #if defined(FFX_CPU) + +#if defined(FFX_GLSL) && defined(FFX_GPU) + #include "ffx_core_glsl.h" +#endif // #if defined(FFX_GLSL) && defined(FFX_GPU) + +#if defined(FFX_HLSL) && defined(FFX_GPU) + #include "ffx_core_hlsl.h" +#endif // #if defined(FFX_HLSL) && defined(FFX_GPU) + +#if defined(FFX_GPU) + #include "ffx_core_gpu_common.h" + #include "ffx_core_gpu_common_half.h" + #include "ffx_core_portability.h" +#endif // #if defined(FFX_GPU) + +#ifdef __hlsl_dx_compiler +#pragma dxc diagnostic pop +#endif //__hlsl_dx_compiler + +#endif // #if !defined(FFX_CORE_H) diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core.h.meta new file mode 100644 index 0000000..09ddd81 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core.h.meta @@ -0,0 +1,27 @@ +fileFormatVersion: 2 +guid: 59fa5955b83fd5a45a2c8698750e7314 +PluginImporter: + externalObjects: {} + serializedVersion: 2 + iconMap: {} + executionOrder: {} + defineConstraints: [] + isPreloaded: 0 + 
isOverridable: 1 + isExplicitlyReferenced: 0 + validateReferences: 1 + platformData: + - first: + Any: + second: + enabled: 1 + settings: {} + - first: + Editor: Editor + second: + enabled: 0 + settings: + DefaultValueInitialized: true + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common.h new file mode 100644 index 0000000..9f88c94 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common.h @@ -0,0 +1,2736 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (C) 2024 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/// A define for a true value in a boolean expression. 
+/// +/// @ingroup GPUCore +#define FFX_TRUE (true) + +/// A define for a false value in a boolean expression. +/// +/// @ingroup GPUCore +#define FFX_FALSE (false) + +/// A define value for positive infinity. +/// +/// @ingroup GPUCore +#define FFX_POSITIVE_INFINITY_FLOAT ffxAsFloat(0x7f800000u) + +/// A define value for negative infinity. +/// +/// @ingroup GPUCore +#define FFX_NEGATIVE_INFINITY_FLOAT ffxAsFloat(0xff800000u) + +/// A define value for PI. +/// +/// @ingroup GPUCore +#define FFX_PI (3.14159) + +FFX_STATIC const FfxFloat32 FFX_FP16_MIN = 6.10e-05f; +FFX_STATIC const FfxFloat32 FFX_FP16_MAX = 65504.0f; +FFX_STATIC const FfxFloat32 FFX_TONEMAP_EPSILON = 1.0f / FFX_FP16_MAX; + +#define FFX_HAS_FLAG(v, f) ((v & f) == f) + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxFloat32 ffxMin(FfxFloat32 x, FfxFloat32 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxMin(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxMin(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. 
+/// +/// @ingroup GPUCore +FfxFloat32x4 ffxMin(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxInt32 ffxMin(FfxInt32 x, FfxInt32 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxInt32x2 ffxMin(FfxInt32x2 x, FfxInt32x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxInt32x3 ffxMin(FfxInt32x3 x, FfxInt32x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxInt32x4 ffxMin(FfxInt32x4 x, FfxInt32x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxUInt32 ffxMin(FfxUInt32 x, FfxUInt32 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. 
+/// +/// @ingroup GPUCore +FfxUInt32x2 ffxMin(FfxUInt32x2 x, FfxUInt32x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxUInt32x3 ffxMin(FfxUInt32x3 x, FfxUInt32x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of two values. +/// +/// @ingroup GPUCore +FfxUInt32x4 ffxMin(FfxUInt32x4 x, FfxUInt32x4 y) +{ + return min(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxFloat32 ffxMax(FfxFloat32 x, FfxFloat32 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxMax(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxMax(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. 
+/// +/// @ingroup GPUCore +FfxFloat32x4 ffxMax(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxInt32 ffxMax(FfxInt32 x, FfxInt32 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxInt32x2 ffxMax(FfxInt32x2 x, FfxInt32x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxInt32x3 ffxMax(FfxInt32x3 x, FfxInt32x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxInt32x4 ffxMax(FfxInt32x4 x, FfxInt32x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. +/// +/// @ingroup GPUCore +FfxUInt32 ffxMax(FfxUInt32 x, FfxUInt32 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of two values. 
+/// +/// @ingroup GPUCore +FfxUInt32x2 ffxMax(FfxUInt32x2 x, FfxUInt32x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxUInt32x3 ffxMax(FfxUInt32x3 x, FfxUInt32x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxUInt32x4 ffxMax(FfxUInt32x4 x, FfxUInt32x4 y) +{ + return max(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat32 ffxPow(FfxFloat32 x, FfxFloat32 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxPow(FfxFloat32x2 x, FfxFloat32x2 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxPow(FfxFloat32x3 x, FfxFloat32x3 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second.
+/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxPow(FfxFloat32x4 x, FfxFloat32x4 y) +{ + return pow(x, y); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat32 ffxSqrt(FfxFloat32 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxSqrt(FfxFloat32x2 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxSqrt(FfxFloat32x3 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxSqrt(FfxFloat32x4 x) +{ + return sqrt(x); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; expected to be non-negative, since the sign bit of s is OR-ed in and never cleared. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPUCore +FfxFloat32 ffxCopySignBit(FfxFloat32 d, FfxFloat32 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & FfxUInt32(0x80000000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; expected to be non-negative, since the sign bit of s is OR-ed in and never cleared. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s.
+/// +/// @ingroup GPUCore +FfxFloat32x2 ffxCopySignBit(FfxFloat32x2 d, FfxFloat32x2 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & ffxBroadcast2(0x80000000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; expected to be non-negative, since the sign bit of s is OR-ed in and never cleared. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxCopySignBit(FfxFloat32x3 d, FfxFloat32x3 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & ffxBroadcast3(0x80000000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; expected to be non-negative, since the sign bit of s is OR-ed in and never cleared. +/// @param [in] s The value to copy the sign bit from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxCopySignBit(FfxFloat32x4 d, FfxFloat32x4 s) +{ + return ffxAsFloat(ffxAsUInt32(d) | (ffxAsUInt32(s) & ffxBroadcast4(0x80000000u))); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0, positive, or NaN. +/// +/// @ingroup GPUCore +FfxFloat32 ffxIsSigned(FfxFloat32 m) +{ + return ffxSaturate(m * FfxFloat32(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic.
+/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0, positive, or NaN. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxIsSigned(FfxFloat32x2 m) +{ + return ffxSaturate(m * ffxBroadcast2(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0, positive, or NaN. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxIsSigned(FfxFloat32x3 m) +{ + return ffxSaturate(m * ffxBroadcast3(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0, positive, or NaN. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxIsSigned(FfxFloat32x4 m) +{ + return ffxSaturate(m * ffxBroadcast4(FFX_NEGATIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative.
+/// +/// @ingroup GPUCore +FfxFloat32 ffxIsGreaterThanZero(FfxFloat32 m) +{ + return ffxSaturate(m * FfxFloat32(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxIsGreaterThanZero(FfxFloat32x2 m) +{ + return ffxSaturate(m * ffxBroadcast2(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxIsGreaterThanZero(FfxFloat32x3 m) +{ + return ffxSaturate(m * ffxBroadcast3(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxIsGreaterThanZero(FfxFloat32x4 m) +{ + return ffxSaturate(m * ffxBroadcast4(FFX_POSITIVE_INFINITY_FLOAT)); +} + +/// Convert a 32bit floating point value to a sortable integer. +/// +/// - If sign bit=0, flip the sign bit (positives). +/// - If sign bit=1, flip all bits (negatives). +/// +/// The function has the side effects that: +/// - Larger integers are more positive values. +/// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage).
+/// +/// @param [in] value The floating point value to make sortable, passed as its 32-bit pattern. +/// +/// @returns +/// The sortable integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxFloatToSortableInteger(FfxUInt32 value) +{ + return value ^ ((ffxAShrSU1(value, FfxUInt32(31))) | FfxUInt32(0x80000000)); +} + +/// Convert a sortable integer to a 32bit floating point value. +/// +/// The conversion works as follows: +/// - If sign bit=1, flip the sign bit (positives). +/// - If sign bit=0, flip all bits (negatives). +/// +/// @param [in] value The sortable integer value to convert back. +/// +/// @returns +/// The 32-bit pattern of the floating point value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxSortableIntegerToFloat(FfxUInt32 value) +{ + return value ^ ((~ffxAShrSU1(value, FfxUInt32(31))) | FfxUInt32(0x80000000)); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the square root of. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateSqrt(FfxFloat32 value) +{ + return ffxAsFloat((ffxAsUInt32(value) >> FfxUInt32(1)) + FfxUInt32(0x1fbc4639)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateReciprocal(FfxFloat32 value) +{ + return ffxAsFloat(FfxUInt32(0x7ef07ebb) - ffxAsUInt32(value)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateReciprocalMedium(FfxFloat32 value) +{ + FfxFloat32 b = ffxAsFloat(FfxUInt32(0x7ef19fff) - ffxAsUInt32(value)); + return b * (-b * value + FfxFloat32(2.0)); +} + +/// Calculate a low-quality approximation for the reciprocal square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal square root of.
+/// +/// @returns +/// An approximation of the reciprocal square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateReciprocalSquareRoot(FfxFloat32 value) +{ + return ffxAsFloat(FfxUInt32(0x5f347d74) - (ffxAsUInt32(value) >> FfxUInt32(1))); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the square root of. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateSqrt(FfxFloat32x2 value) +{ + return ffxAsFloat((ffxAsUInt32(value) >> ffxBroadcast2(1u)) + ffxBroadcast2(0x1fbc4639u)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateReciprocal(FfxFloat32x2 value) +{ + return ffxAsFloat(ffxBroadcast2(0x7ef07ebbu) - ffxAsUInt32(value)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateReciprocalMedium(FfxFloat32x2 value) +{ + FfxFloat32x2 b = ffxAsFloat(ffxBroadcast2(0x7ef19fffu) - ffxAsUInt32(value)); + return b * (-b * value + ffxBroadcast2(2.0f)); +} + +/// Calculate a low-quality approximation for the reciprocal square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal square root of. +/// +/// @returns +/// An approximation of the reciprocal square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateReciprocalSquareRoot(FfxFloat32x2 value) +{ + return ffxAsFloat(ffxBroadcast2(0x5f347d74u) - (ffxAsUInt32(value) >> ffxBroadcast2(1u))); +} + +/// Calculate a low-quality approximation for the square root of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the square root of. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxApproximateSqrt(FfxFloat32x3 value) +{ + return ffxAsFloat((ffxAsUInt32(value) >> ffxBroadcast3(1u)) + ffxBroadcast3(0x1fbc4639u)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxApproximateReciprocal(FfxFloat32x3 value) +{ + return ffxAsFloat(ffxBroadcast3(0x7ef07ebbu) - ffxAsUInt32(value)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of.
+/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxApproximateReciprocalMedium(FfxFloat32x3 value) +{ + FfxFloat32x3 b = ffxAsFloat(ffxBroadcast3(0x7ef19fffu) - ffxAsUInt32(value)); + return b * (-b * value + ffxBroadcast3(2.0f)); +} + +/// Calculate a low-quality approximation for the reciprocal square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal square root of. +/// +/// @returns +/// An approximation of the reciprocal square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxApproximateReciprocalSquareRoot(FfxFloat32x3 value) +{ + return ffxAsFloat(ffxBroadcast3(0x5f347d74u) - (ffxAsUInt32(value) >> ffxBroadcast3(1u))); +} + +/// Calculate a low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the square root of. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxApproximateSqrt(FfxFloat32x4 value) +{ + return ffxAsFloat((ffxAsUInt32(value) >> ffxBroadcast4(1u)) + ffxBroadcast4(0x1fbc4639u)); +} + +/// Calculate a low-quality approximation for the reciprocal of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxApproximateReciprocal(FfxFloat32x4 value) +{ + return ffxAsFloat(ffxBroadcast4(0x7ef07ebbu) - ffxAsUInt32(value)); +} + +/// Calculate a medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal of. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxApproximateReciprocalMedium(FfxFloat32x4 value) +{ + FfxFloat32x4 b = ffxAsFloat(ffxBroadcast4(0x7ef19fffu) - ffxAsUInt32(value)); + return b * (-b * value + ffxBroadcast4(2.0f)); +} + +/// Calculate a low-quality approximation for the reciprocal square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] value The value to approximate the reciprocal square root of.
+/// +/// @returns +/// An approximation of the reciprocal square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat32x4 ffxApproximateReciprocalSquareRoot(FfxFloat32x4 value) +{ + return ffxAsFloat(ffxBroadcast4(0x5f347d74u) - (ffxAsUInt32(value) >> ffxBroadcast4(1u))); +} + +/// Calculate the dot product of 'a' and 'b'. +/// +/// @param [in] a First vector input. +/// @param [in] b Second vector input. +/// +/// @returns +/// The value of a dot b. +/// +/// @ingroup GPUCore +FfxFloat32 ffxDot2(FfxFloat32x2 a, FfxFloat32x2 b) +{ + return dot(a, b); +} + +/// Calculate the dot product of 'a' and 'b'. +/// +/// @param [in] a First vector input. +/// @param [in] b Second vector input. +/// +/// @returns +/// The value of a dot b. +/// +/// @ingroup GPUCore +FfxFloat32 ffxDot3(FfxFloat32x3 a, FfxFloat32x3 b) +{ + return dot(a, b); +} + +/// Calculate the dot product of 'a' and 'b'. +/// +/// @param [in] a First vector input. +/// @param [in] b Second vector input. +/// +/// @returns +/// The value of a dot b. +/// +/// @ingroup GPUCore +FfxFloat32 ffxDot4(FfxFloat32x4 a, FfxFloat32x4 b) +{ + return dot(a, b); +} + + +/// Compute an approximate conversion from PQ to Gamma2 space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The PQ value to convert to Gamma2. +/// +/// @returns +/// The value a converted into Gamma2, computed as a to the 4th power. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximatePQToGamma2Medium(FfxFloat32 a) +{ + return a * a * a * a; +} + +/// Compute an approximate conversion from PQ to linear space. +/// +/// PQ is very close to x^(1/8).
The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The PQ value to convert to linear. +/// +/// @returns +/// The value a converted into linear, computed as a to the 8th power. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximatePQToLinear(FfxFloat32 a) +{ + return a * a * a * a * a * a * a * a; +} + +/// Compute an approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The gamma2 value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, using the fast 4th root estimate. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateGamma2ToPQ(FfxFloat32 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(2)) + FfxUInt32(0x2F9A4E46)); +} + +/// Compute a more accurate approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The gamma2 value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, using the fast 4th root estimate refined by one Newton-Raphson step. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateGamma2ToPQMedium(FfxFloat32 a) +{ + FfxFloat32 b = ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(2)) + FfxUInt32(0x2F9A4E46)); + FfxFloat32 b4 = b * b * b * b; + return b - b * (b4 - a) / (FfxFloat32(4.0) * b4); +} + +/// Compute a high accuracy approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8).
The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The gamma2 value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, computed as two nested square roots. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateGamma2ToPQHigh(FfxFloat32 a) +{ + return ffxSqrt(ffxSqrt(a)); +} + +/// Compute an approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The linear value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, using the fast 8th root estimate. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateLinearToPQ(FfxFloat32 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(3)) + FfxUInt32(0x378D8723)); +} + +/// Compute a more accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The linear value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, using the fast 8th root estimate refined by one Newton-Raphson step. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateLinearToPQMedium(FfxFloat32 a) +{ + FfxFloat32 b = ffxAsFloat((ffxAsUInt32(a) >> FfxUInt32(3)) + FfxUInt32(0x378D8723)); + FfxFloat32 b8 = b * b * b * b * b * b * b * b; + return b - b * (b8 - a) / (FfxFloat32(8.0) * b8); +} + +/// Compute a very accurate approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8).
The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The linear value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, computed as three nested square roots. +/// +/// @ingroup GPUCore +FfxFloat32 ffxApproximateLinearToPQHigh(FfxFloat32 a) +{ + return ffxSqrt(ffxSqrt(ffxSqrt(a))); +} + +/// Compute an approximate conversion from PQ to Gamma2 space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The PQ value to convert to Gamma2. +/// +/// @returns +/// The value a converted into Gamma2, computed as a to the 4th power. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximatePQToGamma2Medium(FfxFloat32x2 a) +{ + return a * a * a * a; +} + +/// Compute an approximate conversion from PQ to linear space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The PQ value to convert to linear. +/// +/// @returns +/// The value a converted into linear, computed as a to the 8th power. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximatePQToLinear(FfxFloat32x2 a) +{ + return a * a * a * a * a * a * a * a; +} + +/// Compute an approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root).
The maximum error is approximately 0.2%. +/// +/// @param [in] a The gamma2 value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, using the fast 4th root estimate. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateGamma2ToPQ(FfxFloat32x2 a) +{ + return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(2u)) + ffxBroadcast2(0x2F9A4E46u)); +} + +/// Compute a more accurate approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The gamma2 value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, using the fast 4th root estimate refined by one Newton-Raphson step. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateGamma2ToPQMedium(FfxFloat32x2 a) +{ + FfxFloat32x2 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(2u)) + ffxBroadcast2(0x2F9A4E46u)); + FfxFloat32x2 b4 = b * b * b * b; + return b - b * (b4 - a) / (FfxFloat32(4.0) * b4); +} + +/// Compute a high accuracy approximate conversion from gamma2 to PQ space. +/// +/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do +/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear +/// (8th power and fast 8th root). The maximum error is approximately 0.2%. +/// +/// @param [in] a The gamma2 value to convert to PQ. +/// +/// @returns +/// The value a converted into PQ, computed as two nested square roots. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxApproximateGamma2ToPQHigh(FfxFloat32x2 a) +{ + return ffxSqrt(ffxSqrt(a)); +} + +/// Compute an approximate conversion from linear to PQ space. +/// +/// PQ is very close to x^(1/8).
The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between linear and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxApproximateLinearToPQ(FfxFloat32x2 a)
+{
+    // Fast 8th-root estimate done on the raw float bits: shift right by 3, then add a bias constant.
+    FfxUInt32x2 bits = ffxAsUInt32(a);
+    bits = (bits >> ffxBroadcast2(3u)) + ffxBroadcast2(0x378D8723u);
+    return ffxAsFloat(bits);
+}
+
+/// More accurate approximate conversion from linear to PQ space (two components).
+///
+/// Starts from the same bit-level 8th-root estimate as ffxApproximateLinearToPQ and
+/// refines it with one Newton-Raphson step for b^8 = a.
+///
+/// @param a The linear values to convert.
+///
+/// @returns
+/// The refined 8th-root estimate of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxApproximateLinearToPQMedium(FfxFloat32x2 a)
+{
+    FfxFloat32x2 estimate = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast2(3u)) + ffxBroadcast2(0x378D8723u));
+    FfxFloat32x2 estimate8 = estimate * estimate * estimate * estimate * estimate * estimate * estimate * estimate;
+    // Newton step: b - (b^8 - a) / (8 * b^7), written as b - b * (b^8 - a) / (8 * b^8).
+    FfxFloat32x2 correction = estimate * (estimate8 - a) / (FfxFloat32(8.0) * estimate8);
+    return estimate - correction;
+}
+
+/// Very accurate conversion from linear to PQ space (two components).
+///
+/// Evaluates a^(1/8) with three nested square roots instead of the bit-level estimate.
+///
+/// @param a The linear values to convert.
+///
+/// @returns
+/// The 8th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxApproximateLinearToPQHigh(FfxFloat32x2 a)
+{
+    FfxFloat32x2 squareRoot = ffxSqrt(a);
+    FfxFloat32x2 fourthRoot = ffxSqrt(squareRoot);
+    return ffxSqrt(fourthRoot);
+}
+
+/// Compute an approximate conversion from PQ to Gamma2 space.
+///
+/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between PQ and Gamma2.
+///
+/// @returns
+/// The value a converted into Gamma2.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximatePQToGamma2Medium(FfxFloat32x3 a)
+{
+    // Built left to right, i.e. ((a * a) * a) * a.
+    FfxFloat32x3 squared = a * a;
+    return squared * a * a;
+}
+
+/// Approximate conversion from PQ to linear space (three components).
+///
+/// With PQ approximated as x^(1/8), raising a PQ value to the 8th power yields
+/// the linear value.
+///
+/// @param a The PQ values to convert.
+///
+/// @returns
+/// The values of a raised to the 8th power.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximatePQToLinear(FfxFloat32x3 a)
+{
+    // Seven multiplications, accumulated left to right.
+    FfxFloat32x3 squared = a * a;
+    FfxFloat32x3 fourth = squared * a * a;
+    return fourth * a * a * a * a;
+}
+
+/// Approximate conversion from gamma2 to PQ space (three components).
+///
+/// Uses the fast 4th-root estimate: the float bit pattern is shifted right by 2
+/// and a bias constant is added.
+///
+/// @param a The gamma2 values to convert.
+///
+/// @returns
+/// The estimated 4th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximateGamma2ToPQ(FfxFloat32x3 a)
+{
+    FfxUInt32x3 bits = ffxAsUInt32(a);
+    bits = (bits >> ffxBroadcast3(2u)) + ffxBroadcast3(0x2F9A4E46u);
+    return ffxAsFloat(bits);
+}
+
+/// Compute a more accurate approximate conversion from gamma2 to PQ space.
+///
+/// PQ is very close to x^(1/8).
The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between gamma2 and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximateGamma2ToPQMedium(FfxFloat32x3 a)
+{
+    FfxFloat32x3 estimate = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(2u)) + ffxBroadcast3(0x2F9A4E46u));
+    FfxFloat32x3 estimate4 = estimate * estimate * estimate * estimate;
+    // Newton step: b - (b^4 - a) / (4 * b^3), written as b - b * (b^4 - a) / (4 * b^4).
+    FfxFloat32x3 correction = estimate * (estimate4 - a) / (FfxFloat32(4.0) * estimate4);
+    return estimate - correction;
+}
+
+/// High accuracy conversion from gamma2 to PQ space (three components).
+///
+/// Evaluates a^(1/4) with two nested square roots instead of the bit-level estimate.
+///
+/// @param a The gamma2 values to convert.
+///
+/// @returns
+/// The 4th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximateGamma2ToPQHigh(FfxFloat32x3 a)
+{
+    FfxFloat32x3 squareRoot = ffxSqrt(a);
+    return ffxSqrt(squareRoot);
+}
+
+/// Approximate conversion from linear to PQ space (three components).
+///
+/// Uses the fast 8th-root estimate: the float bit pattern is shifted right by 3
+/// and a bias constant is added.
+///
+/// @param a The linear values to convert.
+///
+/// @returns
+/// The estimated 8th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximateLinearToPQ(FfxFloat32x3 a)
+{
+    FfxUInt32x3 bits = ffxAsUInt32(a);
+    bits = (bits >> ffxBroadcast3(3u)) + ffxBroadcast3(0x378D8723u);
+    return ffxAsFloat(bits);
+}
+
+/// Compute a more accurate approximate conversion from linear to PQ space.
+///
+/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between linear and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximateLinearToPQMedium(FfxFloat32x3 a)
+{
+    FfxFloat32x3 estimate = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast3(3u)) + ffxBroadcast3(0x378D8723u));
+    FfxFloat32x3 estimate8 = estimate * estimate * estimate * estimate * estimate * estimate * estimate * estimate;
+    // Newton step: b - (b^8 - a) / (8 * b^7), written as b - b * (b^8 - a) / (8 * b^8).
+    FfxFloat32x3 correction = estimate * (estimate8 - a) / (FfxFloat32(8.0) * estimate8);
+    return estimate - correction;
+}
+
+/// Very accurate conversion from linear to PQ space (three components).
+///
+/// Evaluates a^(1/8) with three nested square roots instead of the bit-level estimate.
+///
+/// @param a The linear values to convert.
+///
+/// @returns
+/// The 8th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxApproximateLinearToPQHigh(FfxFloat32x3 a)
+{
+    FfxFloat32x3 squareRoot = ffxSqrt(a);
+    FfxFloat32x3 fourthRoot = ffxSqrt(squareRoot);
+    return ffxSqrt(fourthRoot);
+}
+
+/// Approximate conversion from PQ to Gamma2 space (four components).
+///
+/// With PQ approximated as x^(1/8), raising a PQ value to the 4th power yields
+/// the Gamma2 value.
+///
+/// @param a The PQ values to convert.
+///
+/// @returns
+/// The values of a raised to the 4th power.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximatePQToGamma2Medium(FfxFloat32x4 a)
+{
+    // Built left to right, i.e. ((a * a) * a) * a.
+    FfxFloat32x4 squared = a * a;
+    return squared * a * a;
+}
+
+/// Compute an approximate conversion from PQ to linear space.
+///
+/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between PQ and linear.
+///
+/// @returns
+/// The value a converted into linear.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximatePQToLinear(FfxFloat32x4 a)
+{
+    // Seven multiplications, accumulated left to right.
+    FfxFloat32x4 squared = a * a;
+    FfxFloat32x4 fourth = squared * a * a;
+    return fourth * a * a * a * a;
+}
+
+/// Approximate conversion from gamma2 to PQ space (four components).
+///
+/// Uses the fast 4th-root estimate: the float bit pattern is shifted right by 2
+/// and a bias constant is added.
+///
+/// @param a The gamma2 values to convert.
+///
+/// @returns
+/// The estimated 4th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximateGamma2ToPQ(FfxFloat32x4 a)
+{
+    FfxUInt32x4 bits = ffxAsUInt32(a);
+    bits = (bits >> ffxBroadcast4(2u)) + ffxBroadcast4(0x2F9A4E46u);
+    return ffxAsFloat(bits);
+}
+
+/// Compute a more accurate approximate conversion from gamma2 to PQ space.
+///
+/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between gamma2 and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximateGamma2ToPQMedium(FfxFloat32x4 a)
+{
+    // Bit-level 4th-root estimate: shift the float bits right by 2 and add a bias constant.
+    FfxFloat32x4 b = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(2u)) + ffxBroadcast4(0x2F9A4E46u));
+    // The Newton-Raphson step below solves b^4 = a, so it needs b^4 (four factors).
+    // The previous eight-factor product computed b^8, which disagreed with the
+    // two- and three-component overloads and produced a wrong correction term.
+    FfxFloat32x4 b4 = b * b * b * b;
+    // b - (b^4 - a) / (4 * b^3), written as b - b * (b^4 - a) / (4 * b^4).
+    return b - b * (b4 - a) / (FfxFloat32(4.0) * b4);
+}
+
+/// Compute a high accuracy approximate conversion from gamma2 to PQ space.
+///
+/// Evaluates a^(1/4) with two nested square roots instead of the bit-level estimate.
+///
+/// @param a The value to convert between gamma2 and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximateGamma2ToPQHigh(FfxFloat32x4 a)
+{
+    return ffxSqrt(ffxSqrt(a));
+}
+
+/// Compute an approximate conversion from linear to PQ space.
+///
+/// Uses the fast 8th-root estimate: the float bit pattern is shifted right by 3
+/// and a bias constant is added.
+///
+/// @param a The value to convert between linear and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximateLinearToPQ(FfxFloat32x4 a)
+{
+    return ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(3u)) + ffxBroadcast4(0x378D8723u));
+}
+
+/// Compute a more accurate approximate conversion from linear to PQ space.
+///
+/// PQ is very close to x^(1/8). The functions below Use the fast FfxFloat32 approximation method to do
+/// PQ conversions to and from Gamma2 (4th power and fast 4th root), and PQ to and from Linear
+/// (8th power and fast 8th root). The maximum error is approximately 0.2%.
+///
+/// @param a The value to convert between linear and PQ.
+///
+/// @returns
+/// The value a converted into PQ.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximateLinearToPQMedium(FfxFloat32x4 a)
+{
+    FfxFloat32x4 estimate = ffxAsFloat((ffxAsUInt32(a) >> ffxBroadcast4(3u)) + ffxBroadcast4(0x378D8723u));
+    FfxFloat32x4 estimate8 = estimate * estimate * estimate * estimate * estimate * estimate * estimate * estimate;
+    // Newton step: b - (b^8 - a) / (8 * b^7), written as b - b * (b^8 - a) / (8 * b^8).
+    FfxFloat32x4 correction = estimate * (estimate8 - a) / (FfxFloat32(8.0) * estimate8);
+    return estimate - correction;
+}
+
+/// Very accurate conversion from linear to PQ space (four components).
+///
+/// Evaluates a^(1/8) with three nested square roots instead of the bit-level estimate.
+///
+/// @param a The linear values to convert.
+///
+/// @returns
+/// The 8th root of a.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxApproximateLinearToPQHigh(FfxFloat32x4 a)
+{
+    FfxFloat32x4 squareRoot = ffxSqrt(a);
+    FfxFloat32x4 fourthRoot = ffxSqrt(squareRoot);
+    return ffxSqrt(fourthRoot);
+}
+
+// Parabolic approximation of sine.
+//
+// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range
+// is {-1/4 to 1/4} representing {-1 to 1}.
+//
+// @param [in] value The value to calculate approximate sine for.
+//
+// @returns
+// value * |value| - value.
+FfxFloat32 ffxParabolicSin(FfxFloat32 value)
+{
+    FfxFloat32 signedSquare = value * abs(value);
+    return signedSquare - value;
+}
+
+// Parabolic approximation of sine, evaluated per component.
+//
+// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range
+// is {-1/4 to 1/4} representing {-1 to 1}.
+//
+// @param [in] x The values to calculate approximate sine for.
+//
+// @returns
+// x * |x| - x for each component.
+FfxFloat32x2 ffxParabolicSin(FfxFloat32x2 x)
+{
+    FfxFloat32x2 signedSquare = x * abs(x);
+    return signedSquare - x;
+}
+
+// An approximation of cosine.
+//
+// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range
+// is {-1/4 to 1/4} representing {-1 to 1}.
+//
+// @param [in] value The value to calculate approximate cosine for.
+//
+// @returns
+// The approximate cosine of value.
+FfxFloat32 ffxParabolicCos(FfxFloat32 x)
+{
+    // Shift the phase, wrap it into [0, 1), then remap to [-1, 1) for the sine approximation.
+    FfxFloat32 wrapped = ffxFract(x * FfxFloat32(0.5) + FfxFloat32(0.75));
+    FfxFloat32 phase = wrapped * FfxFloat32(2.0) - FfxFloat32(1.0);
+    return ffxParabolicSin(phase);
+}
+
+// Parabolic approximation of cosine, evaluated per component.
+//
+// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range
+// is {-1/4 to 1/4} representing {-1 to 1}.
+//
+// @param [in] x The values to calculate approximate cosine for.
+//
+// @returns
+// The approximate cosine of each component of x.
+FfxFloat32x2 ffxParabolicCos(FfxFloat32x2 x)
+{
+    // Shift the phase, wrap it into [0, 1), then remap to [-1, 1) for the sine approximation.
+    FfxFloat32x2 wrapped = ffxFract(x * ffxBroadcast2(0.5f) + ffxBroadcast2(0.75f));
+    FfxFloat32x2 phase = wrapped * ffxBroadcast2(2.0f) - ffxBroadcast2(1.0f);
+    return ffxParabolicSin(phase);
+}
+
+// Parabolic approximation of both sine and cosine of one value.
+//
+// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range
+// is {-1/4 to 1/4} representing {-1 to 1}.
+//
+// @param [in] x The value to calculate approximate sine and cosine for.
+//
+// @returns
+// A FfxFloat32x2 holding the sine approximation in .x and the cosine approximation in .y.
+FfxFloat32x2 ffxParabolicSinCos(FfxFloat32 x)
+{
+    // The cosine phase is derived exactly as in ffxParabolicCos; both phases then go through one vector sine call.
+    FfxFloat32 cosPhase = ffxFract(x * FfxFloat32(0.5) + FfxFloat32(0.75));
+    cosPhase = cosPhase * FfxFloat32(2.0) - FfxFloat32(1.0);
+    FfxFloat32x2 phases = FfxFloat32x2(x, cosPhase);
+    return ffxParabolicSin(phases);
+}
+
+/// Branch-free logical AND of two values that are each 0 or 1.
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+///
+/// @returns
+/// min(x, y), which equals x AND y for 0/1 inputs.
+///
+/// @ingroup GPUCore
+FfxUInt32 ffxZeroOneAnd(FfxUInt32 x, FfxUInt32 y)
+{
+    FfxUInt32 both = min(x, y);
+    return both;
+}
+
+/// Conditional free logic AND operation using two values.
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+///
+/// @returns
+/// Result of the AND operation.
+///
+/// @ingroup GPUCore
+FfxUInt32x2 ffxZeroOneAnd(FfxUInt32x2 x, FfxUInt32x2 y)
+{
+    // For 0/1 inputs the component-wise minimum is the logical AND.
+    FfxUInt32x2 both = min(x, y);
+    return both;
+}
+
+/// Branch-free logical AND of two 0/1 values (three components).
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+///
+/// @returns
+/// min(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x3 ffxZeroOneAnd(FfxUInt32x3 x, FfxUInt32x3 y)
+{
+    FfxUInt32x3 both = min(x, y);
+    return both;
+}
+
+/// Branch-free logical AND of two 0/1 values (four components).
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+///
+/// @returns
+/// min(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x4 ffxZeroOneAnd(FfxUInt32x4 x, FfxUInt32x4 y)
+{
+    FfxUInt32x4 both = min(x, y);
+    return both;
+}
+
+/// Branch-free logical NOT of a single 0/1 value.
+///
+/// Despite its name, this one-argument overload of ffxZeroOneAnd is a NOT.
+///
+/// @param [in] x The value to be fed into the NOT operator.
+///
+/// @returns
+/// x with its lowest bit flipped (x XOR 1).
+///
+/// @ingroup GPUCore
+FfxUInt32 ffxZeroOneAnd(FfxUInt32 x)
+{
+    FfxUInt32 flipMask = FfxUInt32(1);
+    return x ^ flipMask;
+}
+
+/// Branch-free logical NOT of 0/1 values (two components).
+///
+/// Despite its name, this one-argument overload of ffxZeroOneAnd is a NOT.
+///
+/// @param [in] x The values to be fed into the NOT operator.
+///
+/// @returns
+/// x XOR 1 per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x2 ffxZeroOneAnd(FfxUInt32x2 x)
+{
+    FfxUInt32x2 flipMask = ffxBroadcast2(1u);
+    return x ^ flipMask;
+}
+
+/// Branch-free logical NOT of 0/1 values (three components).
+///
+/// Despite its name, this one-argument overload of ffxZeroOneAnd is a NOT.
+///
+/// @param [in] x The values to be fed into the NOT operator.
+///
+/// @returns
+/// x XOR 1 per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x3 ffxZeroOneAnd(FfxUInt32x3 x)
+{
+    FfxUInt32x3 flipMask = ffxBroadcast3(1u);
+    return x ^ flipMask;
+}
+
+/// Conditional free logic NOT operation using two values.
+///
+/// @param [in] x The first value to be fed into the NOT operator.
+///
+/// @returns
+/// Result of the NOT operation.
+///
+/// @ingroup GPUCore
+FfxUInt32x4 ffxZeroOneAnd(FfxUInt32x4 x)
+{
+    // Flipping the lowest bit turns 0 into 1 and 1 into 0.
+    FfxUInt32x4 flipMask = ffxBroadcast4(1u);
+    return x ^ flipMask;
+}
+
+/// Branch-free logical OR of two values that are each 0 or 1.
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y), which equals x OR y for 0/1 inputs.
+///
+/// @ingroup GPUCore
+FfxUInt32 ffxZeroOneOr(FfxUInt32 x, FfxUInt32 y)
+{
+    FfxUInt32 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical OR of two 0/1 values (two components).
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x2 ffxZeroOneOr(FfxUInt32x2 x, FfxUInt32x2 y)
+{
+    FfxUInt32x2 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical OR of two 0/1 values (three components).
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x3 ffxZeroOneOr(FfxUInt32x3 x, FfxUInt32x3 y)
+{
+    FfxUInt32x3 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical OR of two 0/1 values (four components).
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxUInt32x4 ffxZeroOneOr(FfxUInt32x4 x, FfxUInt32x4 y)
+{
+    FfxUInt32x4 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical NOT of a floating-point 0/1 value, returned as an unsigned integer.
+///
+/// @param [in] x The value to be fed into the NOT operator.
+///
+/// @returns
+/// (1.0 - x) converted to FfxUInt32.
+///
+/// @ingroup GPUCore
+FfxUInt32 ffxZeroOneAndToU1(FfxFloat32 x)
+{
+    FfxFloat32 inverted = FfxFloat32(1.0) - x;
+    return FfxUInt32(inverted);
+}
+
+/// Conditional free logic signed NOT operation using two half-precision FfxFloat32 values.
+///
+/// @param [in] x The first value to be fed into the AND OR operator.
+///
+/// @returns
+/// Result of the AND OR operation.
+///
+/// @ingroup GPUCore
+FfxUInt32x2 ffxZeroOneAndToU2(FfxFloat32x2 x)
+{
+    // 1.0 - x inverts a 0/1 value; the conversion then yields the unsigned result.
+    FfxFloat32x2 inverted = ffxBroadcast2(1.0) - x;
+    return FfxUInt32x2(inverted);
+}
+
+/// Branch-free logical NOT of floating-point 0/1 values, returned as unsigned integers (three components).
+///
+/// @param [in] x The values to be fed into the NOT operator.
+///
+/// @returns
+/// (1.0 - x) converted to FfxUInt32x3.
+///
+/// @ingroup GPUCore
+FfxUInt32x3 ffxZeroOneAndToU3(FfxFloat32x3 x)
+{
+    FfxFloat32x3 inverted = ffxBroadcast3(1.0) - x;
+    return FfxUInt32x3(inverted);
+}
+
+/// Branch-free logical NOT of floating-point 0/1 values, returned as unsigned integers (four components).
+///
+/// @param [in] x The values to be fed into the NOT operator.
+///
+/// @returns
+/// (1.0 - x) converted to FfxUInt32x4.
+///
+/// @ingroup GPUCore
+FfxUInt32x4 ffxZeroOneAndToU4(FfxFloat32x4 x)
+{
+    FfxFloat32x4 inverted = ffxBroadcast4(1.0) - x;
+    return FfxUInt32x4(inverted);
+}
+
+/// Branch-free logical (x AND y) OR z for floating-point 0/1 values.
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+/// @param [in] z The value OR-ed with the result of the AND.
+///
+/// @returns
+/// x * y + z saturated to the range 0 to 1.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxZeroOneAndOr(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z)
+{
+    FfxFloat32 combined = x * y + z;
+    return ffxSaturate(combined);
+}
+
+/// Conditional free logic AND operation using two values followed by a NOT operation
+/// using the resulting value and a third value.
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+/// @param [in] z The second value to be fed into the OR operator.
+///
+/// @returns
+/// Result of the AND OR operation.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxZeroOneAndOr(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z)
+{
+    // The product acts as AND, the sum as OR; saturation keeps the result within 0 to 1.
+    FfxFloat32x2 combined = x * y + z;
+    return ffxSaturate(combined);
+}
+
+/// Branch-free logical (x AND y) OR z for floating-point 0/1 values (three components).
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+/// @param [in] z The value OR-ed with the result of the AND.
+///
+/// @returns
+/// x * y + z saturated to the range 0 to 1, per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxZeroOneAndOr(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z)
+{
+    FfxFloat32x3 combined = x * y + z;
+    return ffxSaturate(combined);
+}
+
+/// Branch-free logical (x AND y) OR z for floating-point 0/1 values (four components).
+///
+/// @param [in] x The first value to be fed into the AND operator.
+/// @param [in] y The second value to be fed into the AND operator.
+/// @param [in] z The value OR-ed with the result of the AND.
+///
+/// @returns
+/// x * y + z saturated to the range 0 to 1, per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxZeroOneAndOr(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z)
+{
+    FfxFloat32x4 combined = x * y + z;
+    return ffxSaturate(combined);
+}
+
+/// Given a value, returns 1.0 if greater than zero and 0.0 if not.
+///
+/// The value is multiplied by FFX_POSITIVE_INFINITY_FLOAT and the product is saturated.
+/// NOTE(review): for x == 0 the product is 0 * infinity; the result then depends on
+/// how ffxSaturate treats NaN - confirm on the target compilers.
+///
+/// @param [in] x The value to be compared.
+///
+/// @returns
+/// Result of the greater than zero comparison.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxZeroOneIsGreaterThanZero(FfxFloat32 x)
+{
+    FfxFloat32 scaled = x * FfxFloat32(FFX_POSITIVE_INFINITY_FLOAT);
+    return ffxSaturate(scaled);
+}
+
+/// Given a value, returns 1.0 if greater than zero and 0.0 if not.
+///
+/// @param [in] x The value to be compared.
+///
+/// @returns
+/// Result of the greater than zero comparison.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxZeroOneIsGreaterThanZero(FfxFloat32x2 x)
+{
+    // Scale by the positive-infinity constant, then saturate into the range 0 to 1.
+    FfxFloat32x2 scaled = x * ffxBroadcast2(FFX_POSITIVE_INFINITY_FLOAT);
+    return ffxSaturate(scaled);
+}
+
+/// Given a value, returns 1.0 if greater than zero and 0.0 if not (three components).
+///
+/// @param [in] x The values to be compared.
+///
+/// @returns
+/// Result of the greater than zero comparison, per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxZeroOneIsGreaterThanZero(FfxFloat32x3 x)
+{
+    FfxFloat32x3 scaled = x * ffxBroadcast3(FFX_POSITIVE_INFINITY_FLOAT);
+    return ffxSaturate(scaled);
+}
+
+/// Given a value, returns 1.0 if greater than zero and 0.0 if not (four components).
+///
+/// @param [in] x The values to be compared.
+///
+/// @returns
+/// Result of the greater than zero comparison, per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxZeroOneIsGreaterThanZero(FfxFloat32x4 x)
+{
+    FfxFloat32x4 scaled = x * ffxBroadcast4(FFX_POSITIVE_INFINITY_FLOAT);
+    return ffxSaturate(scaled);
+}
+
+/// Branch-free logical NOT of a floating-point 0/1 value.
+///
+/// Despite its name, this one-argument overload of ffxZeroOneAnd is a NOT.
+///
+/// @param [in] x The value to be fed into the NOT operator.
+///
+/// @returns
+/// 1.0 - x.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxZeroOneAnd(FfxFloat32 x)
+{
+    FfxFloat32 one = FfxFloat32(1.0);
+    return one - x;
+}
+
+/// Branch-free logical NOT of floating-point 0/1 values (two components).
+///
+/// Despite its name, this one-argument overload of ffxZeroOneAnd is a NOT.
+///
+/// @param [in] x The values to be fed into the NOT operator.
+///
+/// @returns
+/// 1.0 - x per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxZeroOneAnd(FfxFloat32x2 x)
+{
+    FfxFloat32x2 ones = ffxBroadcast2(1.0);
+    return ones - x;
+}
+
+/// Branch-free logical NOT of floating-point 0/1 values (three components).
+///
+/// Despite its name, this one-argument overload of ffxZeroOneAnd is a NOT.
+///
+/// @param [in] x The values to be fed into the NOT operator.
+///
+/// @returns
+/// 1.0 - x per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxZeroOneAnd(FfxFloat32x3 x)
+{
+    FfxFloat32x3 ones = ffxBroadcast3(1.0);
+    return ones - x;
+}
+
+/// Conditional free logic signed NOT operation using two FfxFloat32 values.
+///
+/// @param [in] x The first value to be fed into the AND OR operator.
+///
+/// @returns
+/// Result of the AND OR operation.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxZeroOneAnd(FfxFloat32x4 x)
+{
+    // 1.0 - x maps 0 to 1 and 1 to 0.
+    FfxFloat32x4 ones = ffxBroadcast4(1.0);
+    return ones - x;
+}
+
+/// Branch-free logical OR of two floating-point 0/1 values.
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y), which equals x OR y for 0/1 inputs.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxZeroOneOr(FfxFloat32 x, FfxFloat32 y)
+{
+    FfxFloat32 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical OR of two floating-point 0/1 values (two components).
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxZeroOneOr(FfxFloat32x2 x, FfxFloat32x2 y)
+{
+    FfxFloat32x2 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical OR of two floating-point 0/1 values (three components).
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxZeroOneOr(FfxFloat32x3 x, FfxFloat32x3 y)
+{
+    FfxFloat32x3 either = max(x, y);
+    return either;
+}
+
+/// Branch-free logical OR of two floating-point 0/1 values (four components).
+///
+/// @param [in] x The first value to be fed into the OR operator.
+/// @param [in] y The second value to be fed into the OR operator.
+///
+/// @returns
+/// max(x, y) per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxZeroOneOr(FfxFloat32x4 x, FfxFloat32x4 y)
+{
+    FfxFloat32x4 either = max(x, y);
+    return either;
+}
+
+/// Choose between two FfxFloat32 values if the first paramter is greater than zero.
+///
+/// @param [in] x The value to compare against zero.
+/// @param [in] y The value to return if the comparision is greater than zero.
+/// @param [in] z The value to return if the comparision is less than or equal to zero.
+///
+/// @returns
+/// The selected value.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxZeroOneSelect(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z)
+{
+    // Arithmetic blend x * y + (1 - x) * z, with the second term written as (-x) * z + z.
+    return x * y + ((-x) * z + z);
+}
+
+/// Branch-free select between two values using a 0/1 selector (two components).
+///
+/// @param [in] x The selector; 1 picks y and 0 picks z.
+/// @param [in] y The value returned where the selector is 1.
+/// @param [in] z The value returned where the selector is 0.
+///
+/// @returns
+/// x * y + ((-x) * z + z) per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxZeroOneSelect(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z)
+{
+    return x * y + ((-x) * z + z);
+}
+
+/// Branch-free select between two values using a 0/1 selector (three components).
+///
+/// @param [in] x The selector; 1 picks y and 0 picks z.
+/// @param [in] y The value returned where the selector is 1.
+/// @param [in] z The value returned where the selector is 0.
+///
+/// @returns
+/// x * y + ((-x) * z + z) per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxZeroOneSelect(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z)
+{
+    return x * y + ((-x) * z + z);
+}
+
+/// Branch-free select between two values using a 0/1 selector (four components).
+///
+/// @param [in] x The selector; 1 picks y and 0 picks z.
+/// @param [in] y The value returned where the selector is 1.
+/// @param [in] z The value returned where the selector is 0.
+///
+/// @returns
+/// x * y + ((-x) * z + z) per component.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxZeroOneSelect(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z)
+{
+    return x * y + ((-x) * z + z);
+}
+
+/// Given a value, returns 1.0 if less than zero and 0.0 if not.
+///
+/// @param [in] x The value to be compared.
+///
+/// @returns
+/// Result of the sign value.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxZeroOneIsSigned(FfxFloat32 x)
+{
+    // Scale by the negative-infinity constant, then saturate into the range 0 to 1.
+    return ffxSaturate(x * FfxFloat32(FFX_NEGATIVE_INFINITY_FLOAT));
+}
+
+/// Given a value, returns 1.0 if less than zero and 0.0 if not.
+///
+/// @param [in] x The value to be compared.
+///
+/// @returns
+/// 1.0 for negative components, 0.0 otherwise.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxZeroOneIsSigned(FfxFloat32x2 x)
+{
+    return ffxSaturate(x * ffxBroadcast2(FFX_NEGATIVE_INFINITY_FLOAT));
+}
+
+/// Given a value, returns 1.0 if less than zero and 0.0 if not.
+///
+/// @param [in] x The value to be compared.
+///
+/// @returns
+/// 1.0 for negative components, 0.0 otherwise.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxZeroOneIsSigned(FfxFloat32x3 x)
+{
+    return ffxSaturate(x * ffxBroadcast3(FFX_NEGATIVE_INFINITY_FLOAT));
+}
+
+/// Given a value, returns 1.0 if less than zero and 0.0 if not.
+///
+/// @param [in] x The value to be compared.
+///
+/// @returns
+/// 1.0 for negative components, 0.0 otherwise.
+///
+/// @ingroup GPUCore
+FfxFloat32x4 ffxZeroOneIsSigned(FfxFloat32x4 x)
+{
+    return ffxSaturate(x * ffxBroadcast4(FFX_NEGATIVE_INFINITY_FLOAT));
+}
+
+/// Encode a linear value with the Rec.709 transfer function.
+///
+/// Rec.709 is used for some HDTVs.
+///
+/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+///  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+///  (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+///
+/// The result is the median of the breakpoint (0.081), the linear segment (4.5 * color)
+/// and the power curve (1.099 * color^0.45 - 0.099), which selects the linear segment
+/// below the breakpoint and the curve above it.
+///
+/// @param [in] color The linear color to convert to Rec. 709.
+///
+/// @returns
+/// The color encoded in Rec.709.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxRec709FromLinear(FfxFloat32 color)
+{
+    FfxFloat32x3 j = FfxFloat32x3(0.018 * 4.5, 4.5, 0.45);
+    FfxFloat32x2 k = FfxFloat32x2(1.099, -0.099);
+    FfxFloat32 linearSegment = color * j.y;
+    FfxFloat32 curveSegment = pow(color, j.z) * k.x + k.y;
+    // The former clamp(j.x, linearSegment, curveSegment) passed min > max for almost every
+    // input, which is undefined and only gives the median where clamp lowers to a med3
+    // instruction. Spell the median out with min/max so every target agrees.
+    return max(min(j.x, linearSegment), min(max(j.x, linearSegment), curveSegment));
+}
+
+/// Compute a Rec.709 color space.
+///
+/// Rec.709 is used for some HDTVs.
+///
+/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+///  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+///  (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+///
+/// @param [in] color The linear color to convert to Rec. 709.
+///
+/// @returns
+/// The color encoded in Rec.709.
+///
+/// @ingroup GPUCore
+FfxFloat32x2 ffxRec709FromLinear(FfxFloat32x2 color)
+{
+    FfxFloat32x3 j = FfxFloat32x3(0.018 * 4.5, 4.5, 0.45);
+    FfxFloat32x2 k = FfxFloat32x2(1.099, -0.099);
+    FfxFloat32x2 linearSegment = color * j.yy;
+    FfxFloat32x2 curveSegment = pow(color, j.zz) * k.xx + k.yy;
+    // The former clamp(j.xx, linearSegment, curveSegment) passed min > max for almost every
+    // input, which is undefined and only gives the median where clamp lowers to a med3
+    // instruction. Spell the median out with min/max so every target agrees.
+    return max(min(j.xx, linearSegment), min(max(j.xx, linearSegment), curveSegment));
+}
+
+/// Encode a linear value with the Rec.709 transfer function (three components).
+///
+/// Rec.709 is used for some HDTVs.
+///
+/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times.
+///  (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range).
+///  (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range).
+///
+/// The result is the per-component median of the breakpoint (0.081), the linear segment
+/// (4.5 * color) and the power curve (1.099 * color^0.45 - 0.099).
+///
+/// @param [in] color The linear color to convert to Rec. 709.
+///
+/// @returns
+/// The color encoded in Rec.709.
+///
+/// @ingroup GPUCore
+FfxFloat32x3 ffxRec709FromLinear(FfxFloat32x3 color)
+{
+    FfxFloat32x3 j = FfxFloat32x3(0.018 * 4.5, 4.5, 0.45);
+    FfxFloat32x2 k = FfxFloat32x2(1.099, -0.099);
+    FfxFloat32x3 linearSegment = color * j.yyy;
+    FfxFloat32x3 curveSegment = pow(color, j.zzz) * k.xxx + k.yyy;
+    // Explicit median instead of clamp() with min > max, which is undefined.
+    return max(min(j.xxx, linearSegment), min(max(j.xxx, linearSegment), curveSegment));
+}
+
+/// Compute a linear value from a REC.709 value.
+///
+/// Encoded values below the breakpoint 0.081 are divided by 4.5; all others go through
+/// the inverse power curve ((color + 0.099) / 1.099)^(1 / 0.45).
+///
+/// @param [in] color The value to convert to linear from REC.709.
+///
+/// @returns
+/// A value in linear space.
+///
+/// @ingroup GPUCore
+FfxFloat32 ffxLinearFromRec709(FfxFloat32 color)
+{
+    // j.x is the breakpoint in the ENCODED domain (0.081 = 4.5 * 0.018). The previous
+    // value 0.081 / 4.5 is the linear-domain breakpoint; comparing the encoded input
+    // against it switched segments too early and left a jump in the output at 0.018.
+    FfxFloat32x3 j = FfxFloat32x3(0.081, 1.0 / 4.5, 1.0 / 0.45);
+    FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.099, 0.099 / 1.099);
+    return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.x), color * j.y, pow(color * k.x + k.y, j.z));
+}
+
+/// Compute a linear value from a REC.709 value.
+/// +/// @param [in] color The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxLinearFromRec709(FfxFloat32x2 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.xx), color * j.yy, pow(color * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] color The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxLinearFromRec709(FfxFloat32x3 color) +{ + FfxFloat32x3 j = FfxFloat32x3(0.081 / 4.5, 1.0 / 4.5, 1.0 / 0.45); + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelect(ffxZeroOneIsSigned(color - j.xxx), color * j.yyy, pow(color * k.xxx + k.yyy, j.zzz)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGamma. +/// +/// @param [in] value The value to convert to gamma space from linear. +/// @param [in] power The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPUCore +FfxFloat32 ffxGammaFromLinear(FfxFloat32 value, FfxFloat32 power) +{ + return pow(value, FfxFloat32(power)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGamma. +/// +/// @param [in] value The value to convert to gamma space from linear. +/// @param [in] power The reciprocal of power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space. 
+/// +/// @ingroup GPUCore +FfxFloat32x2 ffxGammaFromLinear(FfxFloat32x2 value, FfxFloat32 power) +{ + return pow(value, ffxBroadcast2(power)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'power' is '1/x', where the 'x' is what would be used in ffxLinearFromGamma. +/// +/// @param [in] value The value to convert to gamma space from linear. +/// @param [in] power The reciprocal of the power value used for the gamma curve (the exponent passed to pow). +/// +/// @returns +/// A value in gamma space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxGammaFromLinear(FfxFloat32x3 value, FfxFloat32 power) +{ + return pow(value, ffxBroadcast3(power)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] color The gamma-space value to convert to linear. +/// @param [in] power The power value used for the gamma curve (the exponent passed to pow). +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32 ffxLinearFromGamma(FfxFloat32 color, FfxFloat32 power) +{ + return pow(color, FfxFloat32(power)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] color The gamma-space value to convert to linear. +/// @param [in] power The power value used for the gamma curve (the exponent passed to pow). +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxLinearFromGamma(FfxFloat32x2 color, FfxFloat32 power) +{ + return pow(color, ffxBroadcast2(power)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] color The gamma-space value to convert to linear. +/// @param [in] power The power value used for the gamma curve (the exponent passed to pow).
+/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxLinearFromGamma(FfxFloat32x3 color, FfxFloat32 power) +{ + return pow(color, ffxBroadcast3(power)); +} + +/// Compute a PQ value from a linear value. +/// +/// @param [in] value The value to convert to PQ from linear. +/// +/// @returns +/// A value in PQ space. +/// +/// @ingroup GPUCore +FfxFloat32 ffxPQFromLinear(FfxFloat32 value) +{ + FfxFloat32 p = pow(value, FfxFloat32(0.159302)); + return pow((FfxFloat32(0.835938) + FfxFloat32(18.8516) * p) / (FfxFloat32(1.0) + FfxFloat32(18.6875) * p), FfxFloat32(78.8438)); +} + +/// Compute a PQ value from a linear value. +/// +/// @param [in] value The value to convert to PQ from linear. +/// +/// @returns +/// A value in PQ space. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxPQFromLinear(FfxFloat32x2 value) +{ + FfxFloat32x2 p = pow(value, ffxBroadcast2(0.159302)); + return pow((ffxBroadcast2(0.835938) + ffxBroadcast2(18.8516) * p) / (ffxBroadcast2(1.0) + ffxBroadcast2(18.6875) * p), ffxBroadcast2(78.8438)); +} + +/// Compute a PQ value from a linear value. +/// +/// @param [in] value The value to convert to PQ from linear. +/// +/// @returns +/// A value in PQ space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxPQFromLinear(FfxFloat32x3 value) +{ + FfxFloat32x3 p = pow(value, ffxBroadcast3(0.159302)); + return pow((ffxBroadcast3(0.835938) + ffxBroadcast3(18.8516) * p) / (ffxBroadcast3(1.0) + ffxBroadcast3(18.6875) * p), ffxBroadcast3(78.8438)); +} + +/// Compute a linear value from a value in a PQ space. +/// +/// Inverts the curve applied by ffxPQFromLinear; the numerator is saturated before the division. +/// +/// @param [in] value The value to convert to linear in PQ space. +/// +/// @returns +/// A value in linear space.
+/// +/// @ingroup GPUCore +FfxFloat32 ffxLinearFromPQ(FfxFloat32 value) +{ + FfxFloat32 p = pow(value, FfxFloat32(0.0126833)); + return pow(ffxSaturate(p - FfxFloat32(0.835938)) / (FfxFloat32(18.8516) - FfxFloat32(18.6875) * p), FfxFloat32(6.27739)); +} + +/// Compute a linear value from a value in a PQ space. +/// +/// Inverts the curve applied by ffxPQFromLinear; the numerator is saturated before the division. +/// +/// @param [in] value The value to convert to linear in PQ space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxLinearFromPQ(FfxFloat32x2 value) +{ + FfxFloat32x2 p = pow(value, ffxBroadcast2(0.0126833)); + return pow(ffxSaturate(p - ffxBroadcast2(0.835938)) / (ffxBroadcast2(18.8516) - ffxBroadcast2(18.6875) * p), ffxBroadcast2(6.27739)); +} + +/// Compute a linear value from a value in a PQ space. +/// +/// Inverts the curve applied by ffxPQFromLinear; the numerator is saturated before the division. +/// +/// @param [in] value The value to convert to linear in PQ space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxLinearFromPQ(FfxFloat32x3 value) +{ + FfxFloat32x3 p = pow(value, ffxBroadcast3(0.0126833)); + return pow(ffxSaturate(p - ffxBroadcast3(0.835938)) / (ffxBroadcast3(18.8516) - ffxBroadcast3(18.6875) * p), ffxBroadcast3(6.27739)); +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] value The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPUCore +FfxFloat32 ffxSrgbFromLinear(FfxFloat32 value) +{ + FfxFloat32x3 j = FfxFloat32x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.055, -0.055); + return max(min(j.x, value * j.y), min(max(j.x, value * j.y), pow(value, j.z) * k.x + k.y)); // Median of threshold, linear and curved values; clamp() is undefined when its min exceeds its max. +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] value The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space.
+/// +/// @ingroup GPUCore +FfxFloat32x2 ffxSrgbFromLinear(FfxFloat32x2 value) +{ + FfxFloat32x3 j = FfxFloat32x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.055, -0.055); + return max(min(j.xx, value * j.yy), min(max(j.xx, value * j.yy), pow(value, j.zz) * k.xx + k.yy)); // Median of threshold, linear and curved values; clamp() is undefined when its min exceeds its max. +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] value The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxSrgbFromLinear(FfxFloat32x3 value) +{ + FfxFloat32x3 j = FfxFloat32x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat32x2 k = FfxFloat32x2(1.055, -0.055); + return max(min(j.xxx, value * j.yyy), min(max(j.xxx, value * j.yyy), pow(value, j.zzz) * k.xxx + k.yyy)); // Median of threshold, linear and curved values; clamp() is undefined when its min exceeds its max. +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Inputs below the encoded threshold (0.04045) use the linear segment, all others the power curve. +/// +/// @param [in] value The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32 ffxLinearFromSrgb(FfxFloat32 value) +{ + FfxFloat32x3 j = FfxFloat32x3(0.04045, 1.0 / 12.92, 2.4); // j.x is the segment threshold in the encoded (sRGB) domain, the domain of 'value'. + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelect(ffxZeroOneIsSigned(value - j.x), value * j.y, pow(value * k.x + k.y, j.z)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Inputs below the encoded threshold (0.04045) use the linear segment, all others the power curve. +/// +/// @param [in] value The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x2 ffxLinearFromSrgb(FfxFloat32x2 value) +{ + FfxFloat32x3 j = FfxFloat32x3(0.04045, 1.0 / 12.92, 2.4); // j.x is the segment threshold in the encoded (sRGB) domain, the domain of 'value'. + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelect(ffxZeroOneIsSigned(value - j.xx), value * j.yy, pow(value * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a value in a SRGB space.
+/// +/// Inputs below the encoded threshold (0.04045) use the linear segment, all others the power curve. +/// +/// @param [in] value The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat32x3 ffxLinearFromSrgb(FfxFloat32x3 value) +{ + FfxFloat32x3 j = FfxFloat32x3(0.04045, 1.0 / 12.92, 2.4); // j.x is the segment threshold in the encoded (sRGB) domain, the domain of 'value'. + FfxFloat32x2 k = FfxFloat32x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelect(ffxZeroOneIsSigned(value - j.xxx), value * j.yyy, pow(value * k.xxx + k.yyy, j.zzz)); +} + +/// A remapping of 64x1 to 8x8 imposing rotated 2x2 pixel quads in quad linear. +/// +/// Remap illustration: +/// +/// 543210 +/// ~~~~~~ +/// ..xxx. +/// yy...y +/// +/// @param [in] a The input 1D coordinates to remap. +/// +/// @returns +/// The remapped 2D coordinates. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxRemapForQuad(FfxUInt32 a) +{ + return FfxUInt32x2(ffxBitfieldExtract(a, 1u, 3u), ffxBitfieldInsertMask(ffxBitfieldExtract(a, 3u, 3u), a, 1u)); +} + +/// A helper function performing a 64x1 to 8x8 remapping, which is necessary for 2D wave reductions. +/// +/// The 64-wide lane indices to 8x8 remapping is performed as follows: +/// +/// 00 01 08 09 10 11 18 19 +/// 02 03 0a 0b 12 13 1a 1b +/// 04 05 0c 0d 14 15 1c 1d +/// 06 07 0e 0f 16 17 1e 1f +/// 20 21 28 29 30 31 38 39 +/// 22 23 2a 2b 32 33 3a 3b +/// 24 25 2c 2d 34 35 3c 3d +/// 26 27 2e 2f 36 37 3e 3f +/// +/// @param [in] a The input 1D coordinate to remap. +/// +/// @returns +/// The remapped 2D coordinates.
+/// +/// @ingroup GPUCore +FfxUInt32x2 ffxRemapForWaveReduction(FfxUInt32 a) +{ + return FfxUInt32x2(ffxBitfieldInsertMask(ffxBitfieldExtract(a, 2u, 3u), a, 1u), ffxBitfieldInsertMask(ffxBitfieldExtract(a, 3u, 3u), ffxBitfieldExtract(a, 1u, 2u), 2u)); +} diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common.h.meta new file mode 100644 index 0000000..23d087a --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common.h.meta @@ -0,0 +1,27 @@ +fileFormatVersion: 2 +guid: eead56bcc49293d4385163969a9a0307 +PluginImporter: + externalObjects: {} + serializedVersion: 2 + iconMap: {} + executionOrder: {} + defineConstraints: [] + isPreloaded: 0 + isOverridable: 1 + isExplicitlyReferenced: 0 + validateReferences: 1 + platformData: + - first: + Any: + second: + enabled: 1 + settings: {} + - first: + Editor: Editor + second: + enabled: 0 + settings: + DefaultValueInitialized: true + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common_half.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common_half.h new file mode 100644 index 0000000..1cb780b --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common_half.h @@ -0,0 +1,2981 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (C) 2024 Advanced Micro Devices, Inc.
+// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#if FFX_HALF +#if FFX_HLSL_SM >= 62 +/// A define value for 16bit positive infinity (bit pattern 0x7c00). +/// +/// @ingroup GPUCore +#define FFX_POSITIVE_INFINITY_HALF FFX_TO_FLOAT16((uint16_t)0x7c00u) + +/// A define value for 16bit negative infinity (bit pattern 0xfc00). +/// +/// @ingroup GPUCore +#define FFX_NEGATIVE_INFINITY_HALF FFX_TO_FLOAT16((uint16_t)0xfc00u) +#else +/// A define value for 16bit positive infinity (bit pattern 0x7c00). +/// +/// @ingroup GPUCore +#define FFX_POSITIVE_INFINITY_HALF FFX_TO_FLOAT16(0x7c00u) + +/// A define value for 16bit negative infinity (bit pattern 0xfc00). +/// +/// @ingroup GPUCore +#define FFX_NEGATIVE_INFINITY_HALF FFX_TO_FLOAT16(0xfc00u) +#endif // #if FFX_HLSL_SM>=62 + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values.
+/// +/// @ingroup GPUCore +FfxFloat16 ffxMin(FfxFloat16 x, FfxFloat16 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxMin(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxMin(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxMin(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxInt16 ffxMin(FfxInt16 x, FfxInt16 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxInt16x2 ffxMin(FfxInt16x2 x, FfxInt16x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values.
+/// +/// @ingroup GPUCore +FfxInt16x3 ffxMin(FfxInt16x3 x, FfxInt16x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxInt16x4 ffxMin(FfxInt16x4 x, FfxInt16x4 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16 ffxMin(FfxUInt16 x, FfxUInt16 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxMin(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16x3 ffxMin(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return min(x, y); +} + +/// Compute the min of two values. +/// +/// @param [in] x The first value to compute the min of. +/// @param [in] y The second value to compute the min of. +/// +/// @returns +/// The lowest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16x4 ffxMin(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return min(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values.
+/// +/// @ingroup GPUCore +FfxFloat16 ffxMax(FfxFloat16 x, FfxFloat16 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxMax(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxMax(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxMax(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxInt16 ffxMax(FfxInt16 x, FfxInt16 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxInt16x2 ffxMax(FfxInt16x2 x, FfxInt16x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values.
+/// +/// @ingroup GPUCore +FfxInt16x3 ffxMax(FfxInt16x3 x, FfxInt16x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxInt16x4 ffxMax(FfxInt16x4 x, FfxInt16x4 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16 ffxMax(FfxUInt16 x, FfxUInt16 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxMax(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16x3 ffxMax(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return max(x, y); +} + +/// Compute the max of two values. +/// +/// @param [in] x The first value to compute the max of. +/// @param [in] y The second value to compute the max of. +/// +/// @returns +/// The highest of the two values. +/// +/// @ingroup GPUCore +FfxUInt16x4 ffxMax(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return max(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second.
+/// +/// @ingroup GPUCore +FfxFloat16 ffxPow(FfxFloat16 x, FfxFloat16 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPow(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxPow(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return pow(x, y); +} + +/// Compute the value of the first parameter raised to the power of the second. +/// +/// @param [in] x The value to raise to the power y. +/// @param [in] y The power to which to raise x. +/// +/// @returns +/// The value of the first parameter raised to the power of the second. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxPow(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return pow(x, y); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat16 ffxSqrt(FfxFloat16 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxSqrt(FfxFloat16x2 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x.
+/// +/// @ingroup GPUCore +FfxFloat16x3 ffxSqrt(FfxFloat16x3 x) +{ + return sqrt(x); +} + +/// Compute the square root of a value. +/// +/// @param [in] x The value to compute the square root of. +/// +/// @returns +/// The square root of x. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxSqrt(FfxFloat16x4 x) +{ + return sqrt(x); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; its bits are OR-ed with the sign bit, so it is expected to be positive. +/// @param [in] s The value to copy the sign bit (bit 15) from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPUCore +FfxFloat16 ffxCopySignBitHalf(FfxFloat16 d, FfxFloat16 s) +{ + return FFX_TO_FLOAT16(FFX_TO_UINT16(d) | (FFX_TO_UINT16(s) & FFX_BROADCAST_UINT16(0x8000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; its bits are OR-ed with the sign bit, so it is expected to be positive. +/// @param [in] s The value to copy the sign bit (bit 15) from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxCopySignBitHalf(FfxFloat16x2 d, FfxFloat16x2 s) +{ + return FFX_TO_FLOAT16X2(FFX_TO_UINT16X2(d) | (FFX_TO_UINT16X2(s) & FFX_BROADCAST_UINT16X2(0x8000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; its bits are OR-ed with the sign bit, so it is expected to be positive. +/// @param [in] s The value to copy the sign bit (bit 15) from. +/// +/// @returns +/// The value of d with the sign bit from s. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxCopySignBitHalf(FfxFloat16x3 d, FfxFloat16x3 s) +{ + return FFX_TO_FLOAT16X3(FFX_TO_UINT16X3(d) | (FFX_TO_UINT16X3(s) & FFX_BROADCAST_UINT16X3(0x8000u))); +} + +/// Copy the sign bit from 's' to positive 'd'. +/// +/// @param [in] d The value to copy the sign bit into; its bits are OR-ed with the sign bit, so it is expected to be positive. +/// @param [in] s The value to copy the sign bit (bit 15) from. +/// +/// @returns +/// The value of d with the sign bit from s.
+/// +/// @ingroup GPUCore +FfxFloat16x4 ffxCopySignBitHalf(FfxFloat16x4 d, FfxFloat16x4 s) +{ + return FFX_TO_FLOAT16X4(FFX_TO_UINT16X4(d) | (FFX_TO_UINT16X4(s) & FFX_BROADCAST_UINT16X4(0x8000u))); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or positive. +/// +/// @ingroup GPUCore +FfxFloat16 ffxIsSignedHalf(FfxFloat16 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or positive. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxIsSignedHalf(FfxFloat16x2 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X2(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or positive.
+/// +/// @ingroup GPUCore +FfxFloat16x3 ffxIsSignedHalf(FfxFloat16x3 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X3(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following: +/// m = NaN := 0 +/// m >= 0 := 0 +/// m < 0 := 1 +/// +/// Uses the following useful floating point logic, +/// saturate(+a*(-INF)==-INF) := 0 +/// saturate( 0*(-INF)== NaN) := 0 +/// saturate(-a*(-INF)==+INF) := 1 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against 0. +/// +/// @returns +/// 1.0 when the value is negative, or 0.0 when the value is 0 or positive. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxIsSignedHalf(FfxFloat16x4 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X4(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// A single operation to return the following (assuming saturate(NaN) is 0, as for ffxIsSignedHalf): +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat16 ffxIsGreaterThanZeroHalf(FfxFloat16 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16(FFX_POSITIVE_INFINITY_HALF)); +} + +/// A single operation to return the following (assuming saturate(NaN) is 0, as for ffxIsSignedHalf): +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxIsGreaterThanZeroHalf(FfxFloat16x2 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X2(FFX_POSITIVE_INFINITY_HALF)); +} + +/// A single operation to return the following (assuming saturate(NaN) is 0, as for ffxIsSignedHalf): +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic.
+/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxIsGreaterThanZeroHalf(FfxFloat16x3 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X3(FFX_POSITIVE_INFINITY_HALF)); +} + +/// A single operation to return the following (assuming saturate(NaN) is 0, as for ffxIsSignedHalf): +/// m = NaN := 0 +/// m > 0 := 1 +/// m <= 0 := 0 +/// +/// This function is useful when creating masks for branch-free logic. +/// +/// @param [in] m The value to test against zero. +/// +/// @returns +/// 1.0 when the value is positive, or 0.0 when the value is 0 or negative. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxIsGreaterThanZeroHalf(FfxFloat16x4 m) +{ + return ffxSaturate(m * FFX_BROADCAST_FLOAT16X4(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Convert a 16bit floating point value to sortable integer. +/// +/// - If sign bit=0, flip the sign bit (positives). +/// - If sign bit=1, flip all bits (negatives). +/// +/// The function has the following properties: +/// - Larger integers are more positive values. +/// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +/// +/// @param [in] x The bit pattern of the floating point value to make sortable. +/// +/// @returns +/// The sortable integer value. +/// +/// @ingroup GPUCore +FfxUInt16 ffxFloatToSortableIntegerHalf(FfxUInt16 x) +{ + return x ^ ((ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16(15))) | FFX_BROADCAST_UINT16(0x8000)); +} + +/// Convert a sortable integer to a 16bit floating point value. +/// +/// The function applies the following: +/// - If sign bit=1, flip the sign bit (positives). +/// - If sign bit=0, flip all bits (negatives). +/// +/// @param [in] x The sortable integer value to make floating point. +/// +/// @returns +/// The bit pattern of the floating point value.
+/// +/// @ingroup GPUCore +FfxUInt16 ffxSortableIntegerToFloatHalf(FfxUInt16 x) +{ + return x ^ ((~ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16(15))) | FFX_BROADCAST_UINT16(0x8000)); +} + +/// Convert a pair of 16bit floating point values to a pair of sortable integers. +/// +/// - If sign bit=0, flip the sign bit (positives). +/// - If sign bit=1, flip all bits (negatives). +/// +/// The function has the side effects that: +/// - Larger integers are more positive values. +/// - Float zero is mapped to center of integers (so clear to integer zero is a nice default for atomic max usage). +/// +/// @param [in] x The floating point values to make sortable. +/// +/// @returns +/// The sortable integer values. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxFloatToSortableIntegerHalf(FfxUInt16x2 x) +{ + return x ^ ((ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16X2(15))) | FFX_BROADCAST_UINT16X2(0x8000)); +} + +/// Convert a pair of sortable integers to a pair of 16bit floating point values. +/// +/// The function has the side effects that: +/// - If sign bit=1, flip the sign bit (positives). +/// - If sign bit=0, flip all bits (negatives). +/// +/// @param [in] x The sortable integer values to make floating point. +/// +/// @returns +/// The floating point values. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxSortableIntegerToFloatHalf(FfxUInt16x2 x) +{ + return x ^ ((~ffxBitShiftRightHalf(x, FFX_BROADCAST_UINT16X2(15))) | FFX_BROADCAST_UINT16X2(0x8000)); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y0 [Zero] X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. 
+/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesZeroY0ZeroX0(FfxUInt32x2 i) +{ + return ((i.x) & 0xffu) | ((i.y << 16) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y1 [Zero] X1 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesZeroY1ZeroX1(FfxUInt32x2 i) +{ + return ((i.x >> 8) & 0xffu) | ((i.y << 8) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y2 [Zero] X2 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesZeroY2ZeroX2(FfxUInt32x2 i) +{ + return ((i.x >> 16) & 0xffu) | ((i.y) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// [Zero] Y3 [Zero] X3 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesZeroY3ZeroX3(FfxUInt32x2 i) +{ + return ((i.x >> 24) & 0xffu) | ((i.y >> 8) & 0xff0000u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 Y1 X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. 
+/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY3Y2Y1X0(FfxUInt32x2 i) +{ + return ((i.x) & 0x000000ffu) | (i.y & 0xffffff00u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 Y1 X2 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY3Y2Y1X2(FfxUInt32x2 i) +{ + return ((i.x >> 16) & 0x000000ffu) | (i.y & 0xffffff00u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 X0 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY3Y2X0Y0(FfxUInt32x2 i) +{ + return ((i.x << 8) & 0x0000ff00u) | (i.y & 0xffff00ffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 Y2 X2 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY3Y2X2Y0(FfxUInt32x2 i) +{ + return ((i.x >> 8) & 0x0000ff00u) | (i.y & 0xffff00ffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 X0 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. 
+/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY3X0Y1Y0(FfxUInt32x2 i) +{ + return ((i.x << 16) & 0x00ff0000u) | (i.y & 0xff00ffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y3 X2 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY3X2Y1Y0(FfxUInt32x2 i) +{ + return ((i.x) & 0x00ff0000u) | (i.y & 0xff00ffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// X0 Y2 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesX0Y2Y1Y0(FfxUInt32x2 i) +{ + return ((i.x << 24) & 0xff000000u) | (i.y & 0x00ffffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// X2 Y2 Y1 Y0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesX2Y2Y1Y0(FfxUInt32x2 i) +{ + return ((i.x << 8) & 0xff000000u) | (i.y & 0x00ffffffu); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y2 X2 Y0 X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. 
+/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY2X2Y0X0(FfxUInt32x2 i) +{ + return ((i.x) & 0x00ff00ffu) | ((i.y << 8) & 0xff00ff00u); +} + +/// Packs the bytes from the X and Y components of a FfxUInt32x2 into a single 32-bit integer. +/// +/// The resulting integer will contain bytes in the following order, from most to least significant: +/// Y2 Y0 X2 X0 +/// +/// @param [in] i The integer pair to pack. +/// +/// @returns +/// The packed integer value. +/// +/// @ingroup GPUCore +FfxUInt32 ffxPackBytesY2Y0X2X0(FfxUInt32x2 i) +{ + return (((i.x) & 0xffu) | ((i.x >> 8) & 0xff00u) | ((i.y << 16) & 0xff0000u) | ((i.y << 8) & 0xff000000u)); +} + +/// Takes two Float16x2 values x and y, normalizes them and builds a single Uint16x2 value in the format {{x0,y0},{x1,y1}}. +/// +/// @param [in] x The first float16x2 value to pack. +/// @param [in] y The second float16x2 value to pack. +/// +/// @returns +/// The packed FfxUInt16x2 value. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxPackX0Y0X1Y1UnsignedToUint16x2(FfxFloat16x2 x, FfxFloat16x2 y) +{ + x *= FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0); + y *= FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0); + return FFX_UINT32_TO_UINT16X2(ffxPackBytesY2X2Y0X0(FfxUInt32x2(FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(x)), FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(y))))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[0:7], +/// d.y[0:7] into r.y[0:7], i.x[8:15] into r.x[8:15], r.y[8:15] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// r=ffxPermuteUByte0Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? +/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed.
+/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteUByte0Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2Y1X0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2Y1X2(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[8:15], +/// d.y[0:7] into r.y[8:15], i.x[0:7] into r.x[0:7], r.y[0:7] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// r=ffxPermuteUByte1Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? +/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteUByte1Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2X0Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2X2Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[16:23], +/// d.y[0:7] into r.y[16:23], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[8:15] into r.x[24:31], r.y[24:31] using 3 ops. +/// +/// r=ffxPermuteUByte2Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? 
+/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteUByte2Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3X0Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3X2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[24:31], +/// d.y[0:7] into r.y[24:31], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[0:7] into r.x[16:23], r.y[16:23] using 3 ops. +/// +/// r=ffxPermuteUByte3Float16x2ToUint2(d,i) +/// Where 'k0' is an SGPR with {1.0/32768.0} packed into the lower 16-bits +/// Where 'k1' is an SGPR with 0x???? +/// Where 'k2' is an SGPR with 0x???? +/// V_PK_FMA_F16 i,i,k0.x,0 +/// V_PERM_B32 r.x,i,i,k1 +/// V_PERM_B32 r.y,i,i,k2 +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteUByte3Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0))); + return FfxUInt32x2(ffxPackBytesX0Y2Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesX2Y2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[0:7] into r.x[0:7] and i.y[0:7] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2.
+/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteUByte0Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY0ZeroX0(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[8:15] into r.x[0:7] and i.y[8:15] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteUByte1Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY1ZeroX1(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[16:23] into r.x[0:7] and i.y[16:23] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteUByte2Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY2ZeroX2(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[24:31] into r.x[0:7] and i.y[24:31] into r.y[0:7] using 2 ops. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteUByte3Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY3ZeroX3(i))) * FFX_BROADCAST_FLOAT16X2(32768.0); +} + +/// Takes two Float16x2 values x and y, normalizes them and builds a single Uint16x2 value in the format {{x0,y0},{x1,y1}}. +/// +/// @param [in] x The first float16x2 value to pack. +/// @param [in] y The second float16x2 value to pack. +/// +/// @returns +/// The packed FfxUInt16x2 value.
+/// +/// @ingroup GPUCore +FfxUInt16x2 ffxPackX0Y0X1Y1SignedToUint16x2(FfxFloat16x2 x, FfxFloat16x2 y) +{ + x = x * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0); + y = y * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0); + return FFX_UINT32_TO_UINT16X2(ffxPackBytesY2X2Y0X0(FfxUInt32x2(FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(x)), FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(y))))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[0:7], +/// d.y[0:7] into r.y[0:7], i.x[8:15] into r.x[8:15], r.y[8:15] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteSByte0Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2Y1X0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2Y1X2(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[8:15], +/// d.y[0:7] into r.y[8:15], i.x[0:7] into r.x[0:7], r.y[0:7] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. 
+/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteSByte1Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3Y2X0Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2X2Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[16:23], +/// d.y[0:7] into r.y[16:23], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[8:15] into r.x[24:31], r.y[24:31] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteSByte2Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesY3X0Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3X2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[24:31], +/// d.y[0:7] into r.y[24:31], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[0:7] into r.x[16:23], r.y[16:23] using 3 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. 
+/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteSByte3Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))); + return FfxUInt32x2(ffxPackBytesX0Y2Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesX2Y2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[0:7], +/// d.y[0:7] into r.y[0:7], i.x[8:15] into r.x[8:15], r.y[8:15] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteZeroBasedSByte0Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesY3Y2Y1X0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2Y1X2(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[8:15], +/// d.y[0:7] into r.y[8:15], i.x[0:7] into r.x[0:7], r.y[0:7] and i.y[0:15] into r.x[16:31], r.y[16:31] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. 
+/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteZeroBasedSByte1Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesY3Y2X0Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3Y2X2Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[16:23], +/// d.y[0:7] into r.y[16:23], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[8:15] into r.x[24:31], r.y[24:31] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. +/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteZeroBasedSByte2Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesY3X0Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesY3X2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value d, Float16x2 value i and a resulting FfxUInt32x2 value r, this function packs d.x[0:7] into r.x[24:31], +/// d.y[0:7] into r.y[24:31], i.x[0:15] into r.x[0:15], r.y[0:15] and i.y[0:7] into r.x[16:23], r.y[16:23] using 3 ops. +/// +/// Zero-based flips the MSB bit of the byte (making 128 "exact zero" actually zero). +/// This is useful if there is a desire for cleared values to decode as zero. +/// +/// Handles signed byte values. +/// +/// @param [in] d The FfxUInt32x2 value to be packed. 
+/// @param [in] i The FfxFloat16x2 value to be packed. +/// +/// @returns +/// The packed FfxUInt32x2 value. +/// +/// @ingroup GPUCore +FfxUInt32x2 ffxPermuteZeroBasedSByte3Float16x2ToUint2(FfxUInt32x2 d, FfxFloat16x2 i) +{ + FfxUInt32 b = FFX_UINT16X2_TO_UINT32(FFX_TO_UINT16X2(i * FFX_BROADCAST_FLOAT16X2(1.0 / 32768.0) + FFX_BROADCAST_FLOAT16X2(0.25 / 32768.0))) ^ 0x00800080u; + return FfxUInt32x2(ffxPackBytesX0Y2Y1Y0(FfxUInt32x2(d.x, b)), ffxPackBytesX2Y2Y1Y0(FfxUInt32x2(d.y, b))); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[0:7] into r.x[0:7] and i.y[0:7] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteSByte0Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY0ZeroX0(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[8:15] into r.x[0:7] and i.y[8:15] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteSByte1Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY1ZeroX1(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[16:23] into r.x[0:7] and i.y[16:23] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2.
+/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteSByte2Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY2ZeroX2(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[24:31] into r.x[0:7] and i.y[24:31] into r.y[0:7] using 2 ops. +/// +/// Handles signed byte values. +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteSByte3Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY3ZeroX3(i))) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[0:7] into r.x[0:7] and i.y[0:7] into r.y[0:7] using 2 ops. +/// +/// Handles zero-based signed byte values (the MSB of each extracted byte is flipped). +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteZeroBasedSByte0Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY0ZeroX0(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[8:15] into r.x[0:7] and i.y[8:15] into r.y[0:7] using 2 ops. +/// +/// Handles zero-based signed byte values (the MSB of each extracted byte is flipped). +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2.
+/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteZeroBasedSByte1Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY1ZeroX1(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[16:23] into r.x[0:7] and i.y[16:23] into r.y[0:7] using 2 ops. +/// +/// Handles zero-based signed byte values (the MSB of each extracted byte is flipped). +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteZeroBasedSByte2Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY2ZeroX2(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Given a FfxUInt32x2 value i and a resulting Float16x2 value r, this function unpacks i.x[24:31] into r.x[0:7] and i.y[24:31] into r.y[0:7] using 2 ops. +/// +/// Handles zero-based signed byte values (the MSB of each extracted byte is flipped). +/// +/// @param [in] i The FfxUInt32x2 value to be unpacked. +/// +/// @returns +/// The unpacked FfxFloat16x2. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxPermuteZeroBasedSByte3Uint2ToFloat16x2(FfxUInt32x2 i) +{ + return FFX_TO_FLOAT16X2(FFX_UINT32_TO_UINT16X2(ffxPackBytesZeroY3ZeroX3(i) ^ 0x00800080u)) * FFX_BROADCAST_FLOAT16X2(32768.0) - FFX_BROADCAST_FLOAT16X2(0.25); +} + +/// Calculate a half-precision low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the square root for.
+/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16 ffxApproximateSqrtHalf(FfxFloat16 a) +{ + return FFX_TO_FLOAT16((FFX_TO_UINT16(a) >> FFX_BROADCAST_UINT16(1)) + FFX_BROADCAST_UINT16(0x1de2)); +} + +/// Calculate a half-precision low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxApproximateSqrtHalf(FfxFloat16x2 a) +{ + return FFX_TO_FLOAT16X2((FFX_TO_UINT16X2(a) >> FFX_BROADCAST_UINT16X2(1)) + FFX_BROADCAST_UINT16X2(0x1de2)); +} + +/// Calculate a half-precision low-quality approximation for the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the square root for. +/// +/// @returns +/// An approximation of the square root, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxApproximateSqrtHalf(FfxFloat16x3 a) +{ + return FFX_TO_FLOAT16X3((FFX_TO_UINT16X3(a) >> FFX_BROADCAST_UINT16X3(1)) + FFX_BROADCAST_UINT16X3(0x1de2)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. 
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16 ffxApproximateReciprocalHalf(FfxFloat16 a) +{ + return FFX_TO_FLOAT16(FFX_BROADCAST_UINT16(0x7784) - FFX_TO_UINT16(a)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxApproximateReciprocalHalf(FfxFloat16x2 a) +{ + return FFX_TO_FLOAT16X2(FFX_BROADCAST_UINT16X2(0x7784) - FFX_TO_UINT16X2(a)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. 
+/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxApproximateReciprocalHalf(FfxFloat16x3 a) +{ + return FFX_TO_FLOAT16X3(FFX_BROADCAST_UINT16X3(0x7784) - FFX_TO_UINT16X3(a)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to low quality. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxApproximateReciprocalHalf(FfxFloat16x4 a) +{ + return FFX_TO_FLOAT16X4(FFX_BROADCAST_UINT16X4(0x7784) - FFX_TO_UINT16X4(a)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value to calculate an approximate to the reciprocal for. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @ingroup GPUCore +FfxFloat16 ffxApproximateReciprocalMediumHalf(FfxFloat16 a) +{ + FfxFloat16 b = FFX_TO_FLOAT16(FFX_BROADCAST_UINT16(0x778d) - FFX_TO_UINT16(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16(2.0)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. 
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal is approximated. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @note Refines an initial estimate b (constant 0x778d) with one step of b * (2 - a * b). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxApproximateReciprocalMediumHalf(FfxFloat16x2 a) +{ + FfxFloat16x2 b = FFX_TO_FLOAT16X2(FFX_BROADCAST_UINT16X2(0x778d) - FFX_TO_UINT16X2(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16X2(2.0)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal is approximated. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @note Refines an initial estimate b (constant 0x778d) with one step of b * (2 - a * b). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxApproximateReciprocalMediumHalf(FfxFloat16x3 a) +{ + FfxFloat16x3 b = FFX_TO_FLOAT16X3(FFX_BROADCAST_UINT16X3(0x778d) - FFX_TO_UINT16X3(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16X3(2.0)); +} + +/// Calculate a half-precision medium-quality approximation for the reciprocal of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal is approximated. +/// +/// @returns +/// An approximation of the reciprocal, estimated to medium quality. +/// +/// @note Refines an initial estimate b (constant 0x778d) with one step of b * (2 - a * b). +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxApproximateReciprocalMediumHalf(FfxFloat16x4 a) +{ + FfxFloat16x4 b = FFX_TO_FLOAT16X4(FFX_BROADCAST_UINT16X4(0x778d) - FFX_TO_UINT16X4(a)); + return b * (-b * a + FFX_BROADCAST_FLOAT16X4(2.0)); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal square root is approximated. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @note Computed as 0x59a3 - (FFX_TO_UINT16(a) >> 1), with no refinement step. +/// +/// @ingroup GPUCore +FfxFloat16 ffxApproximateReciprocalSquareRootHalf(FfxFloat16 a) +{ + return FFX_TO_FLOAT16(FFX_BROADCAST_UINT16(0x59a3) - (FFX_TO_UINT16(a) >> FFX_BROADCAST_UINT16(1))); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal square root is approximated. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @note Computed as 0x59a3 - (FFX_TO_UINT16X2(a) >> 1), with no refinement step. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxApproximateReciprocalSquareRootHalf(FfxFloat16x2 a) +{ + return FFX_TO_FLOAT16X2(FFX_BROADCAST_UINT16X2(0x59a3) - (FFX_TO_UINT16X2(a) >> FFX_BROADCAST_UINT16X2(1))); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value. +/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal square root is approximated. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @note Computed as 0x59a3 - (FFX_TO_UINT16X3(a) >> 1), with no refinement step. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxApproximateReciprocalSquareRootHalf(FfxFloat16x3 a) +{ + return FFX_TO_FLOAT16X3(FFX_BROADCAST_UINT16X3(0x59a3) - (FFX_TO_UINT16X3(a) >> FFX_BROADCAST_UINT16X3(1))); +} + +/// Calculate a half-precision low-quality approximation for the reciprocal of the square root of a value.
+/// +/// For additional information on the approximation family of functions, you can refer to Michal Drobot's excellent +/// presentation materials: +/// +/// - https://michaldrobot.files.wordpress.com/2014/05/gcn_alu_opt_digitaldragons2014.pdf +/// - https://github.com/michaldrobot/ShaderFastLibs/blob/master/ShaderFastMathLib.h +/// +/// @param [in] a The value whose reciprocal square root is approximated. +/// +/// @returns +/// An approximation of the reciprocal of the square root, estimated to low quality. +/// +/// @note Computed as 0x59a3 - (FFX_TO_UINT16X4(a) >> 1), with no refinement step. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxApproximateReciprocalSquareRootHalf(FfxFloat16x4 a) +{ + return FFX_TO_FLOAT16X4(FFX_BROADCAST_UINT16X4(0x59a3) - (FFX_TO_UINT16X4(a) >> FFX_BROADCAST_UINT16X4(1))); +} + +/// An approximation of sine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate sine for. +/// +/// @returns +/// The approximate sine of the value, computed as x * abs(x) - x. +FfxFloat16 ffxParabolicSinHalf(FfxFloat16 x) +{ + return x * abs(x) - x; +} + +/// An approximation of sine (per component). +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate sine for. +/// +/// @returns +/// The approximate sine of the value, computed as x * abs(x) - x. +FfxFloat16x2 ffxParabolicSinHalf(FfxFloat16x2 x) +{ + return x * abs(x) - x; +} + +/// An approximation of cosine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate cosine for. +/// +/// @returns +/// The approximate cosine of the value.
+FfxFloat16 ffxParabolicCosHalf(FfxFloat16 x) +{ + x = ffxFract(x * FFX_BROADCAST_FLOAT16(0.5) + FFX_BROADCAST_FLOAT16(0.75)); + x = x * FFX_BROADCAST_FLOAT16(2.0) - FFX_BROADCAST_FLOAT16(1.0); + return ffxParabolicSinHalf(x); +} + +/// An approximation of cosine (per component). +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate cosine for. +/// +/// @returns +/// The approximate cosine of the value, obtained by remapping x and evaluating ffxParabolicSinHalf. +FfxFloat16x2 ffxParabolicCosHalf(FfxFloat16x2 x) +{ + x = ffxFract(x * FFX_BROADCAST_FLOAT16X2(0.5) + FFX_BROADCAST_FLOAT16X2(0.75)); + x = x * FFX_BROADCAST_FLOAT16X2(2.0) - FFX_BROADCAST_FLOAT16X2(1.0); + return ffxParabolicSinHalf(x); +} + +/// An approximation of both sine and cosine. +/// +/// Valid input range is {-1 to 1} representing {0 to 2 pi}, and the output range +/// is {-1/4 to 1/4} representing {-1 to 1}. +/// +/// @param [in] x The value to calculate approximate sine and cosine for. +/// +/// @returns +/// A FfxFloat16x2 containing the approximate sine (first component) and cosine (second component) of the value. +FfxFloat16x2 ffxParabolicSinCosHalf(FfxFloat16 x) +{ + FfxFloat16 y = ffxFract(x * FFX_BROADCAST_FLOAT16(0.5) + FFX_BROADCAST_FLOAT16(0.75)); + y = y * FFX_BROADCAST_FLOAT16(2.0) - FFX_BROADCAST_FLOAT16(1.0); + return ffxParabolicSinHalf(FfxFloat16x2(x, y)); +} + +/// Conditional free logic AND operation using two FfxUInt16 values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxUInt16 ffxZeroOneAndHalf(FfxUInt16 x, FfxUInt16 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator.
+/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxZeroOneAndHalf(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two FfxUInt16x3 values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxUInt16x3 ffxZeroOneAndHalf(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two FfxUInt16x4 values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxUInt16x4 ffxZeroOneAndHalf(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return min(x, y); +} + +/// Conditional free logic NOT operation on a FfxUInt16 value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// @note Implemented as x XOR FFX_BROADCAST_UINT16(1). +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPUCore +FfxUInt16 ffxZeroOneNotHalf(FfxUInt16 x) +{ + return x ^ FFX_BROADCAST_UINT16(1); +} + +/// Conditional free logic NOT operation on a FfxUInt16x2 value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// @note Implemented as x XOR FFX_BROADCAST_UINT16X2(1). +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxZeroOneNotHalf(FfxUInt16x2 x) +{ + return x ^ FFX_BROADCAST_UINT16X2(1); +} + +/// Conditional free logic NOT operation on a FfxUInt16x3 value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// @note Implemented as x XOR FFX_BROADCAST_UINT16X3(1). +/// +/// @returns +/// Result of the NOT operation.
+/// +/// @ingroup GPUCore +FfxUInt16x3 ffxZeroOneNotHalf(FfxUInt16x3 x) +{ + return x ^ FFX_BROADCAST_UINT16X3(1); +} + +/// Conditional free logic NOT operation on a FfxUInt16x4 value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// @note Implemented as x XOR FFX_BROADCAST_UINT16X4(1). +/// +/// @returns +/// Result of the NOT operation. +/// +/// @ingroup GPUCore +FfxUInt16x4 ffxZeroOneNotHalf(FfxUInt16x4 x) +{ + return x ^ FFX_BROADCAST_UINT16X4(1); +} + +/// Conditional free logic OR operation using two FfxUInt16 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y). +/// +/// @ingroup GPUCore +FfxUInt16 ffxZeroOneOrHalf(FfxUInt16 x, FfxUInt16 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two FfxUInt16x2 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y). +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxZeroOneOrHalf(FfxUInt16x2 x, FfxUInt16x2 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two FfxUInt16x3 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y). +/// +/// @ingroup GPUCore +FfxUInt16x3 ffxZeroOneOrHalf(FfxUInt16x3 x, FfxUInt16x3 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two FfxUInt16x4 values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y).
+/// +/// @ingroup GPUCore +FfxUInt16x4 ffxZeroOneOrHalf(FfxUInt16x4 x, FfxUInt16x4 y) +{ + return max(x, y); +} + +/// Convert a FfxFloat16 value between 0.0 and 1.0 to a FfxUInt16. +/// +/// @param [in] x The value to be converted to a FfxUInt16. +/// +/// @returns +/// The converted FfxUInt16 value. +/// +/// @ingroup GPUCore +FfxUInt16 ffxZeroOneFloat16ToUint16(FfxFloat16 x) +{ + return FFX_TO_UINT16(x * FFX_TO_FLOAT16(FFX_TO_UINT16(1))); +} + +/// Convert a FfxFloat16x2 value between 0.0 and 1.0 to a FfxUInt16x2. +/// +/// @param [in] x The value to be converted to a FfxUInt16x2. +/// +/// @returns +/// The converted FfxUInt16x2 value. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxZeroOneFloat16x2ToUint16x2(FfxFloat16x2 x) +{ + return FFX_TO_UINT16X2(x * FFX_TO_FLOAT16X2(FfxUInt16x2(1, 1))); +} + +/// Convert a FfxFloat16x3 value between 0.0 and 1.0 to a FfxUInt16x3. +/// +/// @param [in] x The value to be converted to a FfxUInt16x3. +/// +/// @returns +/// The converted FfxUInt16x3 value. +/// +/// @ingroup GPUCore +FfxUInt16x3 ffxZeroOneFloat16x3ToUint16x3(FfxFloat16x3 x) +{ + return FFX_TO_UINT16X3(x * FFX_TO_FLOAT16X3(FfxUInt16x3(1, 1, 1))); +} + +/// Convert a FfxFloat16x4 value between 0.0 and 1.0 to a FfxUInt16x4. +/// +/// @param [in] x The value to be converted to a FfxUInt16x4. +/// +/// @returns +/// The converted FfxUInt16x4 value. +/// +/// @ingroup GPUCore +FfxUInt16x4 ffxZeroOneFloat16x4ToUint16x4(FfxFloat16x4 x) +{ + return FFX_TO_UINT16X4(x * FFX_TO_FLOAT16X4(FfxUInt16x4(1, 1, 1, 1))); +} + +/// Convert a FfxUInt16 value between 0 and 1 to a FfxFloat16. +/// +/// @param [in] x The value to be converted to a FfxFloat16. +/// +/// @returns +/// The converted FfxFloat16 value.
+/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneUint16ToFloat16(FfxUInt16 x) +{ + return FFX_TO_FLOAT16(x * FFX_TO_UINT16(FFX_TO_FLOAT16(1.0))); +} + +/// Convert a FfxUInt16x2 value between 0 and 1 to a FfxFloat16x2. +/// +/// @param [in] x The value to be converted to a FfxFloat16x2. +/// +/// @returns +/// The converted FfxFloat16x2 value. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneUint16x2ToFloat16x2(FfxUInt16x2 x) +{ + return FFX_TO_FLOAT16X2(x * FFX_TO_UINT16X2(FfxUInt16x2(FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0)))); +} + +/// Convert a FfxUInt16x3 value between 0 and 1 to a FfxFloat16x3. +/// +/// @param [in] x The value to be converted to a FfxFloat16x3. +/// +/// @returns +/// The converted FfxFloat16x3 value. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneUint16x3ToFloat16x3(FfxUInt16x3 x) +{ + return FFX_TO_FLOAT16X3(x * FFX_TO_UINT16X3(FfxUInt16x3(FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0)))); +} + +/// Convert a FfxUInt16x4 value between 0 and 1 to a FfxFloat16x4. +/// +/// @param [in] x The value to be converted to a FfxFloat16x4. +/// +/// @returns +/// The converted FfxFloat16x4 value. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneUint16x4ToFloat16x4(FfxUInt16x4 x) +{ + return FFX_TO_FLOAT16X4(x * FFX_TO_UINT16X4(FfxUInt16x4(FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0), FFX_TO_FLOAT16(1.0)))); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneAndHalf(FfxFloat16 x, FfxFloat16 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values.
+/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneAndHalf(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneAndHalf(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return min(x, y); +} + +/// Conditional free logic AND operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// +/// @returns +/// Result of the AND operation, computed as min(x, y). +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneAndHalf(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return min(x, y); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// NOTE(review): the function name says "AndOr" while the arithmetic below is 1 - x * y; confirm the intended name. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation, computed as 1 - x * y. +/// +/// @ingroup GPUCore +FfxFloat16 ffxSignedZeroOneAndOrHalf(FfxFloat16 x, FfxFloat16 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16(1.0); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation, computed as 1 - x * y.
+/// +/// @ingroup GPUCore +FfxFloat16x2 ffxSignedZeroOneAndOrHalf(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16X2(1.0); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation, computed as 1 - x * y. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxSignedZeroOneAndOrHalf(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16X3(1.0); +} + +/// Conditional free logic AND NOT operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the AND NOT operator. +/// @param [in] y The second value to be fed into the AND NOT operator. +/// +/// @returns +/// Result of the AND NOT operation, computed as 1 - x * y. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxSignedZeroOneAndOrHalf(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return (-x) * y + FFX_BROADCAST_FLOAT16X4(1.0); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// an OR operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The value to be OR-ed with the result of the AND operation. +/// +/// @returns +/// Result of the AND OR operation, computed as ffxSaturate(x * y + z). +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneAndOrHalf(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// an OR operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The value to be OR-ed with the result of the AND operation.
+/// +/// @returns +/// Result of the AND OR operation, computed as ffxSaturate(x * y + z). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneAndOrHalf(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// an OR operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The value to be OR-ed with the result of the AND operation. +/// +/// @returns +/// Result of the AND OR operation, computed as ffxSaturate(x * y + z). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneAndOrHalf(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + return ffxSaturate(x * y + z); +} + +/// Conditional free logic AND operation using two half-precision values followed by +/// an OR operation using the resulting value and a third half-precision value. +/// +/// @param [in] x The first value to be fed into the AND operator. +/// @param [in] y The second value to be fed into the AND operator. +/// @param [in] z The value to be OR-ed with the result of the AND operation. +/// +/// @returns +/// Result of the AND OR operation, computed as ffxSaturate(x * y + z). +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneAndOrHalf(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + return ffxSaturate(x * y + z); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison, computed by saturating x * FFX_POSITIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison, computed by saturating x * FFX_POSITIVE_INFINITY_HALF.
+/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16x2 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X2(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison, computed by saturating x * FFX_POSITIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16x3 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X3(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if greater than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the greater than zero comparison, computed by saturating x * FFX_POSITIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneIsGreaterThanZeroHalf(FfxFloat16x4 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X4(FFX_POSITIVE_INFINITY_HALF)); +} + +/// Conditional free logic NOT operation on a half-precision value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation, computed as 1 - x. +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneNotHalf(FfxFloat16 x) +{ + return FFX_BROADCAST_FLOAT16(1.0) - x; +} + +/// Conditional free logic NOT operation on a half-precision value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation, computed as 1 - x. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneNotHalf(FfxFloat16x2 x) +{ + return FFX_BROADCAST_FLOAT16X2(1.0) - x; +} + +/// Conditional free logic NOT operation on a half-precision value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation, computed as 1 - x.
+/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneNotHalf(FfxFloat16x3 x) +{ + return FFX_BROADCAST_FLOAT16X3(1.0) - x; +} + +/// Conditional free logic NOT operation on a half-precision value. +/// +/// @param [in] x The value to be fed into the NOT operator. +/// +/// @returns +/// Result of the NOT operation, computed as 1 - x. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneNotHalf(FfxFloat16x4 x) +{ + return FFX_BROADCAST_FLOAT16X4(1.0) - x; +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y). +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneOrHalf(FfxFloat16 x, FfxFloat16 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneOrHalf(FfxFloat16x2 x, FfxFloat16x2 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneOrHalf(FfxFloat16x3 x, FfxFloat16x3 y) +{ + return max(x, y); +} + +/// Conditional free logic OR operation using two half-precision values. +/// +/// @param [in] x The first value to be fed into the OR operator. +/// @param [in] y The second value to be fed into the OR operator. +/// +/// @returns +/// Result of the OR operation, computed as max(x, y).
+/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneOrHalf(FfxFloat16x4 x, FfxFloat16x4 y) +{ + return max(x, y); +} + +/// Choose between two half-precision values if the first parameter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparison is greater than zero. +/// @param [in] z The value to return if the comparison is less than or equal to zero. +/// +/// @returns +/// The selected value, computed as x * y + (z - x * z). +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneSelectHalf(FfxFloat16 x, FfxFloat16 y, FfxFloat16 z) +{ + FfxFloat16 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two half-precision values if the first parameter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparison is greater than zero. +/// @param [in] z The value to return if the comparison is less than or equal to zero. +/// +/// @returns +/// The selected value, computed as x * y + (z - x * z). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneSelectHalf(FfxFloat16x2 x, FfxFloat16x2 y, FfxFloat16x2 z) +{ + FfxFloat16x2 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two half-precision values if the first parameter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparison is greater than zero. +/// @param [in] z The value to return if the comparison is less than or equal to zero. +/// +/// @returns +/// The selected value, computed as x * y + (z - x * z). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneSelectHalf(FfxFloat16x3 x, FfxFloat16x3 y, FfxFloat16x3 z) +{ + FfxFloat16x3 r = (-x) * z + z; + return x * y + r; +} + +/// Choose between two half-precision values if the first parameter is greater than zero. +/// +/// @param [in] x The value to compare against zero. +/// @param [in] y The value to return if the comparison is greater than zero.
+/// @param [in] z The value to return if the comparison is less than or equal to zero. +/// +/// @returns +/// The selected value, computed as x * y + (z - x * z). +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneSelectHalf(FfxFloat16x4 x, FfxFloat16x4 y, FfxFloat16x4 z) +{ + FfxFloat16x4 r = (-x) * z + z; + return x * y + r; +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign test, computed by saturating x * FFX_NEGATIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16 ffxZeroOneIsSignedHalf(FfxFloat16 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign test, computed by saturating x * FFX_NEGATIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxZeroOneIsSignedHalf(FfxFloat16x2 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X2(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign test, computed by saturating x * FFX_NEGATIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxZeroOneIsSignedHalf(FfxFloat16x3 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X3(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Given a half-precision value, returns 1.0 if less than zero and 0.0 if not. +/// +/// @param [in] x The value to be compared. +/// +/// @returns +/// Result of the sign test, computed by saturating x * FFX_NEGATIVE_INFINITY_HALF. +/// +/// @ingroup GPUCore +FfxFloat16x4 ffxZeroOneIsSignedHalf(FfxFloat16x4 x) +{ + return ffxSaturate(x * FFX_BROADCAST_FLOAT16X4(FFX_NEGATIVE_INFINITY_HALF)); +} + +/// Compute a Rec.709 encoded value from a linear value. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) 
For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] c The color to convert to Rec. 709. +/// +/// @returns +/// The color in Rec.709 space. +/// +/// NOTE(review): clamp is called as clamp(threshold, linear segment, curved segment); confirm the target compiler's clamp selects the intended segment when used this way. +/// +/// @ingroup GPUCore +FfxFloat16 ffxRec709FromLinearHalf(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.099, -0.099); + return clamp(j.x, c * j.y, pow(c, j.z) * k.x + k.y); +} + +/// Compute a Rec.709 encoded value from a linear value. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] c The color to convert to Rec. 709. +/// +/// @returns +/// The color in Rec.709 space. +/// +/// NOTE(review): clamp is called as clamp(threshold, linear segment, curved segment); confirm the target compiler's clamp selects the intended segment when used this way. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxRec709FromLinearHalf(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.099, -0.099); + return clamp(j.xx, c * j.yy, pow(c, j.zz) * k.xx + k.yy); +} + +/// Compute a Rec.709 encoded value from a linear value. +/// +/// Rec.709 is used for some HDTVs. +/// +/// Both Rec.709 and sRGB have a linear segment which as spec'ed would intersect the curved segment 2 times. +/// (a.) For 8-bit sRGB, steps {0 to 10.3} are in the linear region (4% of the encoding range). +/// (b.) For 8-bit 709, steps {0 to 20.7} are in the linear region (8% of the encoding range). +/// +/// @param [in] c The color to convert to Rec. 709. +/// +/// @returns +/// The color in Rec.709 space. +/// +/// NOTE(review): clamp is called as clamp(threshold, linear segment, curved segment); confirm the target compiler's clamp selects the intended segment when used this way. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxRec709FromLinearHalf(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.018 * 4.5, 4.5, 0.45); + FfxFloat16x2 k = FfxFloat16x2(1.099, -0.099); + return clamp(j.xxx, c * j.yyy, pow(c, j.zzz) * k.xxx + k.yyy); +} + +/// Compute a gamma value from a linear value.
+/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGammaHalf. +/// +/// @param [in] c The value to convert to gamma space from linear. +/// @param [in] rcpX The reciprocal of the power value used for the gamma curve. +/// +/// @returns +/// A value in gamma space (c raised to the power rcpX). +/// +/// @ingroup GPUCore +FfxFloat16 ffxGammaFromLinearHalf(FfxFloat16 c, FfxFloat16 rcpX) +{ + return pow(c, FFX_BROADCAST_FLOAT16(rcpX)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGammaHalf. +/// +/// @param [in] c The value to convert to gamma space from linear. +/// @param [in] rcpX The reciprocal of the power value used for the gamma curve; the same scalar is used for every component. +/// +/// @returns +/// A value in gamma space (c raised to the power rcpX). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxGammaFromLinearHalf(FfxFloat16x2 c, FfxFloat16 rcpX) +{ + return pow(c, FFX_BROADCAST_FLOAT16X2(rcpX)); +} + +/// Compute a gamma value from a linear value. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// Note: 'rcpX' is '1/x', where the 'x' is what would be used in ffxLinearFromGammaHalf. +/// +/// @param [in] c The value to convert to gamma space from linear. +/// @param [in] rcpX The reciprocal of the power value used for the gamma curve; the same scalar is used for every component. +/// +/// @returns +/// A value in gamma space (c raised to the power rcpX). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxGammaFromLinearHalf(FfxFloat16x3 c, FfxFloat16 rcpX) +{ + return pow(c, FFX_BROADCAST_FLOAT16X3(rcpX)); +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] c The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space.
+/// +/// @ingroup GPUCore +FfxFloat16 ffxSrgbFromLinearHalf(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.055, -0.055); + return max(min(j.x, c * j.y), min(max(j.x, c * j.y), pow(c, j.z) * k.x + k.y)); // explicit median of (knee, linear segment, curve): clamp() is only dependable when min <= max +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] c The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space: per component, the median of the knee value, the linear segment and the curved segment. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxSrgbFromLinearHalf(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.055, -0.055); + return max(min(j.xx, c * j.yy), min(max(j.xx, c * j.yy), pow(c, j.zz) * k.xx + k.yy)); // explicit median of three: clamp() is only dependable when min <= max +} + +/// Compute an SRGB value from a linear value. +/// +/// @param [in] c The value to convert to SRGB from linear. +/// +/// @returns +/// A value in SRGB space: per component, the median of the knee value, the linear segment and the curved segment. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxSrgbFromLinearHalf(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.0031308 * 12.92, 12.92, 1.0 / 2.4); + FfxFloat16x2 k = FfxFloat16x2(1.055, -0.055); + return max(min(j.xxx, c * j.yyy), min(max(j.xxx, c * j.yyy), pow(c, j.zzz) * k.xxx + k.yyy)); // explicit median of three: clamp() is only dependable when min <= max +} + +/// Compute the square root of a value. +/// +/// @param [in] c The value to compute the square root for. +/// +/// @returns +/// A square root of the input value. +/// +/// @ingroup GPUCore +FfxFloat16 ffxSquareRootHalf(FfxFloat16 c) +{ + return sqrt(c); +} + +/// Compute the square root of a value. +/// +/// @param [in] c The value to compute the square root for. +/// +/// @returns +/// A square root of the input value. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxSquareRootHalf(FfxFloat16x2 c) +{ + return sqrt(c); +} + +/// Compute the square root of a value. +/// +/// @param [in] c The value to compute the square root for. +/// +/// @returns +/// A square root of the input value. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxSquareRootHalf(FfxFloat16x3 c) +{ + return sqrt(c); +} + +/// Compute the cube root of a value.
+/// +/// @param [in] c The value to compute the cube root for. +/// +/// @returns +/// A cube root of the input value. +/// +/// @ingroup GPUCore +FfxFloat16 ffxCubeRootHalf(FfxFloat16 c) +{ + return pow(c, FFX_BROADCAST_FLOAT16(1.0 / 3.0)); +} + +/// Compute the cube root of a value. +/// +/// @param [in] c The value to compute the cube root for. +/// +/// @returns +/// A cube root of the input value. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxCubeRootHalf(FfxFloat16x2 c) +{ + return pow(c, FFX_BROADCAST_FLOAT16X2(1.0 / 3.0)); +} + +/// Compute the cube root of a value. +/// +/// @param [in] c The value to compute the cube root for. +/// +/// @returns +/// A cube root of the input value. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxCubeRootHalf(FfxFloat16x3 c) +{ + return pow(c, FFX_BROADCAST_FLOAT16X3(1.0 / 3.0)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] c The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space: c / 4.5 below the encoded knee (0.081), the inverse power curve otherwise. +/// +/// @ingroup GPUCore +FfxFloat16 ffxLinearFromRec709Half(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.081, 1.0 / 4.5, 1.0 / 0.45); // j.x is compared against the encoded input, so it is the encoded knee 4.5 * 0.018 + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.x), c * j.y, pow(c * k.x + k.y, j.z)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] c The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space: per component, c / 4.5 below the encoded knee (0.081), the inverse power curve otherwise. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxLinearFromRec709Half(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.081, 1.0 / 4.5, 1.0 / 0.45); // j.x is compared against the encoded input, so it is the encoded knee 4.5 * 0.018 + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xx), c * j.yy, pow(c * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a REC.709 value. +/// +/// @param [in] c The value to convert to linear from REC.709. +/// +/// @returns +/// A value in linear space.
+/// +/// @ingroup GPUCore +FfxFloat16x3 ffxLinearFromRec709Half(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.081, 1.0 / 4.5, 1.0 / 0.45); // j.x is compared against the encoded input, so it is the encoded knee 4.5 * 0.018 + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.099, 0.099 / 1.099); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xxx), c * j.yyy, pow(c * k.xxx + k.yyy, j.zzz)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in gamma space. +/// @param [in] x The power value used for the gamma curve. +/// +/// @returns +/// A value in linear space (c raised to the power x). +/// +/// @ingroup GPUCore +FfxFloat16 ffxLinearFromGammaHalf(FfxFloat16 c, FfxFloat16 x) +{ + return pow(c, FFX_BROADCAST_FLOAT16(x)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in gamma space. +/// @param [in] x The power value used for the gamma curve; the same scalar is used for every component. +/// +/// @returns +/// A value in linear space (c raised to the power x). +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxLinearFromGammaHalf(FfxFloat16x2 c, FfxFloat16 x) +{ + return pow(c, FFX_BROADCAST_FLOAT16X2(x)); +} + +/// Compute a linear value from a value in a gamma space. +/// +/// Typically 2.2 for some PC displays, or 2.4-2.5 for CRTs, or 2.2 FreeSync2 native. +/// +/// @param [in] c The value to convert to linear in gamma space. +/// @param [in] x The power value used for the gamma curve; the same scalar is used for every component. +/// +/// @returns +/// A value in linear space (c raised to the power x). +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxLinearFromGammaHalf(FfxFloat16x3 c, FfxFloat16 x) +{ + return pow(c, FFX_BROADCAST_FLOAT16X3(x)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Uses the piecewise sRGB curve: a linear segment near black and a 2.4 power segment elsewhere. +/// +/// @param [in] c The value to convert to linear in SRGB space.
+/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat16 ffxLinearFromSrgbHalf(FfxFloat16 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.04045, 1.0 / 12.92, 2.4); // j.x is compared against the encoded input, so it is the encoded knee 12.92 * 0.0031308 + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.x), c * j.y, pow(c * k.x + k.y, j.z)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Uses the piecewise sRGB curve: c / 12.92 below the encoded knee (0.04045) and a 2.4 power segment elsewhere. +/// +/// @param [in] c The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat16x2 ffxLinearFromSrgbHalf(FfxFloat16x2 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.04045, 1.0 / 12.92, 2.4); // j.x is compared against the encoded input, so it is the encoded knee 12.92 * 0.0031308 + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xx), c * j.yy, pow(c * k.xx + k.yy, j.zz)); +} + +/// Compute a linear value from a value in a SRGB space. +/// +/// Uses the piecewise sRGB curve: c / 12.92 below the encoded knee (0.04045) and a 2.4 power segment elsewhere. +/// +/// @param [in] c The value to convert to linear in SRGB space. +/// +/// @returns +/// A value in linear space. +/// +/// @ingroup GPUCore +FfxFloat16x3 ffxLinearFromSrgbHalf(FfxFloat16x3 c) +{ + FfxFloat16x3 j = FfxFloat16x3(0.04045, 1.0 / 12.92, 2.4); // j.x is compared against the encoded input, so it is the encoded knee 12.92 * 0.0031308 + FfxFloat16x2 k = FfxFloat16x2(1.0 / 1.055, 0.055 / 1.055); + return ffxZeroOneSelectHalf(ffxZeroOneIsSignedHalf(c - j.xxx), c * j.yyy, pow(c * k.xxx + k.yyy, j.zzz)); +} + +/// A remapping of 64x1 to 8x8 imposing rotated 2x2 pixel quads in quad linear. +/// +/// Remap illustration: +/// +/// 543210 +/// ~~~~~~ +/// ..xxx. +/// yy...y +/// +/// @param [in] a The input 1D coordinates to remap. +/// +/// @returns +/// The remapped 2D coordinates.
+/// +/// @ingroup GPUCore +FfxUInt16x2 ffxRemapForQuadHalf(FfxUInt32 a) +{ + return FfxUInt16x2(ffxBitfieldExtract(a, 1u, 3u), ffxBitfieldInsertMask(ffxBitfieldExtract(a, 3u, 3u), a, 1u)); +} + +/// A helper function performing the 64x1 to 8x8 remapping which is necessary for 2D wave reductions. +/// +/// The remapping of the 64-wide lane indices to 8x8 is performed as follows: +/// +/// 00 01 08 09 10 11 18 19 +/// 02 03 0a 0b 12 13 1a 1b +/// 04 05 0c 0d 14 15 1c 1d +/// 06 07 0e 0f 16 17 1e 1f +/// 20 21 28 29 30 31 38 39 +/// 22 23 2a 2b 32 33 3a 3b +/// 24 25 2c 2d 34 35 3c 3d +/// 26 27 2e 2f 36 37 3e 3f +/// +/// @param [in] a The input 1D coordinate to remap. +/// +/// @returns +/// The remapped 2D coordinates. +/// +/// @ingroup GPUCore +FfxUInt16x2 ffxRemapForWaveReductionHalf(FfxUInt32 a) +{ + return FfxUInt16x2(ffxBitfieldInsertMask(ffxBitfieldExtract(a, 2u, 3u), a, 1u), ffxBitfieldInsertMask(ffxBitfieldExtract(a, 3u, 3u), ffxBitfieldExtract(a, 1u, 2u), 2u)); +} + +#endif // FFX_HALF diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common_half.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common_half.h.meta new file mode 100644 index 0000000..7baf152 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_gpu_common_half.h.meta @@ -0,0 +1,27 @@ +fileFormatVersion: 2 +guid: 3074a00f29f9e3c448fc5c28b6b90f24 +PluginImporter: + externalObjects: {} + serializedVersion: 2 + iconMap: {} + executionOrder: {} + defineConstraints: [] + isPreloaded: 0 + isOverridable: 1 + isExplicitlyReferenced: 0 + validateReferences: 1 + platformData: + - first: + Any: + second: + enabled: 1 + settings: {} + - first: + Editor: Editor + second: + enabled: 0 + settings: + DefaultValueInitialized: true + userData: + assetBundleName: + assetBundleVariant: diff --git
a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_hlsl.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_hlsl.h new file mode 100644 index 0000000..28827d9 --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_hlsl.h @@ -0,0 +1,1898 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (C) 2024 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +/// @defgroup HLSLCore HLSL Core +/// HLSL core defines and functions +/// +/// @ingroup FfxHLSL + +#define DECLARE_SRV_REGISTER(regIndex) t##regIndex +#define DECLARE_UAV_REGISTER(regIndex) u##regIndex +#define DECLARE_CB_REGISTER(regIndex) b##regIndex +#define FFX_DECLARE_SRV(regIndex) register(DECLARE_SRV_REGISTER(regIndex)) +#define FFX_DECLARE_UAV(regIndex) register(DECLARE_UAV_REGISTER(regIndex)) +#define FFX_DECLARE_CB(regIndex) register(DECLARE_CB_REGISTER(regIndex)) + +/// A define for abstracting select functionality for pre/post HLSL 21. +/// Both forms expand to a single self-contained expression, so the macro can be used inside larger expressions. +/// @ingroup HLSLCore +#if __HLSL_VERSION >= 2021 + +#define FFX_SELECT(cond, arg1, arg2) select(cond, arg1, arg2) + +#else // #if __HLSL_VERSION >= 2021 + +#define FFX_SELECT(cond, arg1, arg2) ((cond) ? (arg1) : (arg2)) + +#endif // #if __HLSL_VERSION >= 2021 + +/// A define for abstracting shared memory between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_GROUPSHARED groupshared + +/// A define for abstracting compute memory barriers between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_GROUP_MEMORY_BARRIER GroupMemoryBarrierWithGroupSync() + +/// A define for abstracting compute atomic additions between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_ATOMIC_ADD(x, y) InterlockedAdd(x, y) + +/// A define for abstracting compute atomic additions that also write the value output by InterlockedAdd into r between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_ATOMIC_ADD_RETURN(x, y, r) InterlockedAdd(x, y, r) + +/// A define for abstracting compute atomic OR between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_ATOMIC_OR(x, y) InterlockedOr(x, y) + +/// A define for abstracting compute atomic min between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_ATOMIC_MIN(x, y) InterlockedMin(x, y) + +/// A define for abstracting compute atomic max between shading languages.
+/// +/// @ingroup HLSLCore +#define FFX_ATOMIC_MAX(x, y) InterlockedMax(x, y) + +/// A define added to accept static markup on functions to aid CPU/GPU portability of code. +/// +/// @ingroup HLSLCore +#define FFX_STATIC static + +/// A define for abstracting loop unrolling between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_UNROLL [unroll] + +/// A define for abstracting a 'greater than' comparison operator between two types. +/// The expansion is fully parenthesized so it keeps its meaning inside larger expressions. +/// @ingroup HLSLCore +#define FFX_GREATER_THAN(x, y) ((x) > (y)) + +/// A define for abstracting a 'greater than or equal' comparison operator between two types. +/// The expansion is fully parenthesized so it keeps its meaning inside larger expressions. +/// @ingroup HLSLCore +#define FFX_GREATER_THAN_EQUAL(x, y) ((x) >= (y)) + +/// A define for abstracting a 'less than' comparison operator between two types. +/// The expansion is fully parenthesized so it keeps its meaning inside larger expressions. +/// @ingroup HLSLCore +#define FFX_LESS_THAN(x, y) ((x) < (y)) + +/// A define for abstracting a 'less than or equal' comparison operator between two types. +/// The expansion is fully parenthesized so it keeps its meaning inside larger expressions. +/// @ingroup HLSLCore +#define FFX_LESS_THAN_EQUAL(x, y) ((x) <= (y)) + +/// A define for abstracting an 'equal' comparison operator between two types. +/// The expansion is fully parenthesized so it keeps its meaning inside larger expressions. +/// @ingroup HLSLCore +#define FFX_EQUAL(x, y) ((x) == (y)) + +/// A define for abstracting a 'not equal' comparison operator between two types. +/// The expansion is fully parenthesized so it keeps its meaning inside larger expressions. +/// @ingroup HLSLCore +#define FFX_NOT_EQUAL(x, y) ((x) != (y)) + +/// A define for abstracting matrix multiply operations between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_MATRIX_MULTIPLY(a, b) mul(a, b) + +/// A define for abstracting vector transformations between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_TRANSFORM_VECTOR(a, b) mul(a, b) + +/// A define for abstracting modulo operations between shading languages. +/// +/// @ingroup HLSLCore +#define FFX_MODULO(a, b) (fmod(a, b)) + +/// Broadcast a scalar value to a 1-dimensional floating point vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_FLOAT32(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 2-dimensional floating point vector.
+/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_FLOAT32X2(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 3-dimensional floating point vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_FLOAT32X3(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 4-dimensional floating point vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_FLOAT32X4(x) FfxFloat32(x) + +/// Broadcast a scalar value to a 1-dimensional unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_UINT32(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 2-dimensional unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_UINT32X2(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 3-dimensional unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_UINT32X3(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 4-dimensional unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_UINT32X4(x) FfxUInt32(x) + +/// Broadcast a scalar value to a 1-dimensional signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_INT32(x) FfxInt32(x) + +/// Broadcast a scalar value to a 2-dimensional signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_INT32X2(x) FfxInt32(x) + +/// Broadcast a scalar value to a 3-dimensional signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_INT32X3(x) FfxInt32(x) + +/// Broadcast a scalar value to a 4-dimensional signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_INT32X4(x) FfxInt32(x) + +/// Broadcast a scalar value to a 1-dimensional half-precision floating point vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_FLOAT16(a) FFX_MIN16_F(a) + +/// Broadcast a scalar value to a 2-dimensional half-precision floating point vector.
+/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_FLOAT16X2(x) FFX_MIN16_F(x) + +/// Broadcast a scalar value to a 3-dimensional half-precision floating point vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_FLOAT16X3(x) FFX_MIN16_F(x) + +/// Broadcast a scalar value to a 4-dimensional half-precision floating point vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_FLOAT16X4(x) FFX_MIN16_F(x) + +/// Broadcast a scalar value to a 1-dimensional half-precision unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_UINT16(x) FFX_MIN16_U(x) + +/// Broadcast a scalar value to a 2-dimensional half-precision unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_UINT16X2(x) FFX_MIN16_U(x) + +/// Broadcast a scalar value to a 3-dimensional half-precision unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_UINT16X3(x) FFX_MIN16_U(x) + +/// Broadcast a scalar value to a 4-dimensional half-precision unsigned integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_UINT16X4(x) FFX_MIN16_U(x) + +/// Broadcast a scalar value to a 1-dimensional half-precision signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_INT16(x) FFX_MIN16_I(x) + +/// Broadcast a scalar value to a 2-dimensional half-precision signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_INT16X2(x) FFX_MIN16_I(x) + +/// Broadcast a scalar value to a 3-dimensional half-precision signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_INT16X3(x) FFX_MIN16_I(x) + +/// Broadcast a scalar value to a 4-dimensional half-precision signed integer vector. +/// +/// @ingroup HLSLCore +#define FFX_BROADCAST_MIN_INT16X4(x) FFX_MIN16_I(x) + +/// Convert FfxFloat32 to half (in lower 16-bits of output).
+/// +/// This function implements the same fast technique that is documented here: ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf +/// +/// The function supports denormals. +/// +/// Some conversion rules are to make computations possibly "safer" on the GPU, +/// -INF & -NaN -> -65504 +/// +INF & +NaN -> +65504 +/// NOTE(review): in this header the name is a plain alias of the HLSL f32tof16 intrinsic - confirm the rules above hold for it. +/// @param [in] f The 32bit floating point value to convert. +/// +/// @returns +/// The closest 16bit floating point value to f. +/// +/// @ingroup HLSLCore +#define ffxF32ToF16 f32tof16 + +/// Pack 2x32-bit floating point values in a single 32bit value. +/// +/// This function first converts each component of value into its nearest 16-bit floating +/// point representation, and then stores the X and Y components in the lower and upper 16 bits of the +/// 32bit unsigned integer respectively. +/// +/// @param [in] value A 2-dimensional floating point value to convert and pack. +/// +/// @returns +/// A packed 32bit value containing 2 16bit floating point values. +/// +/// @ingroup HLSLCore +FfxUInt32 ffxPackHalf2x16(FfxFloat32x2 value) +{ + return ffxF32ToF16(value.x) | (ffxF32ToF16(value.y) << 16); +} + +/// Broadcast a scalar value to a 2-dimensional floating point vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 2-dimensional floating point vector with value in each component. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxBroadcast2(FfxFloat32 value) +{ + return FfxFloat32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional floating point vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 3-dimensional floating point vector with value in each component. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxBroadcast3(FfxFloat32 value) +{ + return FfxFloat32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional floating point vector. +/// +/// @param [in] value The value to broadcast.
+/// +/// @returns +/// A 4-dimensional floating point vector with value in each component. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxBroadcast4(FfxFloat32 value) +{ + return FfxFloat32x4(value, value, value, value); +} + +/// Broadcast a scalar value to a 2-dimensional signed integer vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 2-dimensional signed integer vector with value in each component. +/// +/// @ingroup HLSLCore +FfxInt32x2 ffxBroadcast2(FfxInt32 value) +{ + return FfxInt32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional signed integer vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 3-dimensional signed integer vector with value in each component. +/// +/// @ingroup HLSLCore +FfxInt32x3 ffxBroadcast3(FfxInt32 value) +{ + return FfxInt32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional signed integer vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 4-dimensional signed integer vector with value in each component. +/// +/// @ingroup HLSLCore +FfxInt32x4 ffxBroadcast4(FfxInt32 value) +{ + return FfxInt32x4(value, value, value, value); +} + +/// Broadcast a scalar value to a 2-dimensional unsigned integer vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 2-dimensional unsigned integer vector with value in each component. +/// +/// @ingroup HLSLCore +FfxUInt32x2 ffxBroadcast2(FfxUInt32 value) +{ + return FfxUInt32x2(value, value); +} + +/// Broadcast a scalar value to a 3-dimensional unsigned integer vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 3-dimensional unsigned integer vector with value in each component.
+/// +/// @ingroup HLSLCore +FfxUInt32x3 ffxBroadcast3(FfxUInt32 value) +{ + return FfxUInt32x3(value, value, value); +} + +/// Broadcast a scalar value to a 4-dimensional unsigned integer vector. +/// +/// @param [in] value The value to broadcast. +/// +/// @returns +/// A 4-dimensional unsigned integer vector with value in each component. +/// +/// @ingroup HLSLCore +FfxUInt32x4 ffxBroadcast4(FfxUInt32 value) +{ + return FfxUInt32x4(value, value, value, value); +} +/// Extracts a bit field from src: 'bits' bits starting at bit 'off', returned shifted down to bit 0. +FfxUInt32 ffxBitfieldExtract(FfxUInt32 src, FfxUInt32 off, FfxUInt32 bits) +{ + FfxUInt32 mask = (1u << bits) - 1; // low 'bits' bits set; presumably callers keep bits below 32 - confirm + return (src >> off) & mask; +} +/// Merges two values bit by bit: takes the bits of ins where mask is set and the bits of src everywhere else. +FfxUInt32 ffxBitfieldInsert(FfxUInt32 src, FfxUInt32 ins, FfxUInt32 mask) +{ + return (ins & mask) | (src & (~mask)); +} +/// Same merge as ffxBitfieldInsert, with the mask built from a bit count: the low 'bits' bits come from ins, the rest from src. +FfxUInt32 ffxBitfieldInsertMask(FfxUInt32 src, FfxUInt32 ins, FfxUInt32 bits) +{ + FfxUInt32 mask = (1u << bits) - 1; // low 'bits' bits set; presumably callers keep bits below 32 - confirm + return (ins & mask) | (src & (~mask)); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSLCore +FfxUInt32 ffxAsUInt32(FfxFloat32 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSLCore +FfxUInt32x2 ffxAsUInt32(FfxFloat32x2 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer. +/// +/// @ingroup HLSLCore +FfxUInt32x3 ffxAsUInt32(FfxFloat32x3 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as an unsigned integer. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as an unsigned integer.
+/// +/// @ingroup HLSLCore +FfxUInt32x4 ffxAsUInt32(FfxFloat32x4 x) +{ + return asuint(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxAsFloat(FfxUInt32 x) +{ + return asfloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxAsFloat(FfxUInt32x2 x) +{ + return asfloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxAsFloat(FfxUInt32x3 x) +{ + return asfloat(x); +} + +/// Interprets the bit pattern of x as a floating-point number. +/// +/// @param [in] x The input value. +/// +/// @returns +/// The input interpreted as a floating-point number. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxAsFloat(FfxUInt32x4 x) +{ + return asfloat(x); +} + +/// Compute the inverse of a value. +/// +/// @param [in] x The value to calculate the inverse of. +/// +/// @returns +/// The inverse of x. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxReciprocal(FfxFloat32 x) +{ + return rcp(x); +} + +/// Compute the inverse of a value. +/// +/// @param [in] x The value to calculate the inverse of. +/// +/// @returns +/// The inverse of x. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxReciprocal(FfxFloat32x2 x) +{ + return rcp(x); +} + +/// Compute the inverse of a value. +/// +/// @param [in] x The value to calculate the inverse of. +/// +/// @returns +/// The inverse of x. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxReciprocal(FfxFloat32x3 x) +{ + return rcp(x); +} + +/// Compute the inverse of a value.
+/// +/// @param [in] x The value to calculate the inverse of. +/// +/// @returns +/// The inverse of x. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxReciprocal(FfxFloat32x4 x) +{ + return rcp(x); +} + +/// Compute the inverse square root of a value. +/// +/// @param [in] x The value to calculate the inverse square root of. +/// +/// @returns +/// The inverse square root of x. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxRsqrt(FfxFloat32 x) +{ + return rsqrt(x); +} + +/// Compute the inverse square root of a value. +/// +/// @param [in] x The value to calculate the inverse square root of. +/// +/// @returns +/// The inverse square root of x. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxRsqrt(FfxFloat32x2 x) +{ + return rsqrt(x); +} + +/// Compute the inverse square root of a value. +/// +/// @param [in] x The value to calculate the inverse square root of. +/// +/// @returns +/// The inverse square root of x. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxRsqrt(FfxFloat32x3 x) +{ + return rsqrt(x); +} + +/// Compute the inverse square root of a value. +/// +/// @param [in] x The value to calculate the inverse square root of. +/// +/// @returns +/// The inverse square root of x. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxRsqrt(FfxFloat32x4 x) +{ + return rsqrt(x); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxLerp(FfxFloat32 x, FfxFloat32 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function.
Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y; the same scalar is used for every component. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxLerp(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxLerp(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y; the same scalar is used for every component. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxLerp(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between.
+/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxLerp(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxLerp(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32 t) +{ + return lerp(x, y, t); +} + +/// Compute the linear interpolation between two values. +/// +/// Implemented by calling the HLSL lerp intrinsic function. Implements the +/// following math: +/// +/// (1 - t) * x + t * y +/// +/// @param [in] x The first value to lerp between. +/// @param [in] y The second value to lerp between. +/// @param [in] t The value to determine how much of x and how much of y. +/// +/// @returns +/// A linearly interpolated value between x and y according to t. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxLerp(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 t) +{ + return lerp(x, y, t); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxSaturate(FfxFloat32 x) +{ + return saturate(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. 
+/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxSaturate(FfxFloat32x2 x) +{ + return saturate(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxSaturate(FfxFloat32x3 x) +{ + return saturate(x); +} + +/// Clamp a value to a [0..1] range. +/// +/// @param [in] x The value to clamp to [0..1] range. +/// +/// @returns +/// The clamped version of x. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxSaturate(FfxFloat32x4 x) +{ + return saturate(x); +} + +/// Compute the fractional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_FRACT_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxFract(FfxFloat32 x) +{ + return x - floor(x); +} + +/// Compute the fractional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_FRACT_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxFract(FfxFloat32x2 x) +{ + return x - floor(x); +} + +/// Compute the fractional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. 
+/// +/// NOTE: This function should compile down to a single V_FRACT_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxFract(FfxFloat32x3 x) +{ + return x - floor(x); +} + +/// Compute the fractional part of a decimal value. +/// +/// This function calculates x - floor(x). Where floor is the intrinsic HLSL function. +/// +/// NOTE: This function should compile down to a single V_FRACT_F32 operation on GCN/RDNA hardware. It is +/// worth further noting that this function is intentionally distinct from the HLSL frac intrinsic +/// function. +/// +/// @param [in] x The value to compute the fractional part from. +/// +/// @returns +/// The fractional part of x. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxFract(FfxFloat32x4 x) +{ + return x - floor(x); +} + +/// Rounds to the nearest integer. In case the fractional part is 0.5, it will round to the nearest even integer. +/// +/// @param [in] x The value to be rounded. +/// +/// @returns +/// The nearest integer from x. The nearest even integer from x if equidistant from 2 integer. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxRound(FfxFloat32 x) +{ + return round(x); +} + +/// Rounds to the nearest integer. In case the fractional part is 0.5, it will round to the nearest even integer. +/// +/// @param [in] x The value to be rounded. +/// +/// @returns +/// The nearest integer from x. The nearest even integer from x if equidistant from 2 integer. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxRound(FfxFloat32x2 x) +{ + return round(x); +} + +/// Rounds to the nearest integer. In case the fractional part is 0.5, it will round to the nearest even integer. +/// +/// @param [in] x The value to be rounded. +/// +/// @returns +/// The nearest integer from x. 
The nearest even integer from x if equidistant from 2 integer. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxRound(FfxFloat32x3 x) +{ + return round(x); +} + +/// Rounds to the nearest integer. In case the fractional part is 0.5, it will round to the nearest even integer. +/// +/// @param [in] x The value to be rounded. +/// +/// @returns +/// The nearest integer from x. The nearest even integer from x if equidistant from 2 integer. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxRound(FfxFloat32x4 x) +{ + return round(x); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxMax3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxMax3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. 
+/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxMax3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxMax3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_U32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32 ffxMax3(FfxUInt32 x, FfxUInt32 y, FfxUInt32 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_U32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32x2 ffxMax3(FfxUInt32x2 x, FfxUInt32x2 y, FfxUInt32x2 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_U32 operation on GCN/RDNA hardware. 
+/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32x3 ffxMax3(FfxUInt32x3 x, FfxUInt32x3 y, FfxUInt32x3 z) +{ + return max(x, max(y, z)); +} + +/// Compute the maximum of three values. +/// +/// NOTE: This function should compile down to a single V_MAX3_U32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the max calculation. +/// @param [in] y The second value to include in the max calculation. +/// @param [in] z The third value to include in the max calculation. +/// +/// @returns +/// The maximum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32x4 ffxMax3(FfxUInt32x4 x, FfxUInt32x4 y, FfxUInt32x4 z) +{ + return max(x, max(y, z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxMed3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. 
+/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxMed3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxMed3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxMed3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32 ffxMed3(FfxInt32 x, FfxInt32 y, FfxInt32 z) +{ + return max(min(x, y), min(max(x, y), z)); + // return min(max(min(y, z), x), max(y, z)); + // return max(max(x, y), z) == x ? max(y, z) : (max(max(x, y), z) == y ? 
max(x, z) : max(x, y)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32x2 ffxMed3(FfxInt32x2 x, FfxInt32x2 y, FfxInt32x2 z) +{ + return max(min(x, y), min(max(x, y), z)); + // return min(max(min(y, z), x), max(y, z)); + // return max(max(x, y), z) == x ? max(y, z) : (max(max(x, y), z) == y ? max(x, z) : max(x, y)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32x3 ffxMed3(FfxInt32x3 x, FfxInt32x3 y, FfxInt32x3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the median of three values. +/// +/// NOTE: This function should compile down to a single V_MED3_I32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the median calculation. +/// @param [in] y The second value to include in the median calculation. +/// @param [in] z The third value to include in the median calculation. +/// +/// @returns +/// The median value of x, y, and z. +/// +/// @ingroup HLSL +FfxInt32x4 ffxMed3(FfxInt32x4 x, FfxInt32x4 y, FfxInt32x4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} + +/// Compute the minimum of three values. 
+/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32 ffxMin3(FfxFloat32 x, FfxFloat32 y, FfxFloat32 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x2 ffxMin3(FfxFloat32x2 x, FfxFloat32x2 y, FfxFloat32x2 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x3 ffxMin3(FfxFloat32x3 x, FfxFloat32x3 y, FfxFloat32x3 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_F32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. 
+/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxFloat32x4 ffxMin3(FfxFloat32x4 x, FfxFloat32x4 y, FfxFloat32x4 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_U32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32 ffxMin3(FfxUInt32 x, FfxUInt32 y, FfxUInt32 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_U32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32x2 ffxMin3(FfxUInt32x2 x, FfxUInt32x2 y, FfxUInt32x2 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_U32 operation on GCN/RDNA hardware. +/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32x3 ffxMin3(FfxUInt32x3 x, FfxUInt32x3 y, FfxUInt32x3 z) +{ + return min(x, min(y, z)); +} + +/// Compute the minimum of three values. +/// +/// NOTE: This function should compile down to a single V_MIN3_U32 operation on GCN/RDNA hardware. 
+/// +/// @param [in] x The first value to include in the min calculation. +/// @param [in] y The second value to include in the min calculation. +/// @param [in] z The third value to include in the min calculation. +/// +/// @returns +/// The minimum value of x, y, and z. +/// +/// @ingroup HLSLCore +FfxUInt32x4 ffxMin3(FfxUInt32x4 x, FfxUInt32x4 y, FfxUInt32x4 z) +{ + return min(x, min(y, z)); +} + + +FfxUInt32 ffxAShrSU1(FfxUInt32 a, FfxUInt32 b) +{ + return FfxUInt32(FfxInt32(a) >> FfxInt32(b)); +} + +FfxUInt32 ffxPackF32(FfxFloat32x2 v){ + FfxUInt32x2 p = FfxUInt32x2(ffxF32ToF16(FfxFloat32x2(v).x), ffxF32ToF16(FfxFloat32x2(v).y)); + return p.x | (p.y << 16); +} + +FfxFloat32x2 ffxUnpackF32(FfxUInt32 a){ + return f16tof32(FfxUInt32x2(a & 0xFFFF, a >> 16)); +} + +FfxUInt32x2 ffxPackF32x2(FfxFloat32x4 v){ + return FfxUInt32x2(ffxPackF32(v.xy), ffxPackF32(v.zw)); +} + +FfxFloat32x4 ffxUnpackF32x2(FfxUInt32x2 a){ + return FfxFloat32x4(ffxUnpackF32(a.x), ffxUnpackF32(a.y)); +} + +//============================================================================================================================== +// HLSL HALF +//============================================================================================================================== +//============================================================================================================================== +// Need to use manual unpack to get optimal execution (don't use packed types in buffers directly). 
+// Unpack requires this pattern: https://gpuopen.com/first-steps-implementing-fp16/ +FFX_MIN16_F2 ffxUint32ToFloat16x2(FfxUInt32 x) +{ + FfxFloat32x2 t = f16tof32(FfxUInt32x2(x & 0xFFFF, x >> 16)); + return FFX_MIN16_F2(t); +} +FFX_MIN16_F4 ffxUint32x2ToFloat16x4(FfxUInt32x2 x) +{ + return FFX_MIN16_F4(ffxUint32ToFloat16x2(x.x), ffxUint32ToFloat16x2(x.y)); +} +FFX_MIN16_U2 ffxUint32ToUint16x2(FfxUInt32 x) +{ + FfxUInt32x2 t = FfxUInt32x2(x & 0xFFFF, x >> 16); + return FFX_MIN16_U2(t); +} +FFX_MIN16_U4 ffxUint32x2ToUint16x4(FfxUInt32x2 x) +{ + return FFX_MIN16_U4(ffxUint32ToUint16x2(x.x), ffxUint32ToUint16x2(x.y)); +} + +FfxUInt32x2 ffxFloat16x4ToUint32x2(FFX_MIN16_F4 v) +{ + FfxUInt32x2 result; + result.x = ffxF32ToF16(v.x) | (ffxF32ToF16(v.y) << 16); + result.y = ffxF32ToF16(v.z) | (ffxF32ToF16(v.w) << 16); + return result; +} + +/// @brief Inverts the value while avoiding division by zero. If the value is zero, zero is returned. +/// @param v Value to invert. +/// @return If v = 0 returns 0. If v != 0 returns 1/v. +FfxFloat32 ffxInvertSafe(FfxFloat32 v){ + FfxFloat32 s = FfxFloat32(sign(v)); + FfxFloat32 s2 = s*s; + return s2/(v + (s2 - 1.0)); // (s2 - 1.0) is exactly 0 or -1; adding it as one term keeps small |v| from being rounded against 1.0 +} + +/// @brief Inverts the value while avoiding division by zero. If the value is zero, zero is returned. +/// @param v Value to invert. +/// @return If v = 0 returns 0. If v != 0 returns 1/v. +FfxFloat32x2 ffxInvertSafe(FfxFloat32x2 v){ + FfxFloat32x2 s = FfxFloat32x2(sign(v)); + FfxFloat32x2 s2 = s*s; + return s2/(v + (s2 - FfxFloat32x2(1.0, 1.0))); // per component: exact 0 or -1 offset, so v itself is never rounded against 1.0 +} + +/// @brief Inverts the value while avoiding division by zero. If the value is zero, zero is returned. +/// @param v Value to invert. +/// @return If v = 0 returns 0. If v != 0 returns 1/v. +FfxFloat32x3 ffxInvertSafe(FfxFloat32x3 v){ + FfxFloat32x3 s = FfxFloat32x3(sign(v)); + FfxFloat32x3 s2 = s*s; + return s2/(v + (s2 - FfxFloat32x3(1.0, 1.0, 1.0))); // per component: exact 0 or -1 offset, so v itself is never rounded against 1.0 +} + +/// @brief Inverts the value while avoiding division by zero. If the value is zero, zero is returned. 
+/// @param v Value to invert. +/// @return If v = 0 returns 0. If v != 0 returns 1/v. +FfxFloat32x4 ffxInvertSafe(FfxFloat32x4 v){ + FfxFloat32x4 s = FfxFloat32x4(sign(v)); + FfxFloat32x4 s2 = s*s; + return s2/(v + (s2 - FfxFloat32x4(1.0, 1.0, 1.0, 1.0))); // per component: exact 0 or -1 offset, so v itself is never rounded against 1.0 +} + +#define FFX_UINT32_TO_FLOAT16X2(x) ffxUint32ToFloat16x2(FfxUInt32(x)) +#if FFX_HALF + +#define FFX_UINT32X2_TO_FLOAT16X4(x) ffxUint32x2ToFloat16x4(FfxUInt32x2(x)) +#define FFX_UINT32_TO_UINT16X2(x) ffxUint32ToUint16x2(FfxUInt32(x)) +#define FFX_UINT32X2_TO_UINT16X4(x) ffxUint32x2ToUint16x4(FfxUInt32x2(x)) + +FfxUInt32 ffxPackF16(FfxFloat16x2 v){ + FfxUInt32x2 p = FfxUInt32x2(ffxF32ToF16(FfxFloat32x2(v).x), ffxF32ToF16(FfxFloat32x2(v).y)); + return p.x | (p.y << 16); +} + +FfxFloat16x2 ffxUnpackF16(FfxUInt32 a){ + return FfxFloat16x2(f16tof32(FfxUInt32x2(a & 0xFFFF, a >> 16))); +} + +//------------------------------------------------------------------------------------------------------------------------------ +FfxUInt32 FFX_MIN16_F2ToUint32(FFX_MIN16_F2 x) +{ + return ffxF32ToF16(x.x) + (ffxF32ToF16(x.y) << 16); +} +FfxUInt32x2 FFX_MIN16_F4ToUint32x2(FFX_MIN16_F4 x) +{ + return FfxUInt32x2(FFX_MIN16_F2ToUint32(x.xy), FFX_MIN16_F2ToUint32(x.zw)); +} +FfxUInt32 FFX_MIN16_U2ToUint32(FFX_MIN16_U2 x) +{ + return FfxUInt32(x.x) + (FfxUInt32(x.y) << 16); +} +FfxUInt32x2 FFX_MIN16_U4ToUint32x2(FFX_MIN16_U4 x) +{ + return FfxUInt32x2(FFX_MIN16_U2ToUint32(x.xy), FFX_MIN16_U2ToUint32(x.zw)); +} +#define FFX_FLOAT16X2_TO_UINT32(x) FFX_MIN16_F2ToUint32(FFX_MIN16_F2(x)) +#define FFX_FLOAT16X4_TO_UINT32X2(x) FFX_MIN16_F4ToUint32x2(FFX_MIN16_F4(x)) +#define FFX_UINT16X2_TO_UINT32(x) FFX_MIN16_U2ToUint32(FFX_MIN16_U2(x)) +#define FFX_UINT16X4_TO_UINT32X2(x) FFX_MIN16_U4ToUint32x2(FFX_MIN16_U4(x)) + +#if (FFX_HLSL_SM >= 62) && !defined(FFX_NO_16_BIT_CAST) +#define FFX_TO_UINT16(x) asuint16(x) +#define FFX_TO_UINT16X2(x) asuint16(x) +#define FFX_TO_UINT16X3(x) asuint16(x) +#define FFX_TO_UINT16X4(x) asuint16(x) +#else +#define 
FFX_TO_UINT16(a) FFX_MIN16_U(ffxF32ToF16(FfxFloat32(a))) +#define FFX_TO_UINT16X2(a) FFX_MIN16_U2(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y)) +#define FFX_TO_UINT16X3(a) FFX_MIN16_U3(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z)) +#define FFX_TO_UINT16X4(a) FFX_MIN16_U4(FFX_TO_UINT16((a).x), FFX_TO_UINT16((a).y), FFX_TO_UINT16((a).z), FFX_TO_UINT16((a).w)) +#endif // #if (FFX_HLSL_SM>=62) && !defined(FFX_NO_16_BIT_CAST) + +#if (FFX_HLSL_SM >= 62) && !defined(FFX_NO_16_BIT_CAST) +#define FFX_TO_FLOAT16(x) asfloat16(x) +#define FFX_TO_FLOAT16X2(x) asfloat16(x) +#define FFX_TO_FLOAT16X3(x) asfloat16(x) +#define FFX_TO_FLOAT16X4(x) asfloat16(x) +#else +#define FFX_TO_FLOAT16(a) FFX_MIN16_F(f16tof32(FfxUInt32(a))) +#define FFX_TO_FLOAT16X2(a) FFX_MIN16_F2(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y)) +#define FFX_TO_FLOAT16X3(a) FFX_MIN16_F3(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z)) +#define FFX_TO_FLOAT16X4(a) FFX_MIN16_F4(FFX_TO_FLOAT16((a).x), FFX_TO_FLOAT16((a).y), FFX_TO_FLOAT16((a).z), FFX_TO_FLOAT16((a).w)) +#endif // #if (FFX_HLSL_SM>=62) && !defined(FFX_NO_16_BIT_CAST) + +//============================================================================================================================== +#define FFX_BROADCAST_FLOAT16(a) FFX_MIN16_F(a) +#define FFX_BROADCAST_FLOAT16X2(a) FFX_MIN16_F(a) +#define FFX_BROADCAST_FLOAT16X3(a) FFX_MIN16_F(a) +#define FFX_BROADCAST_FLOAT16X4(a) FFX_MIN16_F(a) + +//------------------------------------------------------------------------------------------------------------------------------ +#define FFX_BROADCAST_INT16(a) FFX_MIN16_I(a) +#define FFX_BROADCAST_INT16X2(a) FFX_MIN16_I(a) +#define FFX_BROADCAST_INT16X3(a) FFX_MIN16_I(a) +#define FFX_BROADCAST_INT16X4(a) FFX_MIN16_I(a) + +//------------------------------------------------------------------------------------------------------------------------------ +#define FFX_BROADCAST_UINT16(a) FFX_MIN16_U(a) +#define 
FFX_BROADCAST_UINT16X2(a) FFX_MIN16_U(a) +#define FFX_BROADCAST_UINT16X3(a) FFX_MIN16_U(a) +#define FFX_BROADCAST_UINT16X4(a) FFX_MIN16_U(a) + +//============================================================================================================================== +FFX_MIN16_U ffxAbsHalf(FFX_MIN16_U a) +{ + return FFX_MIN16_U(abs(FFX_MIN16_I(a))); +} +FFX_MIN16_U2 ffxAbsHalf(FFX_MIN16_U2 a) +{ + return FFX_MIN16_U2(abs(FFX_MIN16_I2(a))); +} +FFX_MIN16_U3 ffxAbsHalf(FFX_MIN16_U3 a) +{ + return FFX_MIN16_U3(abs(FFX_MIN16_I3(a))); +} +FFX_MIN16_U4 ffxAbsHalf(FFX_MIN16_U4 a) +{ + return FFX_MIN16_U4(abs(FFX_MIN16_I4(a))); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxClampHalf(FFX_MIN16_F x, FFX_MIN16_F n, FFX_MIN16_F m) +{ + return max(n, min(x, m)); +} +FFX_MIN16_F2 ffxClampHalf(FFX_MIN16_F2 x, FFX_MIN16_F2 n, FFX_MIN16_F2 m) +{ + return max(n, min(x, m)); +} +FFX_MIN16_F3 ffxClampHalf(FFX_MIN16_F3 x, FFX_MIN16_F3 n, FFX_MIN16_F3 m) +{ + return max(n, min(x, m)); +} +FFX_MIN16_F4 ffxClampHalf(FFX_MIN16_F4 x, FFX_MIN16_F4 n, FFX_MIN16_F4 m) +{ + return max(n, min(x, m)); +} +//------------------------------------------------------------------------------------------------------------------------------ +// V_FRACT_F16 (note DX frac() is different). 
+FFX_MIN16_F ffxFract(FFX_MIN16_F x) +{ + return x - floor(x); +} +FFX_MIN16_F2 ffxFract(FFX_MIN16_F2 x) +{ + return x - floor(x); +} +FFX_MIN16_F3 ffxFract(FFX_MIN16_F3 x) +{ + return x - floor(x); +} +FFX_MIN16_F4 ffxFract(FFX_MIN16_F4 x) +{ + return x - floor(x); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxLerp(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F2 ffxLerp(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F3 ffxLerp(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F a) +{ + return lerp(x, y, a); +} +FFX_MIN16_F4 ffxLerp(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 a) +{ + return lerp(x, y, a); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxMax3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) +{ + return max(x, max(y, z)); +} +FFX_MIN16_F2 ffxMax3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) +{ + return max(x, max(y, z)); +} +FFX_MIN16_F3 ffxMax3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) +{ + return max(x, max(y, z)); +} +FFX_MIN16_F4 ffxMax3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) +{ + return max(x, max(y, z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxMin3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) +{ + return min(x, min(y, z)); +} +FFX_MIN16_F2 ffxMin3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) +{ + return min(x, min(y, z)); +} 
+FFX_MIN16_F3 ffxMin3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) +{ + return min(x, min(y, z)); +} +FFX_MIN16_F4 ffxMin3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) +{ + return min(x, min(y, z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxMed3Half(FFX_MIN16_F x, FFX_MIN16_F y, FFX_MIN16_F z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_F2 ffxMed3Half(FFX_MIN16_F2 x, FFX_MIN16_F2 y, FFX_MIN16_F2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_F3 ffxMed3Half(FFX_MIN16_F3 x, FFX_MIN16_F3 y, FFX_MIN16_F3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_F4 ffxMed3Half(FFX_MIN16_F4 x, FFX_MIN16_F4 y, FFX_MIN16_F4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_I ffxMed3Half(FFX_MIN16_I x, FFX_MIN16_I y, FFX_MIN16_I z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_I2 ffxMed3Half(FFX_MIN16_I2 x, FFX_MIN16_I2 y, FFX_MIN16_I2 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_I3 ffxMed3Half(FFX_MIN16_I3 x, FFX_MIN16_I3 y, FFX_MIN16_I3 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +FFX_MIN16_I4 ffxMed3Half(FFX_MIN16_I4 x, FFX_MIN16_I4 y, FFX_MIN16_I4 z) +{ + return max(min(x, y), min(max(x, y), z)); +} +//------------------------------------------------------------------------------------------------------------------------------ +FFX_MIN16_F ffxReciprocalHalf(FFX_MIN16_F x) +{ + return rcp(x); +} +FFX_MIN16_F2 ffxReciprocalHalf(FFX_MIN16_F2 x) +{ + return rcp(x); +} +FFX_MIN16_F3 ffxReciprocalHalf(FFX_MIN16_F3 x) +{ + return rcp(x); +} +FFX_MIN16_F4 ffxReciprocalHalf(FFX_MIN16_F4 x) +{ + return rcp(x); +} 
+//------------------------------------------------------------------------------------------------------------------------------
+/// Forwards to the rsqrt() intrinsic.
+FFX_MIN16_F ffxReciprocalSquareRootHalf(FFX_MIN16_F x)
+{
+    FFX_MIN16_F inverseRoot = rsqrt(x);
+    return inverseRoot;
+}
+FFX_MIN16_F2 ffxReciprocalSquareRootHalf(FFX_MIN16_F2 x)
+{
+    FFX_MIN16_F2 inverseRoot = rsqrt(x);
+    return inverseRoot;
+}
+FFX_MIN16_F3 ffxReciprocalSquareRootHalf(FFX_MIN16_F3 x)
+{
+    FFX_MIN16_F3 inverseRoot = rsqrt(x);
+    return inverseRoot;
+}
+FFX_MIN16_F4 ffxReciprocalSquareRootHalf(FFX_MIN16_F4 x)
+{
+    FFX_MIN16_F4 inverseRoot = rsqrt(x);
+    return inverseRoot;
+}
+//------------------------------------------------------------------------------------------------------------------------------
+/// Forwards to the saturate() intrinsic.
+FFX_MIN16_F ffxSaturate(FFX_MIN16_F x)
+{
+    FFX_MIN16_F clamped = saturate(x);
+    return clamped;
+}
+FFX_MIN16_F2 ffxSaturate(FFX_MIN16_F2 x)
+{
+    FFX_MIN16_F2 clamped = saturate(x);
+    return clamped;
+}
+FFX_MIN16_F3 ffxSaturate(FFX_MIN16_F3 x)
+{
+    FFX_MIN16_F3 clamped = saturate(x);
+    return clamped;
+}
+FFX_MIN16_F4 ffxSaturate(FFX_MIN16_F4 x)
+{
+    FFX_MIN16_F4 clamped = saturate(x);
+    return clamped;
+}
+//------------------------------------------------------------------------------------------------------------------------------
+/// Shifts `a` right by `b`. Both operands are converted to the signed
+/// FFX_MIN16_I* type before the shift and the result is converted back.
+// NOTE(review): because the shift runs on signed values it is presumably an
+// arithmetic shift for inputs with the top bit set - confirm this is intended.
+FFX_MIN16_U ffxBitShiftRightHalf(FFX_MIN16_U a, FFX_MIN16_U b)
+{
+    FFX_MIN16_I value  = FFX_MIN16_I(a);
+    FFX_MIN16_I amount = FFX_MIN16_I(b);
+    return FFX_MIN16_U(value >> amount);
+}
+FFX_MIN16_U2 ffxBitShiftRightHalf(FFX_MIN16_U2 a, FFX_MIN16_U2 b)
+{
+    FFX_MIN16_I2 value  = FFX_MIN16_I2(a);
+    FFX_MIN16_I2 amount = FFX_MIN16_I2(b);
+    return FFX_MIN16_U2(value >> amount);
+}
+FFX_MIN16_U3 ffxBitShiftRightHalf(FFX_MIN16_U3 a, FFX_MIN16_U3 b)
+{
+    FFX_MIN16_I3 value  = FFX_MIN16_I3(a);
+    FFX_MIN16_I3 amount = FFX_MIN16_I3(b);
+    return FFX_MIN16_U3(value >> amount);
+}
+FFX_MIN16_U4 ffxBitShiftRightHalf(FFX_MIN16_U4 a, FFX_MIN16_U4 b)
+{
+    FFX_MIN16_I4 value  = FFX_MIN16_I4(a);
+    FFX_MIN16_I4 amount = FFX_MIN16_I4(b);
+    return FFX_MIN16_U4(value >> amount);
+}
+#endif // FFX_HALF
+
+//==============================================================================================================================
+// HLSL WAVE
+//==============================================================================================================================
+#if defined(FFX_WAVE)
+// Where 'x' must be a compile time literal.
+/// ffxWaveXor*: read `v` from the lane whose index is this lane's index XOR `x`.
+FfxFloat32 ffxWaveXorF1(FfxFloat32 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+FfxFloat32x2 ffxWaveXorF2(FfxFloat32x2 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+FfxFloat32x3 ffxWaveXorF3(FfxFloat32x3 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+FfxFloat32x4 ffxWaveXorF4(FfxFloat32x4 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+// The unsigned variants are all overloads of the single name ffxWaveXorU1,
+// distinguished only by the vector width of `v`.
+FfxUInt32 ffxWaveXorU1(FfxUInt32 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+FfxUInt32x2 ffxWaveXorU1(FfxUInt32x2 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+FfxUInt32x3 ffxWaveXorU1(FfxUInt32x3 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+FfxUInt32x4 ffxWaveXorU1(FfxUInt32x4 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return WaveReadLaneAt(v, partnerLane);
+}
+// The remaining helpers each forward to the wave/quad intrinsic named in
+// their body without further processing.
+FfxBoolean ffxWaveIsFirstLane()
+{
+    return WaveIsFirstLane();
+}
+FfxUInt32 ffxWaveLaneIndex()
+{
+    return WaveGetLaneIndex();
+}
+FfxBoolean ffxWaveReadAtLaneIndexB1(FfxBoolean v, FfxUInt32 x)
+{
+    return WaveReadLaneAt(v, x);
+}
+FfxUInt32 ffxWavePrefixCountBits(FfxBoolean v)
+{
+    return WavePrefixCountBits(v);
+}
+FfxUInt32 ffxWaveActiveCountBits(FfxBoolean v)
+{
+    return WaveActiveCountBits(v);
+}
+FfxUInt32 ffxWaveReadLaneFirstU1(FfxUInt32 v)
+{
+    return WaveReadLaneFirst(v);
+}
+FfxUInt32x2 ffxWaveReadLaneFirstU2(FfxUInt32x2 v)
+{
+    return WaveReadLaneFirst(v);
+}
+FfxBoolean ffxWaveReadLaneFirstB1(FfxBoolean v)
+{
+    return WaveReadLaneFirst(v);
+}
+FfxUInt32 ffxWaveOr(FfxUInt32 a)
+{
+    return WaveActiveBitOr(a);
+}
+FfxUInt32 ffxWaveMin(FfxUInt32 a)
+{
+    return WaveActiveMin(a);
+}
+FfxFloat32 ffxWaveMin(FfxFloat32 a)
+{
+    return WaveActiveMin(a);
+}
+FfxUInt32 ffxWaveMax(FfxUInt32 a)
+{
+    return WaveActiveMax(a);
+}
+FfxFloat32 ffxWaveMax(FfxFloat32 a)
+{
+    return WaveActiveMax(a);
+}
+FfxUInt32 ffxWaveSum(FfxUInt32 a)
+{
+    return WaveActiveSum(a);
+}
+FfxFloat32 ffxWaveSum(FfxFloat32 a)
+{
+    return WaveActiveSum(a);
+}
+FfxUInt32 ffxWaveLaneCount()
+{
+    return WaveGetLaneCount();
+}
+FfxBoolean ffxWaveAllTrue(FfxBoolean v)
+{
+    return WaveActiveAllTrue(v);
+}
+FfxFloat32 ffxQuadReadX(FfxFloat32 v)
+{
+    return QuadReadAcrossX(v);
+}
+FfxFloat32x2 ffxQuadReadX(FfxFloat32x2 v)
+{
+    return QuadReadAcrossX(v);
+}
+FfxFloat32 ffxQuadReadY(FfxFloat32 v)
+{
+    return QuadReadAcrossY(v);
+}
+FfxFloat32x2 ffxQuadReadY(FfxFloat32x2 v)
+{
+    return QuadReadAcrossY(v);
+}
+
+#if FFX_HALF
+// 16-bit variants: the value goes through the FFX_*_TO_UINT32* conversion
+// macro, is exchanged with the XOR-partner lane, and is converted back.
+FfxFloat16x2 ffxWaveXorFloat16x2(FfxFloat16x2 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return FFX_UINT32_TO_FLOAT16X2(WaveReadLaneAt(FFX_FLOAT16X2_TO_UINT32(v), partnerLane));
+}
+FfxFloat16x4 ffxWaveXorFloat16x4(FfxFloat16x4 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return FFX_UINT32X2_TO_FLOAT16X4(WaveReadLaneAt(FFX_FLOAT16X4_TO_UINT32X2(v), partnerLane));
+}
+FfxUInt16x2 ffxWaveXorUint16x2(FfxUInt16x2 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return FFX_UINT32_TO_UINT16X2(WaveReadLaneAt(FFX_UINT16X2_TO_UINT32(v), partnerLane));
+}
+FfxUInt16x4 ffxWaveXorUint16x4(FfxUInt16x4 v, FfxUInt32 x)
+{
+    FfxUInt32 partnerLane = WaveGetLaneIndex() ^ x;
+    return FFX_UINT32X2_TO_UINT16X4(WaveReadLaneAt(FFX_UINT16X4_TO_UINT32X2(v), partnerLane));
+}
+#endif // FFX_HALF
+#endif // #if defined(FFX_WAVE)
diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_hlsl.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_hlsl.h.meta
new file mode 100644
index 0000000..356b2a1
--- /dev/null
+++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_hlsl.h.meta
@@ -0,0 +1,27 @@
+fileFormatVersion: 2
+guid: c98f28ee15ed8494c89aaedd91dd8d3d
+PluginImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  iconMap: {}
+  executionOrder: {}
+  defineConstraints: []
+  isPreloaded: 0
+  isOverridable: 1
+  isExplicitlyReferenced: 0
+  validateReferences: 1
+  platformData:
+  - first:
+      Any:
+    second:
+      enabled: 1
+      settings: {}
+  - first:
+      Editor: Editor
+    second:
+      enabled: 0
+      settings:
+        DefaultValueInitialized: true
+  userData:
+  assetBundleName:
+  assetBundleVariant:
diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_portability.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_portability.h
new file mode 100644
index 0000000..12147b9
--- /dev/null
+++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_portability.h
@@ -0,0 +1,46 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/// Writes a + b to d, with the scalar b broadcast to three components.
+void ffxOpAAddOneF3(FFX_PARAMETER_OUT FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)
+{
+    FfxFloat32x3 splat = ffxBroadcast3(b);
+    d = a + splat;
+}
+
+/// Copies a to d.
+void ffxOpACpyF3(FFX_PARAMETER_OUT FfxFloat32x3 d, FfxFloat32x3 a)
+{
+    d = a;
+}
+
+/// Writes the component-wise product a * b to d.
+void ffxOpAMulF3(FFX_PARAMETER_OUT FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32x3 b)
+{
+    d = a * b;
+}
+
+/// Writes a * b to d, with the scalar b broadcast to three components
+/// (spelled out the same way as ffxOpAAddOneF3).
+void ffxOpAMulOneF3(FFX_PARAMETER_OUT FfxFloat32x3 d, FfxFloat32x3 a, FfxFloat32 b)
+{
+    FfxFloat32x3 splat = ffxBroadcast3(b);
+    d = a * splat;
+}
+
+/// Writes ffxReciprocal(a) to d.
+void ffxOpARcpF3(FFX_PARAMETER_OUT FfxFloat32x3 d, FfxFloat32x3 a)
+{
+    d = ffxReciprocal(a);
+}
diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_portability.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_portability.h.meta
new file mode 100644
index 0000000..302a17d
--- /dev/null
+++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_core_portability.h.meta
@@ -0,0 +1,27 @@
+fileFormatVersion: 2
+guid: 9943ca8c436885d448cc133e640fd4b2
+PluginImporter:
+  externalObjects: {}
+  serializedVersion: 2
+  iconMap: {}
+  executionOrder: {}
+  defineConstraints: []
+  isPreloaded: 0
+  isOverridable: 1
+  isExplicitlyReferenced: 0
+  validateReferences: 1
+  platformData:
+  - first:
+      Any:
+    second:
+      enabled: 1
+      settings: {}
+  - first:
+      Editor: Editor
+    second:
+      enabled: 0
+      settings:
+        DefaultValueInitialized: true
+  userData:
+  assetBundleName:
+  assetBundleVariant:
diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd.h
new file mode 100644
index 0000000..c3ee50f
--- /dev/null
+++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd.h
@@ -0,0 +1,1014 @@
+// This file is part of the FidelityFX SDK.
+//
+// Copyright (C) 2024 Advanced Micro Devices, Inc.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files(the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions :
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+/// @defgroup FfxGPUSpd FidelityFX SPD
+/// FidelityFX Single Pass Downsampler 2.0 GPU documentation
+///
+/// @ingroup FfxGPUEffects
+
+/// Setup required constant values for SPD (CPU).
+///
+/// The rectangle is covered with 64x64 tiles; one thread group is dispatched
+/// per tile. A rectangle whose width or height is 0 yields a dispatch count
+/// of (0, 0) and zero work groups; the mip count is then `mips` if it is
+/// non-negative and 0 otherwise.
+///
+/// @param [out] dispatchThreadGroupCountXY CPU side: dispatch thread group count xy. z is number of slices of the input texture
+/// @param [out] workGroupOffset GPU side: pass in as constant
+/// @param [out] numWorkGroupsAndMips GPU side: pass in as constant
+/// @param [in] rectInfo left, top, width, height
+/// @param [in] mips optional: if -1, calculate based on rect width and height
+///
+/// @ingroup FfxGPUSpd
+#if defined(FFX_CPU)
+FFX_STATIC void ffxSpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY,
+                            FfxUInt32x2 workGroupOffset,
+                            FfxUInt32x2 numWorkGroupsAndMips,
+                            FfxUInt32x4 rectInfo,
+                            FfxInt32 mips)
+{
+    // determines the offset of the first tile to downsample based on
+    // left (rectInfo[0]) and top (rectInfo[1]) of the subregion.
+    workGroupOffset[0] = rectInfo[0] / 64;
+    workGroupOffset[1] = rectInfo[1] / 64;
+
+    // An empty rectangle has no tiles to downsample. Without this guard the
+    // unsigned "left + width - 1" below wraps around when left is 0, which
+    // produces an enormous dispatch count, and log2(0) would be fed into an
+    // unsigned conversion when the mip count is derived from the rect size.
+    if (rectInfo[2] == 0 || rectInfo[3] == 0)
+    {
+        dispatchThreadGroupCountXY[0] = 0;
+        dispatchThreadGroupCountXY[1] = 0;
+        numWorkGroupsAndMips[0] = 0;
+        numWorkGroupsAndMips[1] = (mips >= 0) ? FfxUInt32(mips) : 0;
+        return;
+    }
+
+    FfxUInt32 endIndexX = (rectInfo[0] + rectInfo[2] - 1) / 64;  // rectInfo[0] = left, rectInfo[2] = width
+    FfxUInt32 endIndexY = (rectInfo[1] + rectInfo[3] - 1) / 64;  // rectInfo[1] = top, rectInfo[3] = height
+
+    // we only need to dispatch as many thread groups as tiles we need to downsample
+    // number of tiles per slice depends on the subregion to downsample
+    dispatchThreadGroupCountXY[0] = endIndexX + 1 - workGroupOffset[0];
+    dispatchThreadGroupCountXY[1] = endIndexY + 1 - workGroupOffset[1];
+
+    // number of thread groups per slice
+    numWorkGroupsAndMips[0] = (dispatchThreadGroupCountXY[0]) * (dispatchThreadGroupCountXY[1]);
+
+    if (mips >= 0)
+    {
+        numWorkGroupsAndMips[1] = FfxUInt32(mips);
+    }
+    else
+    {
+        // calculate based on rect width and height: floor(log2(longest side)), capped at 12
+        FfxUInt32 resolution = ffxMax(rectInfo[2], rectInfo[3]);
+        numWorkGroupsAndMips[1] = FfxUInt32((ffxMin(floor(log2(FfxFloat32(resolution))), FfxFloat32(12))));
+    }
+}
+
+/// Setup required constant values for SPD (CPU).
+///
+/// @param [out] dispatchThreadGroupCountXY CPU side: dispatch thread group count xy. 
z is number of slices of the input texture
+/// @param [out] workGroupOffset GPU side: pass in as constant
+/// @param [out] numWorkGroupsAndMips GPU side: pass in as constant
+/// @param [in] rectInfo left, top, width, height
+///
+/// @ingroup FfxGPUSpd
+FFX_STATIC void ffxSpdSetup(FfxUInt32x2 dispatchThreadGroupCountXY,
+                            FfxUInt32x2 workGroupOffset,
+                            FfxUInt32x2 numWorkGroupsAndMips,
+                            FfxUInt32x4 rectInfo)
+{
+    // Forward to the five-parameter version; -1 asks it to derive the mip
+    // count from the rect width and height.
+    ffxSpdSetup(dispatchThreadGroupCountXY, workGroupOffset, numWorkGroupsAndMips, rectInfo, -1);
+}
+#endif // #if defined(FFX_CPU)
+
+
+//==============================================================================================================================
+// NON-PACKED VERSION
+//==============================================================================================================================
+#if defined(FFX_GPU)
+#if defined(FFX_SPD_PACKED_ONLY)
+// Avoid compiler errors by including default implementations of these callbacks.
+// Each stub below either returns an all-zero FfxFloat32x4 or has an empty body.
+FfxFloat32x4 SpdLoadSourceImage(FfxInt32x2 p, FfxUInt32 slice)
+{
+    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+
+FfxFloat32x4 SpdLoad(FfxInt32x2 p, FfxUInt32 slice)
+{
+    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+void SpdStore(FfxInt32x2 p, FfxFloat32x4 value, FfxUInt32 mip, FfxUInt32 slice)
+{
+}
+FfxFloat32x4 SpdLoadIntermediate(FfxUInt32 x, FfxUInt32 y)
+{
+    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+void SpdStoreIntermediate(FfxUInt32 x, FfxUInt32 y, FfxFloat32x4 value)
+{
+}
+FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3)
+{
+    return FfxFloat32x4(0.0, 0.0, 0.0, 0.0);
+}
+#endif // #if FFX_SPD_PACKED_ONLY
+
+//_____________________________________________________________/\_______________________________________________________________
+#if defined(FFX_GLSL) && !defined(FFX_SPD_NO_WAVE_OPERATIONS)
+#extension GL_KHR_shader_subgroup_quad:require
+#endif
+
+// Barrier used between the downsampling stages below; it expands to the
+// FFX_GROUP_MEMORY_BARRIER macro, which is defined outside this section.
+void ffxSpdWorkgroupShuffleBarrier()
+{
+    FFX_GROUP_MEMORY_BARRIER;
+}
+
+// Only last active 
workgroup should proceed
+// Returns true when the calling work group should exit, i.e. when the value
+// read back from SpdGetAtomicCounter() differs from numWorkGroups - 1.
+// SpdIncreaseAtomicCounter / SpdGetAtomicCounter are callbacks defined
+// outside this section.
+bool SpdExitWorkgroup(FfxUInt32 numWorkGroups, FfxUInt32 localInvocationIndex, FfxUInt32 slice)
+{
+    // global atomic counter
+    // only the first thread of the group bumps the counter for this slice
+    if (localInvocationIndex == 0)
+    {
+        SpdIncreaseAtomicCounter(slice);
+    }
+
+    // barrier before every thread reads the counter value back
+    ffxSpdWorkgroupShuffleBarrier();
+    return (SpdGetAtomicCounter() != (numWorkGroups - 1));
+}
+
+// User defined: FfxFloat32x4 SpdReduce4(FfxFloat32x4 v0, FfxFloat32x4 v1, FfxFloat32x4 v2, FfxFloat32x4 v3);
+// Combines this thread's value with the values of its horizontal, vertical and
+// diagonal quad neighbours through SpdReduce4. When neither the GLSL nor the
+// HLSL wave path is compiled in, the input is returned unchanged.
+FfxFloat32x4 SpdReduceQuad(FfxFloat32x4 v)
+{
+#if defined(FFX_GLSL) && !defined(FFX_SPD_NO_WAVE_OPERATIONS)
+
+    FfxFloat32x4 v0 = v;
+    FfxFloat32x4 v1 = subgroupQuadSwapHorizontal(v);
+    FfxFloat32x4 v2 = subgroupQuadSwapVertical(v);
+    FfxFloat32x4 v3 = subgroupQuadSwapDiagonal(v);
+    return SpdReduce4(v0, v1, v2, v3);
+
+#elif defined(FFX_HLSL) && !defined(FFX_SPD_NO_WAVE_OPERATIONS)
+
+    // requires SM6.0
+    FfxFloat32x4 v0 = v;
+    FfxFloat32x4 v1 = QuadReadAcrossX(v);
+    FfxFloat32x4 v2 = QuadReadAcrossY(v);
+    FfxFloat32x4 v3 = QuadReadAcrossDiagonal(v);
+    return SpdReduce4(v0, v1, v2, v3);
+/*
+    // if SM6.0 is not available, you can use the AMD shader intrinsics
+    // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
+    // https://gpuopen.com/amd-gpu-services-ags-library/
+    // works for DX11
+    FfxFloat32x4 v0 = v;
+    FfxFloat32x4 v1;
+    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    FfxFloat32x4 v2;
+    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    FfxFloat32x4 v3;
+    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    return SpdReduce4(v0, v1, v2, v3);
+    */
+#endif
+    // fallback when no wave path was compiled in above
+    return v;
+}
+
+// Loads four values from intermediate storage at the given (x, y) positions
+// and reduces them with SpdReduce4.
+FfxFloat32x4 SpdReduceIntermediate(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
+{
+    FfxFloat32x4 v0 = SpdLoadIntermediate(i0.x, i0.y);
+    FfxFloat32x4 v1 = SpdLoadIntermediate(i1.x, i1.y);
+    FfxFloat32x4 v2 = SpdLoadIntermediate(i2.x, i2.y);
+    FfxFloat32x4 v3 = SpdLoadIntermediate(i3.x, i3.y);
+    return SpdReduce4(v0, v1, v2, v3);
+}
+
+// Loads four values through the SpdLoad callback at the given positions of
+// `slice` and reduces them with SpdReduce4.
+FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
+{
+    FfxFloat32x4 v0 = SpdLoad(FfxInt32x2(i0), slice);
+    FfxFloat32x4 v1 = SpdLoad(FfxInt32x2(i1), slice);
+    FfxFloat32x4 v2 = SpdLoad(FfxInt32x2(i2), slice);
+    FfxFloat32x4 v3 = SpdLoad(FfxInt32x2(i3), slice);
+    return SpdReduce4(v0, v1, v2, v3);
+}
+
+// Convenience overload: reduces the 2x2 block whose top-left position is `base`.
+FfxFloat32x4 SpdReduceLoad4(FfxUInt32x2 base, FfxUInt32 slice)
+{
+    return SpdReduceLoad4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
+}
+
+// Same as the five-argument SpdReduceLoad4, but reads through the
+// SpdLoadSourceImage callback instead of SpdLoad.
+FfxFloat32x4 SpdReduceLoadSourceImage4(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice)
+{
+    FfxFloat32x4 v0 = SpdLoadSourceImage(FfxInt32x2(i0), slice);
+    FfxFloat32x4 v1 = SpdLoadSourceImage(FfxInt32x2(i1), slice);
+    FfxFloat32x4 v2 = SpdLoadSourceImage(FfxInt32x2(i2), slice);
+    FfxFloat32x4 v3 = SpdLoadSourceImage(FfxInt32x2(i3), slice);
+    return SpdReduce4(v0, v1, v2, v3);
+}
+
+FfxFloat32x4 SpdReduceLoadSourceImage(FfxUInt32x2 base, 
FfxUInt32 slice)
+{
+    // With SPD_LINEAR_SAMPLER a single SpdLoadSourceImage call at `base` is
+    // used; otherwise the 2x2 block starting at `base` is loaded and reduced.
+#if defined(SPD_LINEAR_SAMPLER)
+    return SpdLoadSourceImage(FfxInt32x2(base), slice);
+#else
+    return SpdReduceLoadSourceImage4(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice);
+#endif
+}
+
+// Quad-intrinsic path for the first two stages. Each thread produces four
+// values from the source image (offsets 0 / +32 in x and y inside the work
+// group's 64-wide region) and stores them with mip index 0; if `mip` is
+// greater than 1 the values are quad-reduced and stored with mip index 1 and
+// into intermediate storage.
+// NOTE(review): `mip` is compared against a count here; the callers visible
+// in this file pass the total number of mips - confirm the naming.
+void SpdDownsampleMips_0_1_Intrinsics(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+    FfxFloat32x4 v[4];
+
+    // tex: position in the source image, pix: position in the first output
+    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
+    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
+    v[0] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[0], 0, slice);
+
+    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
+    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
+    v[1] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[1], 0, slice);
+
+    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
+    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
+    v[2] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[2], 0, slice);
+
+    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
+    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
+    v[3] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[3], 0, slice);
+
+    // nothing beyond the first output was requested
+    if (mip <= 1)
+        return;
+
+    v[0] = SpdReduceQuad(v[0]);
+    v[1] = SpdReduceQuad(v[1]);
+    v[2] = SpdReduceQuad(v[2]);
+    v[3] = SpdReduceQuad(v[3]);
+
+    // one thread out of every four consecutive invocation indices writes the
+    // reduced values, both to mip index 1 and to intermediate storage
+    if ((localInvocationIndex % 4) == 0)
+    {
+        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice);
+        SpdStoreIntermediate(x / 2, y / 2, v[0]);
+
+        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice);
+        SpdStoreIntermediate(x / 2 + 8, y / 2, v[1]);
+
+        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice);
+        SpdStoreIntermediate(x / 2, y / 2 + 8, v[2]);
+
+        SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice);
+        SpdStoreIntermediate(x / 2 + 8, y / 2 + 8, v[3]);
+    }
+}
+
+// Variant of the first two stages that exchanges data through intermediate
+// storage and barriers instead of quad intrinsics. The loads and mip-index-0
+// stores match the intrinsic variant; the second reduction is done by the
+// threads with localInvocationIndex < 64, one source quadrant per loop pass.
+void SpdDownsampleMips_0_1_LDS(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+    FfxFloat32x4 v[4];
+
+    FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2);
+    FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y);
+    v[0] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[0], 0, slice);
+
+    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2);
+    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y);
+    v[1] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[1], 0, slice);
+
+    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32);
+    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16);
+    v[2] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[2], 0, slice);
+
+    tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32);
+    pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16);
+    v[3] = SpdReduceLoadSourceImage(tex, slice);
+    SpdStore(pix, v[3], 0, slice);
+
+    if (mip <= 1)
+        return;
+
+    for (FfxUInt32 i = 0; i < 4; i++)
+    {
+        // publish this quadrant's values, then let the first 64 threads
+        // reduce 2x2 neighbourhoods of them
+        SpdStoreIntermediate(x, y, v[i]);
+        ffxSpdWorkgroupShuffleBarrier();
+        if (localInvocationIndex < 64)
+        {
+            v[i] = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
+            SpdStore(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice);
+        }
+        // barrier before intermediate storage is overwritten by the next pass
+        ffxSpdWorkgroupShuffleBarrier();
+    }
+
+    // leave the four reduced quadrants in intermediate storage for the next stage
+    if (localInvocationIndex < 64)
+    {
+        SpdStoreIntermediate(x + 0, y + 0, v[0]);
+        SpdStoreIntermediate(x + 8, y + 0, v[1]);
+        SpdStoreIntermediate(x + 0, y + 8, v[2]);
+        SpdStoreIntermediate(x + 8, y + 8, v[3]);
+    }
+}
+
+void 
SpdDownsampleMips_0_1(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+    // Dispatch to the intermediate-storage variant when wave operations are
+    // disabled, otherwise to the quad-intrinsic variant.
+#if defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    SpdDownsampleMips_0_1_LDS(x, y, workGroupID, localInvocationIndex, mip, slice);
+#else
+    SpdDownsampleMips_0_1_Intrinsics(x, y, workGroupID, localInvocationIndex, mip, slice);
+#endif
+}
+
+
+// First stage run by SpdDownsampleNextFour: reduces values held in
+// intermediate storage, stores the result with mip index `mip`, and writes it
+// back to intermediate storage at the positions the following stage reads.
+void SpdDownsampleMip_2(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#if defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    if (localInvocationIndex < 64)
+    {
+        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1));
+        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice);
+        // store to LDS, try to reduce bank conflicts
+        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x
+        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+        // ...
+        // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0
+        SpdStoreIntermediate(x * 2 + y % 2, y * 2, v);
+    }
+#else
+    FfxFloat32x4 v = SpdLoadIntermediate(x, y);
+    v = SpdReduceQuad(v);
+    // quad index 0 stores result
+    if (localInvocationIndex % 4 == 0)
+    {
+        SpdStore(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+        SpdStoreIntermediate(x + (y / 2) % 2, y, v);
+    }
+#endif
+}
+
+// Second stage run by SpdDownsampleNextFour. The positions it reads from
+// intermediate storage correspond to the positions SpdDownsampleMip_2 wrote.
+void SpdDownsampleMip_3(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#if defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    if (localInvocationIndex < 16)
+    {
+        // x 0 x 0
+        // 0 0 0 0
+        // 0 x 0 x
+        // 0 0 0 0
+        FfxFloat32x4 v =
+            SpdReduceIntermediate(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2));
+        SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice);
+        // store to LDS
+        // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0
+        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0
+        // ...
+        // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0
+        // ...
+        // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x
+        // ...
+        SpdStoreIntermediate(x * 4 + y, y * 4, v);
+    }
+#else
+    if (localInvocationIndex < 64)
+    {
+        FfxFloat32x4 v = SpdLoadIntermediate(x * 2 + y % 2, y * 2);
+        v = SpdReduceQuad(v);
+        // quad index 0 stores result
+        if (localInvocationIndex % 4 == 0)
+        {
+            SpdStore(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+            SpdStoreIntermediate(x * 2 + y / 2, y * 2, v);
+        }
+    }
+#endif
+}
+
+// Third stage run by SpdDownsampleNextFour. Its results are written to
+// row 0 of intermediate storage for SpdDownsampleMip_5 to read.
+void SpdDownsampleMip_4(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#if defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    if (localInvocationIndex < 4)
+    {
+        // x 0 0 0 x 0 0 0
+        // ...
+        // 0 x 0 0 0 x 0 0
+        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0),
+                                               FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0),
+                                               FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4),
+                                               FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4));
+        SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice);
+        // store to LDS
+        // x x x x 0 ...
+        // 0 ...
+        SpdStoreIntermediate(x + y * 2, 0, v);
+    }
+#else
+    if (localInvocationIndex < 16)
+    {
+        FfxFloat32x4 v = SpdLoadIntermediate(x * 4 + y, y * 4);
+        v = SpdReduceQuad(v);
+        // quad index 0 stores result
+        if (localInvocationIndex % 4 == 0)
+        {
+            SpdStore(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice);
+            SpdStoreIntermediate(x / 2 + y, 0, v);
+        }
+    }
+#endif
+}
+
+// Fourth stage run by SpdDownsampleNextFour: reduces the four values at
+// (0..3, 0) of intermediate storage to a single value stored at workGroupID.
+void SpdDownsampleMip_5(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice)
+{
+#if defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    if (localInvocationIndex < 1)
+    {
+        // x x x x 0 ...
+        // 0 ...
+        FfxFloat32x4 v = SpdReduceIntermediate(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0));
+        SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
+    }
+#else
+    if (localInvocationIndex < 4)
+    {
+        FfxFloat32x4 v = SpdLoadIntermediate(localInvocationIndex, 0);
+        v = SpdReduceQuad(v);
+        // quad index 0 stores result
+        if (localInvocationIndex % 4 == 0)
+        {
+            SpdStore(FfxInt32x2(workGroupID.xy), v, mip, slice);
+        }
+    }
+#endif
+}
+
+// Each thread reduces four 2x2 blocks read through SpdLoad (via
+// SpdReduceLoad4) and stores the results with mip index 6; if `mips` is
+// greater than 7 it also reduces those four results into one value stored
+// with mip index 7 and into intermediate storage.
+void SpdDownsampleMips_6_7(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice)
+{
+    FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0);
+    FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0);
+    FfxFloat32x4 v0 = SpdReduceLoad4(tex, slice);
+    SpdStore(pix, v0, 6, slice);
+
+    tex = FfxInt32x2(x * 4 + 2, y * 4 + 0);
+    pix = FfxInt32x2(x * 2 + 1, y * 2 + 0);
+    FfxFloat32x4 v1 = SpdReduceLoad4(tex, slice);
+    SpdStore(pix, v1, 6, slice);
+
+    tex = FfxInt32x2(x * 4 + 0, y * 4 + 2);
+    pix = FfxInt32x2(x * 2 + 0, y * 2 + 1);
+    FfxFloat32x4 v2 = SpdReduceLoad4(tex, slice);
+    SpdStore(pix, v2, 6, slice);
+
+    tex = FfxInt32x2(x * 4 + 2, y * 4 + 2);
+    pix = FfxInt32x2(x * 2 + 1, y * 2 + 1);
+    FfxFloat32x4 v3 = SpdReduceLoad4(tex, slice);
+    SpdStore(pix, v3, 6, slice);
+
+    if (mips <= 7)
+        return;
+    // no barrier needed, working on values only from the same thread
+
+    FfxFloat32x4 v = SpdReduce4(v0, v1, v2, v3);
+    SpdStore(FfxInt32x2(x, y), v, 7, slice);
+    SpdStoreIntermediate(x, y, v);
+}
+
+// Runs up to four consecutive reduction stages, storing with mip indices
+// baseMip .. baseMip + 3. Each stage is preceded by a barrier and is skipped,
+// together with every later stage, once `mips` does not exceed its index.
+void SpdDownsampleNextFour(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice)
+{
+    if (mips <= baseMip)
+        return;
+    ffxSpdWorkgroupShuffleBarrier();
+    SpdDownsampleMip_2(x, y, workGroupID, localInvocationIndex, baseMip, slice);
+
+    if (mips <= baseMip + 1)
+        return;
+    ffxSpdWorkgroupShuffleBarrier();
+    SpdDownsampleMip_3(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice);
+
+    if (mips <= baseMip + 2)
+        return;
+    ffxSpdWorkgroupShuffleBarrier();
+    SpdDownsampleMip_4(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice);
+
+    if (mips <= baseMip + 3)
+        return;
+    ffxSpdWorkgroupShuffleBarrier();
+    SpdDownsampleMip_5(workGroupID, localInvocationIndex, baseMip + 3, slice);
+}
+
+/// Downsamples a 64x64 tile based on the work group id.
+/// If after downsampling it's the last active thread group, computes the remaining MIP levels.
+///
+/// @param [in] workGroupID index of the work group / thread group
+/// @param [in] localInvocationIndex index of the thread within the thread group in 1D
+/// @param [in] mips the number of total MIP levels to compute for the input texture
+/// @param [in] numWorkGroups the total number of dispatched work groups / thread groups for this slice
+/// @param [in] slice the slice of the input texture
+///
+/// @ingroup FfxGPUSpd
+void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice)
+{
+    // compute MIP level 0 and 1
+    // The low 6 bits of the invocation index are remapped to a 2D position by
+    // ffxRemapForWaveReduction (defined outside this section); bit 6 adds 8
+    // to x and the bits from 7 upwards add multiples of 8 to y.
+    // NOTE(review): this presumably assumes 256 threads per group - confirm.
+    FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64);
+    FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2);
+    FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7));
+    SpdDownsampleMips_0_1(x, y, workGroupID, localInvocationIndex, mips, slice);
+
+    // compute MIP level 2, 3, 4, 5
+    SpdDownsampleNextFour(x, y, workGroupID, localInvocationIndex, 2, mips, slice);
+
+    // everything requested has been produced by this group alone
+    if (mips <= 6)
+        return;
+
+    // increase the global atomic counter for the given slice and check if it's the last remaining thread group:
+    // terminate if not, continue if yes.
+    if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice))
+        return;
+
+    // reset the global atomic counter back to 0 for the next spd dispatch
+    SpdResetAtomicCounter(slice);
+
+    // After mip 5 there is only a single workgroup left that downsamples the remaining up to 64x64 texels.
+    // compute MIP level 6 and 7
+    SpdDownsampleMips_6_7(x, y, mips, slice);
+
+    // compute MIP level 8, 9, 10, 11
+    // the work group id is fixed to (0, 0) because only one group is left
+    SpdDownsampleNextFour(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice);
+}
+/// Downsamples a 64x64 tile based on the work group id and work group offset.
+/// If after downsampling it's the last active thread group, computes the remaining MIP levels.
+///
+/// @param [in] workGroupID index of the work group / thread group
+/// @param [in] localInvocationIndex index of the thread within the thread group in 1D
+/// @param [in] mips the number of total MIP levels to compute for the input texture
+/// @param [in] numWorkGroups the total number of dispatched work groups / thread groups for this slice
+/// @param [in] slice the slice of the input texture
+/// @param [in] workGroupOffset the work group offset. it's (0,0) in case the entire input texture is downsampled.
+///
+/// @ingroup FfxGPUSpd
+void SpdDownsample(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset)
+{
+    // shift the group id by the offset and reuse the five-parameter version
+    SpdDownsample(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice);
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+//==============================================================================================================================
+// PACKED VERSION
+//==============================================================================================================================
+
+#if FFX_HALF
+
+#if defined(FFX_GLSL)
+#extension GL_EXT_shader_subgroup_extended_types_float16:require
+#endif
+
+// FfxFloat16x4 counterpart of SpdReduceQuad: combines this thread's value with
+// its horizontal, vertical and diagonal quad neighbours through SpdReduce4H.
+// NOTE(review): when no wave path is compiled in, this returns all zeros,
+// whereas SpdReduceQuad returns its input unchanged - confirm the difference
+// is intended.
+FfxFloat16x4 SpdReduceQuadH(FfxFloat16x4 v)
+{
+#if defined(FFX_GLSL) && !defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    FfxFloat16x4 v0 = v;
+    FfxFloat16x4 v1 = subgroupQuadSwapHorizontal(v);
+    FfxFloat16x4 v2 = subgroupQuadSwapVertical(v);
+    FfxFloat16x4 v3 = subgroupQuadSwapDiagonal(v);
+    return SpdReduce4H(v0, v1, v2, v3);
+#elif defined(FFX_HLSL) && !defined(FFX_SPD_NO_WAVE_OPERATIONS)
+    // requires SM6.0
+    FfxFloat16x4 v0 = v;
+    FfxFloat16x4 v1 = QuadReadAcrossX(v);
+    FfxFloat16x4 v2 = QuadReadAcrossY(v);
+    FfxFloat16x4 v3 = QuadReadAcrossDiagonal(v);
+    return SpdReduce4H(v0, v1, v2, v3);
+/*
+    // if SM6.0 is not available, you can use the AMD shader intrinsics
+    // the AMD shader intrinsics are available in AMD GPU Services (AGS) library:
+    // https://gpuopen.com/amd-gpu-services-ags-library/
+    // works for DX11
+    FfxFloat16x4 v0 = v;
+    FfxFloat16x4 v1;
+    v1.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    v1.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    v1.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    v1.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX1);
+    FfxFloat16x4 v2;
+    v2.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    v2.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    v2.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    v2.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_SwapX2);
+    FfxFloat16x4 v3;
+    v3.x = AmdExtD3DShaderIntrinsics_SwizzleF(v.x, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    v3.y = AmdExtD3DShaderIntrinsics_SwizzleF(v.y, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    v3.z = AmdExtD3DShaderIntrinsics_SwizzleF(v.z, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    v3.w = AmdExtD3DShaderIntrinsics_SwizzleF(v.w, AmdExtD3DShaderIntrinsicsSwizzle_ReverseX4);
+    return SpdReduce4H(v0, v1, v2, v3);
+    */
+#endif
+    // fallback when no wave path was compiled in above
+    return FfxFloat16x4(0.0, 0.0, 0.0, 0.0);
+}
+
+// Loads four FfxFloat16x4 values from intermediate storage at the given
+// (x, y) positions and reduces them with SpdReduce4H.
+FfxFloat16x4 SpdReduceIntermediateH(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3)
+{
+    FfxFloat16x4 v0 = SpdLoadIntermediateH(i0.x, i0.y);
+    FfxFloat16x4 v1 = SpdLoadIntermediateH(i1.x, i1.y);
+    FfxFloat16x4 v2 = SpdLoadIntermediateH(i2.x, i2.y);
+    FfxFloat16x4 v3 = SpdLoadIntermediateH(i3.x, i3.y);
+    return SpdReduce4H(v0, v1, v2, v3);
+}
+
+FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 
i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice) +{ + FfxFloat16x4 v0 = SpdLoadH(FfxInt32x2(i0), slice); + FfxFloat16x4 v1 = SpdLoadH(FfxInt32x2(i1), slice); + FfxFloat16x4 v2 = SpdLoadH(FfxInt32x2(i2), slice); + FfxFloat16x4 v3 = SpdLoadH(FfxInt32x2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +FfxFloat16x4 SpdReduceLoad4H(FfxUInt32x2 base, FfxUInt32 slice) +{ + return SpdReduceLoad4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice); +} + +FfxFloat16x4 SpdReduceLoadSourceImage4H(FfxUInt32x2 i0, FfxUInt32x2 i1, FfxUInt32x2 i2, FfxUInt32x2 i3, FfxUInt32 slice) +{ + FfxFloat16x4 v0 = SpdLoadSourceImageH(FfxInt32x2(i0), slice); + FfxFloat16x4 v1 = SpdLoadSourceImageH(FfxInt32x2(i1), slice); + FfxFloat16x4 v2 = SpdLoadSourceImageH(FfxInt32x2(i2), slice); + FfxFloat16x4 v3 = SpdLoadSourceImageH(FfxInt32x2(i3), slice); + return SpdReduce4H(v0, v1, v2, v3); +} + +FfxFloat16x4 SpdReduceLoadSourceImageH(FfxUInt32x2 base, FfxUInt32 slice) +{ +#if defined(SPD_LINEAR_SAMPLER) + return SpdLoadSourceImageH(FfxInt32x2(base), slice); +#else + return SpdReduceLoadSourceImage4H(FfxUInt32x2(base + FfxUInt32x2(0, 0)), FfxUInt32x2(base + FfxUInt32x2(0, 1)), FfxUInt32x2(base + FfxUInt32x2(1, 0)), FfxUInt32x2(base + FfxUInt32x2(1, 1)), slice); +#endif +} + +void SpdDownsampleMips_0_1_IntrinsicsH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxFloat16x4 v[4]; + + FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2); + FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, 
slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + v[0] = SpdReduceQuadH(v[0]); + v[1] = SpdReduceQuadH(v[1]); + v[2] = SpdReduceQuadH(v[2]); + v[3] = SpdReduceQuadH(v[3]); + + if ((localInvocationIndex % 4) == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2), v[0], 1, slice); + SpdStoreIntermediateH(x / 2, y / 2, v[0]); + + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2), v[1], 1, slice); + SpdStoreIntermediateH(x / 2 + 8, y / 2, v[1]); + + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2, y / 2 + 8), v[2], 1, slice); + SpdStoreIntermediateH(x / 2, y / 2 + 8, v[2]); + + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x / 2 + 8, y / 2 + 8), v[3], 1, slice); + SpdStoreIntermediateH(x / 2 + 8, y / 2 + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1_LDSH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxFloat16x4 v[4]; + + FfxInt32x2 tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2); + FfxInt32x2 pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y); + v[0] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[0], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y); + v[1] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[1], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2, y * 2 + 32); + pix = 
FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x, y + 16); + v[2] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[2], 0, slice); + + tex = FfxInt32x2(workGroupID.xy * 64) + FfxInt32x2(x * 2 + 32, y * 2 + 32); + pix = FfxInt32x2(workGroupID.xy * 32) + FfxInt32x2(x + 16, y + 16); + v[3] = SpdReduceLoadSourceImageH(tex, slice); + SpdStoreH(pix, v[3], 0, slice); + + if (mips <= 1) + return; + + for (FfxInt32 i = 0; i < 4; i++) + { + SpdStoreIntermediateH(x, y, v[i]); + ffxSpdWorkgroupShuffleBarrier(); + if (localInvocationIndex < 64) + { + v[i] = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 16) + FfxInt32x2(x + (i % 2) * 8, y + (i / 2) * 8), v[i], 1, slice); + } + ffxSpdWorkgroupShuffleBarrier(); + } + + if (localInvocationIndex < 64) + { + SpdStoreIntermediateH(x + 0, y + 0, v[0]); + SpdStoreIntermediateH(x + 8, y + 0, v[1]); + SpdStoreIntermediateH(x + 0, y + 8, v[2]); + SpdStoreIntermediateH(x + 8, y + 8, v[3]); + } +} + +void SpdDownsampleMips_0_1H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 slice) +{ +#if defined(FFX_SPD_NO_WAVE_OPERATIONS) + SpdDownsampleMips_0_1_LDSH(x, y, workGroupID, localInvocationIndex, mips, slice); +#else + SpdDownsampleMips_0_1_IntrinsicsH(x, y, workGroupID, localInvocationIndex, mips, slice); +#endif +} + + +void SpdDownsampleMip_2H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#if defined(FFX_SPD_NO_WAVE_OPERATIONS) + if (localInvocationIndex < 64) + { + FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 2 + 0, y * 2 + 0), FfxUInt32x2(x * 2 + 1, y * 2 + 0), FfxUInt32x2(x * 2 + 0, y * 2 + 1), FfxUInt32x2(x * 2 + 1, y * 2 + 1)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS, try to 
reduce bank conflicts + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 x + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + // ... + // x 0 x 0 x 0 x 0 x 0 x 0 x 0 x 0 + SpdStoreIntermediateH(x * 2 + y % 2, y * 2, v); + } +#else + FfxFloat16x4 v = SpdLoadIntermediateH(x, y); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 8) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediateH(x + (y / 2) % 2, y, v); + } +#endif +} + +void SpdDownsampleMip_3H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#if defined(FFX_SPD_NO_WAVE_OPERATIONS) + if (localInvocationIndex < 16) + { + // x 0 x 0 + // 0 0 0 0 + // 0 x 0 x + // 0 0 0 0 + FfxFloat16x4 v = + SpdReduceIntermediateH(FfxUInt32x2(x * 4 + 0 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 2 + 0, y * 4 + 0), FfxUInt32x2(x * 4 + 0 + 1, y * 4 + 2), FfxUInt32x2(x * 4 + 2 + 1, y * 4 + 2)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS + // x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + // 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 0 + // ... + // 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x 0 + // ... + // 0 0 0 x 0 0 0 x 0 0 0 x 0 0 0 x + // ... 
+ SpdStoreIntermediateH(x * 4 + y, y * 4, v); + } +#else + if (localInvocationIndex < 64) + { + FfxFloat16x4 v = SpdLoadIntermediateH(x * 2 + y % 2, y * 2); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 4) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediateH(x * 2 + y / 2, y * 2, v); + } + } +#endif +} + +void SpdDownsampleMip_4H(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#if defined(FFX_SPD_NO_WAVE_OPERATIONS) + if (localInvocationIndex < 4) + { + // x 0 0 0 x 0 0 0 + // ... + // 0 x 0 0 0 x 0 0 + FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(x * 8 + 0 + 0 + y * 2, y * 8 + 0), + FfxUInt32x2(x * 8 + 4 + 0 + y * 2, y * 8 + 0), + FfxUInt32x2(x * 8 + 0 + 1 + y * 2, y * 8 + 4), + FfxUInt32x2(x * 8 + 4 + 1 + y * 2, y * 8 + 4)); + SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x, y), v, mip, slice); + // store to LDS + // x x x x 0 ... + // 0 ... + SpdStoreIntermediateH(x + y * 2, 0, v); + } +#else + if (localInvocationIndex < 16) + { + FfxFloat16x4 v = SpdLoadIntermediateH(x * 4 + y, y * 4); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy * 2) + FfxInt32x2(x / 2, y / 2), v, mip, slice); + SpdStoreIntermediateH(x / 2 + y, 0, v); + } + } +#endif +} + +void SpdDownsampleMip_5H(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mip, FfxUInt32 slice) +{ +#if defined(FFX_SPD_NO_WAVE_OPERATIONS) + if (localInvocationIndex < 1) + { + // x x x x 0 ... + // 0 ... 
+ FfxFloat16x4 v = SpdReduceIntermediateH(FfxUInt32x2(0, 0), FfxUInt32x2(1, 0), FfxUInt32x2(2, 0), FfxUInt32x2(3, 0)); + SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice); + } +#else + if (localInvocationIndex < 4) + { + FfxFloat16x4 v = SpdLoadIntermediateH(localInvocationIndex, 0); + v = SpdReduceQuadH(v); + // quad index 0 stores result + if (localInvocationIndex % 4 == 0) + { + SpdStoreH(FfxInt32x2(workGroupID.xy), v, mip, slice); + } + } +#endif +} + +void SpdDownsampleMips_6_7H(FfxUInt32 x, FfxUInt32 y, FfxUInt32 mips, FfxUInt32 slice) +{ + FfxInt32x2 tex = FfxInt32x2(x * 4 + 0, y * 4 + 0); + FfxInt32x2 pix = FfxInt32x2(x * 2 + 0, y * 2 + 0); + FfxFloat16x4 v0 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v0, 6, slice); + + tex = FfxInt32x2(x * 4 + 2, y * 4 + 0); + pix = FfxInt32x2(x * 2 + 1, y * 2 + 0); + FfxFloat16x4 v1 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v1, 6, slice); + + tex = FfxInt32x2(x * 4 + 0, y * 4 + 2); + pix = FfxInt32x2(x * 2 + 0, y * 2 + 1); + FfxFloat16x4 v2 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v2, 6, slice); + + tex = FfxInt32x2(x * 4 + 2, y * 4 + 2); + pix = FfxInt32x2(x * 2 + 1, y * 2 + 1); + FfxFloat16x4 v3 = SpdReduceLoad4H(tex, slice); + SpdStoreH(pix, v3, 6, slice); + + if (mips < 8) + return; + // no barrier needed, working on values only from the same thread + + FfxFloat16x4 v = SpdReduce4H(v0, v1, v2, v3); + SpdStoreH(FfxInt32x2(x, y), v, 7, slice); + SpdStoreIntermediateH(x, y, v); +} + +void SpdDownsampleNextFourH(FfxUInt32 x, FfxUInt32 y, FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 baseMip, FfxUInt32 mips, FfxUInt32 slice) +{ + if (mips <= baseMip) + return; + ffxSpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_2H(x, y, workGroupID, localInvocationIndex, baseMip, slice); + + if (mips <= baseMip + 1) + return; + ffxSpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_3H(x, y, workGroupID, localInvocationIndex, baseMip + 1, slice); + + if (mips <= baseMip + 2) + return; + 
ffxSpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_4H(x, y, workGroupID, localInvocationIndex, baseMip + 2, slice); + + if (mips <= baseMip + 3) + return; + ffxSpdWorkgroupShuffleBarrier(); + SpdDownsampleMip_5H(workGroupID, localInvocationIndex, baseMip + 3, slice); +} + +/// Downsamples a 64x64 tile based on the work group id and work group offset. +/// If after downsampling it's the last active thread group, computes the remaining MIP levels. +/// Uses half types. +/// +/// @param [in] workGroupID index of the work group / thread group +/// @param [in] localInvocationIndex index of the thread within the thread group in 1D +/// @param [in] mips the number of total MIP levels to compute for the input texture +/// @param [in] numWorkGroups the total number of dispatched work groups / thread groups for this slice +/// @param [in] slice the slice of the input texture +/// +/// @ingroup FfxGPUSpd +void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice) +{ + FfxUInt32x2 sub_xy = ffxRemapForWaveReduction(localInvocationIndex % 64); + FfxUInt32 x = sub_xy.x + 8 * ((localInvocationIndex >> 6) % 2); + FfxUInt32 y = sub_xy.y + 8 * ((localInvocationIndex >> 7)); + + // compute MIP level 0 and 1 + SpdDownsampleMips_0_1H(x, y, workGroupID, localInvocationIndex, mips, slice); + + // compute MIP level 2, 3, 4, 5 + SpdDownsampleNextFourH(x, y, workGroupID, localInvocationIndex, 2, mips, slice); + + if (mips < 7) + return; + + // increase the global atomic counter for the given slice and check if it's the last remaining thread group: + // terminate if not, continue if yes. + if (SpdExitWorkgroup(numWorkGroups, localInvocationIndex, slice)) + return; + + // reset the global atomic counter back to 0 for the next spd dispatch + SpdResetAtomicCounter(slice); + + // After mip 5 there is only a single workgroup left that downsamples the remaining up to 64x64 texels. 
+ // compute MIP level 6 and 7 + SpdDownsampleMips_6_7H(x, y, mips, slice); + + // compute MIP level 8, 9, 10, 11 + SpdDownsampleNextFourH(x, y, FfxUInt32x2(0, 0), localInvocationIndex, 8, mips, slice); +} + +/// Downsamples a 64x64 tile based on the work group id and work group offset. +/// If after downsampling it's the last active thread group, computes the remaining MIP levels. +/// Uses half types. +/// +/// @param [in] workGroupID index of the work group / thread group +/// @param [in] localInvocationIndex index of the thread within the thread group in 1D +/// @param [in] mips the number of total MIP levels to compute for the input texture +/// @param [in] numWorkGroups the total number of dispatched work groups / thread groups for this slice +/// @param [in] slice the slice of the input texture +/// @param [in] workGroupOffset the work group offset. it's (0,0) in case the entire input texture is downsampled. +/// +/// @ingroup FfxGPUSpd +void SpdDownsampleH(FfxUInt32x2 workGroupID, FfxUInt32 localInvocationIndex, FfxUInt32 mips, FfxUInt32 numWorkGroups, FfxUInt32 slice, FfxUInt32x2 workGroupOffset) +{ + SpdDownsampleH(workGroupID + workGroupOffset, localInvocationIndex, mips, numWorkGroups, slice); +} + +#endif // #if FFX_HALF +#endif // #if defined(FFX_GPU) diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd.h.meta new file mode 100644 index 0000000..f05552e --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd.h.meta @@ -0,0 +1,27 @@ +fileFormatVersion: 2 +guid: 8e323410b1a2e304dad4767b57398f10 +PluginImporter: + externalObjects: {} + serializedVersion: 2 + iconMap: {} + executionOrder: {} + defineConstraints: [] + isPreloaded: 0 + isOverridable: 1 + isExplicitlyReferenced: 0 + validateReferences: 1 + platformData: + - first: + 
Any: + second: + enabled: 1 + settings: {} + - first: + Editor: Editor + second: + enabled: 0 + settings: + DefaultValueInitialized: true + userData: + assetBundleName: + assetBundleVariant: diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd_callbacks_hlsl.h b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd_callbacks_hlsl.h new file mode 100644 index 0000000..d86f66f --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd_callbacks_hlsl.h @@ -0,0 +1,218 @@ +// This file is part of the FidelityFX SDK. +// +// Copyright (C) 2024 Advanced Micro Devices, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files(the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and /or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions : +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
+ +#include "ffx_spd_resources.h" + +#if defined(FFX_GPU) +#ifdef __hlsl_dx_compiler +#pragma dxc diagnostic push +#pragma dxc diagnostic ignored "-Wambig-lit-shift" +#endif //__hlsl_dx_compiler +#include "ffx_core.h" +#ifdef __hlsl_dx_compiler +#pragma dxc diagnostic pop +#endif //__hlsl_dx_compiler + +#ifndef FFX_PREFER_WAVE64 +#define FFX_PREFER_WAVE64 +#endif // #ifndef FFX_PREFER_WAVE64 + +#pragma warning(disable: 3205) // conversion from larger type to smaller + +#define FFX_DECLARE_SRV_REGISTER(regIndex) t##regIndex +#define FFX_DECLARE_UAV_REGISTER(regIndex) u##regIndex +#define FFX_DECLARE_CB_REGISTER(regIndex) b##regIndex +#define FFX_SPD_DECLARE_SRV(regIndex) register(FFX_DECLARE_SRV_REGISTER(regIndex)) +#define FFX_SPD_DECLARE_UAV(regIndex) register(FFX_DECLARE_UAV_REGISTER(regIndex)) +#define FFX_SPD_DECLARE_CB(regIndex) register(FFX_DECLARE_CB_REGISTER(regIndex)) + +#if defined(FFX_SPD_BIND_CB_SPD) + cbuffer cbSPD : FFX_SPD_DECLARE_CB(FFX_SPD_BIND_CB_SPD) + { + FfxUInt32 mips; + FfxUInt32 numWorkGroups; + FfxUInt32x2 workGroupOffset; + FfxFloat32x2 invInputSize; // Only used for linear sampling mode + FfxFloat32x2 padding; + + #define FFX_SPD_CONSTANT_BUFFER_1_SIZE 8 // Number of 32-bit values. This must be kept in sync with the cbSPD size. 
+ }; +#else + #define mips 0 + #define numWorkGroups 0 + #define workGroupOffset 0 + #define invInputSize 0 + #define padding 0 +#endif + +#define FFX_SPD_ROOTSIG_STRINGIFY(p) FFX_SPD_ROOTSIG_STR(p) +#define FFX_SPD_ROOTSIG_STR(p) #p +#define FFX_SPD_ROOTSIG [RootSignature( "DescriptorTable(UAV(u0, numDescriptors = " FFX_SPD_ROOTSIG_STRINGIFY(FFX_SPD_RESOURCE_IDENTIFIER_COUNT) ")), " \ + "DescriptorTable(SRV(t0, numDescriptors = " FFX_SPD_ROOTSIG_STRINGIFY(FFX_SPD_RESOURCE_IDENTIFIER_COUNT) ")), " \ + "CBV(b0), " \ + "StaticSampler(s0, filter = FILTER_MIN_MAG_LINEAR_MIP_POINT, " \ + "addressU = TEXTURE_ADDRESS_CLAMP, " \ + "addressV = TEXTURE_ADDRESS_CLAMP, " \ + "addressW = TEXTURE_ADDRESS_CLAMP, " \ + "comparisonFunc = COMPARISON_NEVER, " \ + "borderColor = STATIC_BORDER_COLOR_TRANSPARENT_BLACK)" )] + +#if defined(FFX_SPD_EMBED_ROOTSIG) +#define FFX_SPD_EMBED_ROOTSIG_CONTENT FFX_SPD_ROOTSIG +#else +#define FFX_SPD_EMBED_ROOTSIG_CONTENT +#endif // #if FFX_SPD_EMBED_ROOTSIG + +FfxUInt32 Mips() +{ + return mips; +} + +FfxUInt32 NumWorkGroups() +{ + return numWorkGroups; +} + +FfxUInt32x2 WorkGroupOffset() +{ + return workGroupOffset; +} + +FfxFloat32x2 InvInputSize() +{ + return invInputSize; +} + +SamplerState s_LinearClamp : register(s0); + + // SRVs + #if defined(FFX_SPD_BIND_SRV_INPUT_DOWNSAMPLE_SRC) + Texture2DArray<FfxFloat32x4> r_input_downsample_src : FFX_SPD_DECLARE_SRV(FFX_SPD_BIND_SRV_INPUT_DOWNSAMPLE_SRC); + #endif + + // UAV declarations + #if defined(FFX_SPD_BIND_UAV_INTERNAL_GLOBAL_ATOMIC) + struct SpdGlobalAtomicBuffer { FfxUInt32 counter[6]; }; + globallycoherent RWStructuredBuffer<SpdGlobalAtomicBuffer> rw_internal_global_atomic : FFX_SPD_DECLARE_UAV(FFX_SPD_BIND_UAV_INTERNAL_GLOBAL_ATOMIC); + #endif + #if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + globallycoherent RWTexture2DArray<FfxFloat32x4> rw_input_downsample_src_mid_mip : FFX_SPD_DECLARE_UAV(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP); + #endif + #if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + 
RWTexture2DArray<FfxFloat32x4> rw_input_downsample_src_mips[SPD_MAX_MIP_LEVELS+1] : FFX_SPD_DECLARE_UAV(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS); + #endif + +#if FFX_HALF + +#if defined(FFX_SPD_BIND_SRV_INPUT_DOWNSAMPLE_SRC) + FfxFloat16x4 SampleSrcImageH(FfxFloat32x2 uv, FfxUInt32 slice) + { + FfxFloat32x2 textureCoord = FfxFloat32x2(uv) * InvInputSize() + InvInputSize(); + FfxFloat32x4 result = r_input_downsample_src.SampleLevel(s_LinearClamp, FfxFloat32x3(textureCoord, slice), 0); + return FfxFloat16x4(ffxSrgbFromLinear(result.x), ffxSrgbFromLinear(result.y), ffxSrgbFromLinear(result.z), result.w); + } + #endif // defined(FFX_SPD_BIND_SRV_INPUT_DOWNSAMPLE_SRC) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + FfxFloat16x4 LoadSrcImageH(FfxFloat32x2 uv, FfxUInt32 slice) + { + return FfxFloat16x4(rw_input_downsample_src_mips[0][FfxUInt32x3(uv, slice)]); + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + void StoreSrcMipH(FfxFloat16x4 value, FfxInt32x2 uv, FfxUInt32 slice, FfxUInt32 mip) + { + rw_input_downsample_src_mips[mip][FfxUInt32x3(uv, slice)] = FfxFloat32x4(value); + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + FfxFloat16x4 LoadMidMipH(FfxInt32x2 uv, FfxUInt32 slice) + { + return FfxFloat16x4(rw_input_downsample_src_mid_mip[FfxUInt32x3(uv, slice)]); + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + void StoreMidMipH(FfxFloat16x4 value, FfxInt32x2 uv, FfxUInt32 slice) + { + rw_input_downsample_src_mid_mip[FfxUInt32x3(uv, slice)] = FfxFloat32x4(value); + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + +#else // FFX_HALF + +#if defined(FFX_SPD_BIND_SRV_INPUT_DOWNSAMPLE_SRC) + FfxFloat32x4 SampleSrcImage(FfxInt32x2 uv, FfxUInt32 slice) + { + FfxFloat32x2 textureCoord = FfxFloat32x2(uv) * 
InvInputSize() + InvInputSize(); + FfxFloat32x4 result = r_input_downsample_src.SampleLevel(s_LinearClamp, FfxFloat32x3(textureCoord, slice), 0); + return FfxFloat32x4(ffxSrgbFromLinear(result.x), ffxSrgbFromLinear(result.y), ffxSrgbFromLinear(result.z), result.w); + } +#endif // defined(FFX_SPD_BIND_SRV_INPUT_DOWNSAMPLE_SRC) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + FfxFloat32x4 LoadSrcImage(FfxInt32x2 uv, FfxUInt32 slice) + { + return rw_input_downsample_src_mips[0][FfxUInt32x3(uv, slice)]; + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + void StoreSrcMip(FfxFloat32x4 value, FfxInt32x2 uv, FfxUInt32 slice, FfxUInt32 mip) + { + rw_input_downsample_src_mips[mip][FfxUInt32x3(uv, slice)] = value; + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MIPS) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + FfxFloat32x4 LoadMidMip(FfxInt32x2 uv, FfxUInt32 slice) + { + return rw_input_downsample_src_mid_mip[FfxUInt32x3(uv, slice)]; + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + +#if defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + void StoreMidMip(FfxFloat32x4 value, FfxInt32x2 uv, FfxUInt32 slice) + { + rw_input_downsample_src_mid_mip[FfxUInt32x3(uv, slice)] = value; + } +#endif // defined(FFX_SPD_BIND_UAV_INPUT_DOWNSAMPLE_SRC_MID_MIPMAP) + +#endif // FFX_HALF + +#if defined(FFX_SPD_BIND_UAV_INTERNAL_GLOBAL_ATOMIC) +void IncreaseAtomicCounter(FFX_PARAMETER_IN FfxUInt32 slice, FFX_PARAMETER_INOUT FfxUInt32 counter) +{ + InterlockedAdd(rw_internal_global_atomic[0].counter[slice], 1, counter); +} +#endif // defined(FFX_SPD_BIND_UAV_INTERNAL_GLOBAL_ATOMIC) + +#if defined(FFX_SPD_BIND_UAV_INTERNAL_GLOBAL_ATOMIC) +void ResetAtomicCounter(FFX_PARAMETER_IN FfxUInt32 slice) +{ + rw_internal_global_atomic[0].counter[slice] = 0; +} +#endif // defined(FFX_SPD_BIND_UAV_INTERNAL_GLOBAL_ATOMIC) + +#endif // #if 
defined(FFX_GPU) diff --git a/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd_callbacks_hlsl.h.meta b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd_callbacks_hlsl.h.meta new file mode 100644 index 0000000..9bd02bd --- /dev/null +++ b/Packages/com.unity.render-pipelines.universal@14.0.11/Runtime/OcclusionCulling/Shaders/ffx/ffx_spd_callbacks_hlsl.h.meta @@ -0,0 +1,27 @@ +fileFormatVersion: 2 +guid: d906db2acd4127e49a5e5501da951810 +PluginImporter: + externalObjects: {} + serializedVersion: 2 + iconMap: {} + executionOrder: {} + defineConstraints: [] + isPreloaded: 0 + isOverridable: 1 + isExplicitlyReferenced: 0 + validateReferences: 1 + platformData: + - first: + Any: + second: + enabled: 1 + settings: {} + - first: + Editor: Editor + second: + enabled: 0 + settings: + DefaultValueInitialized: true + userData: + assetBundleName: + assetBundleVariant: