ComputeResolve

#include "../Common.hlsli"
#include "../Multisample.hlsli"

/// <summary>Buffer of all tiles = base tile information: tile location in image, tile classification (whether it is MSAA or not) and UAV (CPU-side only). Indexed by dispatch group index (one group processes one tile).</summary>
RWStructuredBuffer<Tile> Tiles: register(u0);
/// <summary>Output texture image into which the multisampled texture or buffer is resolved</summary>
RWTexture2D<float4> Output: register(u1);
/// <summary>Buffer of all tile records = tile properties (offset into sample buffer and how many samples are in current tile). Indexed by dispatch group index, same as Tiles.</summary>
RWStructuredBuffer<TileRecord> TilesRecords: register(u2);
/// <summary>Buffer of all tile samples: coordinates inside tile (to allow sampling source MSAA images), sample weight (for resolve) and sample index (to allow obtaining specific sample from Texture2DMS)</summary>
RWStructuredBuffer<TileSample> TilesSamples: register(u3);

/// <summary>Multisampled input image to resolve (for Resolve)</summary>
Texture2DMS<float4, SamplesMSAA> Input: register(t0);
/// <summary>Multisampled sample buffer to resolve (for ResolveBuffer); read at the same index (tile record offset + sample index) as TilesSamples</summary>
StructuredBuffer<float4> InputBuffer: register(t1);

/// <summary>Constants at Constant Buffer View (CBV) register 0. Width and Height are compared against pixel coordinates to skip threads of tiles that reach past the image border.</summary>
cbuffer Params : register(b0)
{
    /// <summary>Input image width (exclusive upper bound for pixel x coordinate)</summary>
    uint Width;

    /// <summary>Input image height (exclusive upper bound for pixel y coordinate)</summary>
    uint Height;
}

/// <summary>Counter for group - used to obtain sample indexes in atomic way when looping through tile's samples (Resolve and ResolveBuffer). Reset to 0 by thread (0, 0) at the start of each entry point and incremented with InterlockedAdd.</summary>
groupshared uint counter;
/// <summary>Group shared buffer for resulting color - samples are resolved into this group shared buffer and then stored in output image.
/// Layout is 4 uints per pixel of the 16x16 tile (index = (x + y * 16) * 4 + channel); values are fixed point, i.e. floats scaled by 65536 (2^16), so they can be accumulated with integer atomics. Only channels 0-2 are accumulated and read back; channel 3 is just cleared.</summary>
groupshared uint color[16 * 16 * 4];

/// <summary>Fill the output image with opaque black; each thread group handles one tile, each thread one pixel of that tile</summary>
/// <param name="GI">Dispatch group index - GI.x selects the tile in Tiles</param>
/// <param name="DTid">Dispatch thread index (not used here)</param>
/// <param name="GTid">Group thread index - GTid.xy is the pixel position inside the 16x16 tile</param>
[numthreads(16, 16, 1)]
void Clear(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    // Pixel position in image = tile origin + position of this thread inside the tile
    const uint2 pixel = uint2(Tiles[GI.x].x, Tiles[GI.x].y) + GTid.xy;

    // Tiles on the right/bottom border may reach past the image - such threads have nothing to do
    if (pixel.x >= Width || pixel.y >= Height)
    {
        return;
    }

    Output[pixel] = float4(0.0f, 0.0f, 0.0f, 1.0f);
}

/// <summary>Resolve msaa image with tile structure around it into non-msaa image (single group resolves single tile)</summary>
/// <remarks>
/// Non-msaa tiles (classification == 0) copy sample 0 of each pixel directly. Msaa tiles first accumulate all weighted tile samples into the
/// group shared color buffer (fixed point, scaled by 2^16, because atomics work on integers only) and then each thread writes one pixel.
/// All group barriers are kept in group-uniform control flow: they depend only on the tile classification (indexed by group index, therefore
/// identical for every thread in group), never on the per-thread image bounds check. Otherwise threads of border tiles lying outside of the
/// image would skip the barrier, which is undefined behavior.
/// </remarks>
/// <param name="GI">Dispatch group index (CPU side defined number of how many groups are dispatched) - GI.x selects the tile</param>
/// <param name="DTid">Dispatch thread index (element indexes - going from 0 to numthreads*groups-1), not used here</param>
/// <param name="GTid">Group thread index (goes from (0,0,0) to (numthreads.x-1, numthreads.y-1, numthreads.z-1) defined below)</param>
[numthreads(16, 16, 1)]
void Resolve(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    // Reset group shared counter
    if (GTid.x == 0 && GTid.y == 0)
    {
        counter = 0;
    }

    // Get image coordinates
    uint2 coord = uint2(Tiles[GI.x].x + GTid.x, Tiles[GI.x].y + GTid.y);
    // Local index of this thread's pixel in tile (1 thread = 1 pixel)
    uint imageIndex = GTid.x + GTid.y * 16;

    // Reset output pixel's color
    color[imageIndex * 4 + 0] = 0;
    color[imageIndex * 4 + 1] = 0;
    color[imageIndex * 4 + 2] = 0;
    color[imageIndex * 4 + 3] = 0;

    // Memory barrier, wait for all writes into pixel colors are done and group shared counter is reset
    GroupMemoryBarrierWithGroupSync();

    // Per-thread condition (differs inside border tiles) - must never guard a barrier
    bool inImage = (coord.x < Width && coord.y < Height);

    // If tile is classified as non-msaa (group-uniform condition)
    if (Tiles[GI.x].classification == 0)
    {
        // Work only when in image boundaries
        if (inImage)
        {
            // Write input pixel into output pixel
            Output[coord] = Input.Load(coord, 0);

            // DEBUG:
            if (coord.x == (Width / 4) || coord.y == (Height / 4) || coord.x == (3 * Width / 4) || coord.y == (3 * Height / 4))
            {
                Output[coord] = float4(1.0f, 0.0f, 0.0f, 1.0f);
            }
        }
    }
    // Otherwise (msaa-image)
    else
    {
        // First step is to resolve all samples into group shared color buffer (done by threads inside image boundaries)
        if (inImage)
        {
            // Total samples count in tile
            uint samplesCount = TilesRecords[GI.x].count;
            // Sample offset where current tile begins
            uint offset = TilesRecords[GI.x].offset;
            // Currently processed sample
            uint index = 0;

            // Get next sample index (requires atomic operation - thread safety in workgroup)
            InterlockedAdd(counter, 1, index);

            // Loop until all samples in tile are processed (sample index is higher than there are samples in current tile)
            while (index < samplesCount)
            {
                // Fetch sample record once instead of re-reading the buffer for every field
                TileSample tileSample = TilesSamples[offset + index];

                // Image coordinates that this sample represents
                uint2 sampleCoord = uint2(Tiles[GI.x].x + tileSample.x, Tiles[GI.x].y + tileSample.y);
                // Index in tile
                uint localCoord = tileSample.x + tileSample.y * 16;

                // Load sample from input image (note, specific sampleIndex is taken from currently processed sample) and multiply by weight
                float4 c = Input.Load(sampleCoord, tileSample.sampleIndex) * tileSample.weight;

                // TODO: Tonemapping should happen here!

                // NOTE: Due to requirement of using atomics, we have to operate on integers - therefore the floats are casted into unsigned
                // integers, but scaled before (by 2^16). This should be large-enough scale for HDR displays.

                // If pixel does have multiple samples
                if (tileSample.weight < 1.0f)
                {
                    // Atomically increment group shared buffer color at local coordinates in tile
                    uint val;
                    InterlockedAdd(color[localCoord * 4 + 0], uint(c.x * 65536.0f), val);
                    InterlockedAdd(color[localCoord * 4 + 1], uint(c.y * 65536.0f), val);
                    InterlockedAdd(color[localCoord * 4 + 2], uint(c.z * 65536.0f), val);
                }
                // If pixel doesn't have multiple samples
                else
                {
                    // Just store in group shared buffer color
                    color[localCoord * 4 + 0] = uint(c.x * 65536.0f);
                    color[localCoord * 4 + 1] = uint(c.y * 65536.0f);
                    color[localCoord * 4 + 2] = uint(c.z * 65536.0f);
                }

                // Get next sample index
                InterlockedAdd(counter, 1, index);
            }
        }

        // Memory barrier (wait until all threads in group finished adding values into group shared buffer color).
        // Reached by every thread of the group, including those outside of image boundaries.
        GroupMemoryBarrierWithGroupSync();

        // Second step is to copy group shared buffer color into output image tile (they're both same size)
        if (inImage)
        {
            // DEBUG:
            if (coord.x == (Width / 4) || coord.y == (Height / 4) || coord.x == (3 * Width / 4) || coord.y == (3 * Height / 4))
            {
                // 65536 is 1.0 in the 2^16 fixed point encoding
                color[imageIndex * 4 + 0] = 65536u;
                color[imageIndex * 4 + 1] = 0u;
                color[imageIndex * 4 + 2] = 0u;
            }

            // Store data in output image
            // NOTE: Due to previous multiplication and cast into unsigned integers (for ability to do atomics), we now have to cast back into float and divide
            Output[coord] = float4(float(color[imageIndex * 4 + 0]) / 65536.0f, float(color[imageIndex * 4 + 1]) / 65536.0f, float(color[imageIndex * 4 + 2]) / 65536.0f, 1.0f);
        }
    }
}

/// <summary>Resolve msaa sample buffer into non-msaa image (single group resolves single tile)</summary>
/// <remarks>
/// All threads of the group cooperatively accumulate the weighted tile samples into the group shared color buffer (fixed point, scaled by 2^16,
/// because atomics work on integers only); afterwards each thread writes one pixel of the tile. The final store is limited to image boundaries,
/// same as in Clear and Resolve. Only the store is guarded - barriers stay outside of the per-thread bounds check so every thread reaches them.
/// </remarks>
/// <param name="GI">Dispatch group index (CPU side defined number of how many groups are dispatched) - GI.x selects the tile</param>
/// <param name="DTid">Dispatch thread index (element indexes - going from 0 to numthreads*groups-1), not used here</param>
/// <param name="GTid">Group thread index (goes from (0,0,0) to (numthreads.x-1, numthreads.y-1, numthreads.z-1) defined below)</param>
[numthreads(16, 16, 1)]
void ResolveBuffer(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    // Reset group shared counter
    if (GTid.x == 0 && GTid.y == 0)
    {
        counter = 0;
    }

    // Get image coordinates
    uint2 coord = uint2(Tiles[GI.x].x + GTid.x, Tiles[GI.x].y + GTid.y);
    // Local index of this thread's pixel in tile (1 thread = 1 pixel)
    uint imageIndex = GTid.x + GTid.y * 16;

    // Reset output pixel's color
    color[imageIndex * 4 + 0] = 0;
    color[imageIndex * 4 + 1] = 0;
    color[imageIndex * 4 + 2] = 0;
    color[imageIndex * 4 + 3] = 0;

    // Memory barrier, wait for all writes into pixel colors are done and group shared counter is reset
    GroupMemoryBarrierWithGroupSync();

    // Total number of elements in current tile
    uint items = TilesRecords[GI.x].count;
    // Sample offset where current tile begins
    uint offset = TilesRecords[GI.x].offset;

    // Current sample index
    uint index = 0;

    // Get next sample index (requires atomic operation - thread safety in workgroup)
    InterlockedAdd(counter, 1, index);

    while (index < items)
    {
        // Fetch sample record once instead of re-reading the buffer for every field
        TileSample tileSample = TilesSamples[offset + index];

        // Index in tile
        uint localCoord = tileSample.x + tileSample.y * 16;
        // Get sample from input buffer, multiply by weight
        float4 c = InputBuffer[offset + index] * tileSample.weight;

        // TODO: Tonemapping should happen here!

        // NOTE: Due to requirement of using atomics, we have to operate on integers - therefore the floats are casted into unsigned
        // integers, but scaled before (by 2^16). This should be large-enough scale for HDR displays

        // If pixel does have multiple samples
        if (tileSample.weight < 1.0f)
        {
            // Atomically increment group shared buffer color at local coordinates in tile
            uint val;
            InterlockedAdd(color[localCoord * 4 + 0], uint(c.x * 65536.0f), val);
            InterlockedAdd(color[localCoord * 4 + 1], uint(c.y * 65536.0f), val);
            InterlockedAdd(color[localCoord * 4 + 2], uint(c.z * 65536.0f), val);
        }
        // If pixel doesn't have multiple samples
        else
        {
            // Just store in group shared buffer color
            color[localCoord * 4 + 0] = uint(c.x * 65536.0f);
            color[localCoord * 4 + 1] = uint(c.y * 65536.0f);
            color[localCoord * 4 + 2] = uint(c.z * 65536.0f);
        }

        // Get next sample index
        InterlockedAdd(counter, 1, index);
    }

    // Memory barrier (wait until all threads in group finished adding values into group shared buffer color)
    GroupMemoryBarrierWithGroupSync();

    // Store data in output image, only when in image boundaries (border tiles may reach past the image)
    // NOTE: Due to previous multiplication and cast into unsigned integers (for ability to do atomics), we now have to cast back into float and divide
    if (coord.x < Width && coord.y < Height)
    {
        Output[coord] = float4(float(color[imageIndex * 4 + 0]) / 65536.0f, float(color[imageIndex * 4 + 1]) / 65536.0f, float(color[imageIndex * 4 + 2]) / 65536.0f, 1.0f);
    }
}