Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "../Common.hlsli"
- #include "../Multisample.hlsli"
/// <summary>Per-tile base information. Shaders in this file read the tile origin in the image (x, y) and the tile classification (0 = non-MSAA tile, any other value = MSAA tile) from it. Per the original note, the CPU-side structure additionally holds a UAV that is not used here.</summary>
RWStructuredBuffer<Tile> Tiles : register(u0);

/// <summary>Single-sample output image; target of Clear, Resolve and ResolveBuffer</summary>
RWTexture2D<float4> Output : register(u1);

/// <summary>Per-tile record: offset of the tile's first sample in TilesSamples and the count of samples that belong to the tile</summary>
RWStructuredBuffer<TileRecord> TilesRecords : register(u2);

/// <summary>Samples of all tiles. Each sample holds its pixel coordinates inside the tile (x, y), its resolve weight and the sample index used when loading from Texture2DMS</summary>
RWStructuredBuffer<TileSample> TilesSamples : register(u3);

/// <summary>Multisampled input image (source for Resolve)</summary>
Texture2DMS<float4, SamplesMSAA> Input : register(t0);

/// <summary>Multisampled input sample buffer (source for ResolveBuffer); indexed with the same index as TilesSamples</summary>
StructuredBuffer<float4> InputBuffer : register(t1);
/// <summary>Shader constants bound at Constant Buffer View (CBV) register 0; used for the image boundary checks</summary>
cbuffer Params : register(b0)
{
    /// <summary>Image width in pixels (pixels with x >= Width are never written)</summary>
    uint Width;

    /// <summary>Image height in pixels (pixels with y >= Height are never written)</summary>
    uint Height;
}
/// <summary>Per-group work counter. Threads atomically increment it to claim the index of the next tile sample to process, so each sample is handled by exactly one thread</summary>
groupshared uint counter;

/// <summary>Per-group accumulation buffer for the resolved tile: 16 x 16 pixels with 4 uints reserved per pixel. Colors are kept as unsigned fixed point (scaled by 2^16) so they can be summed with atomics, and are converted back to float when stored into the output image</summary>
groupshared uint color[16 * 16 * 4];
/// <summary>Clear output image to opaque black (single group clears single 16x16 tile)</summary>
/// <param name="GI">Dispatch group index; GI.x selects the tile in Tiles</param>
/// <param name="DTid">Dispatch thread index (unused)</param>
/// <param name="GTid">Group thread index; GTid.xy is the pixel offset inside the tile</param>
[numthreads(16, 16, 1)]
void Clear(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    // Tile handled by this group
    Tile tile = Tiles[GI.x];

    // Pixel handled by this thread = tile origin + thread offset inside the group
    uint2 pixel = uint2(tile.x + GTid.x, tile.y + GTid.y);

    // Threads that fall outside of the image have nothing to clear
    if (pixel.x >= Width || pixel.y >= Height)
    {
        return;
    }

    Output[pixel] = float4(0.0f, 0.0f, 0.0f, 1.0f);
}
/// <summary>Resolve msaa image with tile structure around it into non-msaa image (single group resolves single 16x16 tile)</summary>
/// <remarks>
/// Non-MSAA tiles (classification == 0) copy sample 0 of the input pixel. MSAA tiles accumulate all weighted tile
/// samples into the group shared color buffer and then store one pixel per thread.
/// Both group barriers are executed unconditionally by every thread of the group: a group sync must never be placed
/// behind a per-thread condition (such as the image boundary check), otherwise threads of border tiles would skip it.
/// </remarks>
/// <param name="GI">Dispatch group index; GI.x selects the tile in Tiles / TilesRecords</param>
/// <param name="DTid">Dispatch thread index (unused)</param>
/// <param name="GTid">Group thread index; GTid.xy is the pixel offset inside the tile</param>
[numthreads(16, 16, 1)]
void Resolve(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    // Reset group shared counter
    if (GTid.x == 0 && GTid.y == 0)
    {
        counter = 0;
    }

    // Tile data is the same for the whole group - fetch it once instead of on every use
    const Tile tile = Tiles[GI.x];
    const TileRecord record = TilesRecords[GI.x];

    // Image coordinates and local (in-tile) index of the pixel owned by this thread
    const uint2 coord = uint2(tile.x + GTid.x, tile.y + GTid.y);
    const uint imageIndex = GTid.x + GTid.y * 16;
    const bool insideImage = (coord.x < Width && coord.y < Height);
    const bool msaaTile = (tile.classification != 0);

    // Reset output pixel's color
    color[imageIndex * 4 + 0] = 0;
    color[imageIndex * 4 + 1] = 0;
    color[imageIndex * 4 + 2] = 0;
    color[imageIndex * 4 + 3] = 0;

    // Memory barrier, wait until all pixel colors are cleared and group shared counter is reset
    GroupMemoryBarrierWithGroupSync();

    // Non-MSAA tiles have no samples to accumulate; a zero count skips the loop while keeping
    // the barrier below in uniform control flow
    const uint samplesCount = msaaTile ? record.count : 0u;

    // Get first sample index (requires atomic operation - thread safety in workgroup)
    uint index = 0;
    InterlockedAdd(counter, 1, index);

    // Loop until all samples in tile are processed (sample index is higher than there are samples in current tile)
    while (index < samplesCount)
    {
        // Currently processed sample (single load instead of one load per field access)
        const TileSample tileSample = TilesSamples[record.offset + index];

        // Image coordinates that this sample represents
        const uint2 sampleCoord = uint2(tile.x + tileSample.x, tile.y + tileSample.y);

        // Index in tile
        const uint localCoord = tileSample.x + tileSample.y * 16;

        // Load sample from input image (specific sampleIndex is taken from currently processed sample) and multiply by weight
        const float4 c = Input.Load(sampleCoord, tileSample.sampleIndex) * tileSample.weight;

        // TODO: Tonemapping should happen here!

        // NOTE: Due to requirement of using atomics, we have to operate on integers - therefore the floats are cast into
        // unsigned integers, but scaled before (by 2^16). This should be large-enough scale for HDR displays.

        // If pixel does have multiple samples
        if (tileSample.weight < 1.0f)
        {
            // Atomically increment group shared buffer color at local coordinates in tile
            uint previous;
            InterlockedAdd(color[localCoord * 4 + 0], uint(c.x * 65536.0f), previous);
            InterlockedAdd(color[localCoord * 4 + 1], uint(c.y * 65536.0f), previous);
            InterlockedAdd(color[localCoord * 4 + 2], uint(c.z * 65536.0f), previous);
        }
        // If pixel doesn't have multiple samples
        else
        {
            // Just store in group shared buffer color
            color[localCoord * 4 + 0] = uint(c.x * 65536.0f);
            color[localCoord * 4 + 1] = uint(c.y * 65536.0f);
            color[localCoord * 4 + 2] = uint(c.z * 65536.0f);
        }

        // Get next sample index
        InterlockedAdd(counter, 1, index);
    }

    // Memory barrier (wait until all threads in group finished adding values into group shared buffer color).
    // Reached by every thread of the group, including those outside of the image.
    GroupMemoryBarrierWithGroupSync();

    // From here on 1 thread = 1 pixel; only threads inside the image store anything
    if (insideImage)
    {
        float4 result;

        if (msaaTile)
        {
            // NOTE: Due to previous multiplication and cast into unsigned integers (for ability to do atomics),
            // we now have to cast back into float and divide
            result = float4(float(color[imageIndex * 4 + 0]) / 65536.0f,
                            float(color[imageIndex * 4 + 1]) / 65536.0f,
                            float(color[imageIndex * 4 + 2]) / 65536.0f,
                            1.0f);
        }
        else
        {
            // Non-msaa tile: input pixel is written into output pixel as is
            result = Input.Load(coord, 0);
        }

        // DEBUG: red lines at 1/4 and 3/4 of image width and height
        if (coord.x == (Width / 4) || coord.y == (Height / 4) || coord.x == (3 * Width / 4) || coord.y == (3 * Height / 4))
        {
            result = float4(1.0f, 0.0f, 0.0f, 1.0f);
        }

        // Store data in output image
        Output[coord] = result;
    }
}
/// <summary>Resolve msaa sample buffer into non-msaa image (single group resolves single 16x16 tile)</summary>
/// <remarks>
/// All weighted samples of the tile are accumulated into the group shared color buffer; afterwards each thread stores
/// one pixel of the tile into the output image. Both group barriers are executed unconditionally by every thread.
/// </remarks>
/// <param name="GI">Dispatch group index; GI.x selects the tile in Tiles / TilesRecords</param>
/// <param name="DTid">Dispatch thread index (unused)</param>
/// <param name="GTid">Group thread index; GTid.xy is the pixel offset inside the tile</param>
[numthreads(16, 16, 1)]
void ResolveBuffer(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
{
    // Reset group shared counter
    if (GTid.x == 0 && GTid.y == 0)
    {
        counter = 0;
    }

    // Image coordinates and local (in-tile) index of the pixel owned by this thread
    const uint2 coord = uint2(Tiles[GI.x].x + GTid.x, Tiles[GI.x].y + GTid.y);
    const uint imageIndex = GTid.x + GTid.y * 16;

    // Reset output pixel's color
    color[imageIndex * 4 + 0] = 0;
    color[imageIndex * 4 + 1] = 0;
    color[imageIndex * 4 + 2] = 0;
    color[imageIndex * 4 + 3] = 0;

    // Memory barrier, wait until all pixel colors are cleared and group shared counter is reset
    GroupMemoryBarrierWithGroupSync();

    // Total number of elements in current tile
    const uint items = TilesRecords[GI.x].count;

    // Sample offset where current tile begins
    const uint offset = TilesRecords[GI.x].offset;

    // Get first sample index (requires atomic operation - thread safety in workgroup)
    uint index = 0;
    InterlockedAdd(counter, 1, index);

    while (index < items)
    {
        // Currently processed sample (single load instead of one load per field access)
        const TileSample tileSample = TilesSamples[offset + index];

        // Index in tile
        const uint localCoord = tileSample.x + tileSample.y * 16;

        // Get sample from input buffer, multiply by weight
        const float4 c = InputBuffer[offset + index] * tileSample.weight;

        // TODO: Tonemapping should happen here!

        // NOTE: Due to requirement of using atomics, we have to operate on integers - therefore the floats are cast into
        // unsigned integers, but scaled before (by 2^16). This should be large-enough scale for HDR displays

        // If pixel does have multiple samples
        if (tileSample.weight < 1.0f)
        {
            // Atomically increment group shared buffer color at local coordinates in tile
            uint previous;
            InterlockedAdd(color[localCoord * 4 + 0], uint(c.x * 65536.0f), previous);
            InterlockedAdd(color[localCoord * 4 + 1], uint(c.y * 65536.0f), previous);
            InterlockedAdd(color[localCoord * 4 + 2], uint(c.z * 65536.0f), previous);
        }
        // If pixel doesn't have multiple samples
        else
        {
            // Just store in group shared buffer color
            color[localCoord * 4 + 0] = uint(c.x * 65536.0f);
            color[localCoord * 4 + 1] = uint(c.y * 65536.0f);
            color[localCoord * 4 + 2] = uint(c.z * 65536.0f);
        }

        // Get next sample index
        InterlockedAdd(counter, 1, index);
    }

    // Memory barrier (wait until all threads in group finished adding values into group shared buffer color)
    GroupMemoryBarrierWithGroupSync();

    // From here on 1 thread = 1 pixel. Store only when in image boundaries (same guard as Clear and Resolve),
    // so border tiles never address pixels outside of the output image.
    if (coord.x < Width && coord.y < Height)
    {
        // NOTE: Due to previous multiplication and cast into unsigned integers (for ability to do atomics),
        // we now have to cast back into float and divide
        Output[coord] = float4(float(color[imageIndex * 4 + 0]) / 65536.0f,
                               float(color[imageIndex * 4 + 1]) / 65536.0f,
                               float(color[imageIndex * 4 + 2]) / 65536.0f,
                               1.0f);
    }
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement