Advertisement
Zgragselus

ComputeResolve

Oct 9th, 2023
1,025
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 10.99 KB | None | 0 0
  1. #include "../Common.hlsli"
  2. #include "../Multisample.hlsli"
  3.  
  /// <summary>Buffer of all tiles = base tile information: tile location in the image (x, y), tile classification (0 = non-MSAA tile, anything else = MSAA tile) and UAV (CPU-side only)</summary>
  RWStructuredBuffer<Tile> Tiles : register(u0);

  /// <summary>Output (non-multisampled) image that receives the cleared / resolved pixels</summary>
  RWTexture2D<float4> Output : register(u1);

  /// <summary>Buffer of all tile records = per-tile range in the sample buffer (offset of the first sample and count of samples in the tile)</summary>
  RWStructuredBuffer<TileRecord> TilesRecords : register(u2);

  /// <summary>Buffer of all tile samples. Each sample holds its pixel coordinates inside the tile (x, y), its resolve weight and its sample index (used to fetch one specific sample from Texture2DMS)</summary>
  RWStructuredBuffer<TileSample> TilesSamples : register(u3);

  /// <summary>Multisampled input image (source of Resolve)</summary>
  Texture2DMS<float4, SamplesMSAA> Input : register(t0);

  /// <summary>Flat buffer of sample colors (source of ResolveBuffer), addressed with the same index as TilesSamples</summary>
  StructuredBuffer<float4> InputBuffer : register(t1);

  /// <summary>Constants at Constant Buffer View (CBV) register 0</summary>
  cbuffer Params : register(b0)
  {
      /// <summary>Input image width in pixels</summary>
      uint Width;

      /// <summary>Input image height in pixels</summary>
      uint Height;
  }

  /// <summary>Per-group work counter - threads atomically increment it to claim the next sample index of the tile being resolved</summary>
  groupshared uint counter;

  /// <summary>Per-group accumulation buffer for the resolved tile: 16 x 16 pixels, 4 uints per pixel. Values are stored as unsigned fixed point (scaled by 2^16) so that they can be accumulated with integer atomics</summary>
  groupshared uint color[16 * 16 * 4];
  32.  
  /// <summary>Clears the output image to opaque black (one thread group clears the one tile stored at Tiles[GI.x], one thread clears one pixel)</summary>
  /// <param name="GI">Dispatch group index; GI.x selects the tile in Tiles</param>
  /// <param name="DTid">Dispatch thread index (not used by this entry point)</param>
  /// <param name="GTid">Group thread index; GTid.xy is the pixel offset inside the 16x16 tile</param>
  [numthreads(16, 16, 1)]
  void Clear(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
  {
      // Pixel owned by this thread: tile origin plus the thread's offset inside the tile
      const uint2 pixel = uint2(Tiles[GI.x].x + GTid.x, Tiles[GI.x].y + GTid.y);

      // Threads whose pixel lies outside of the image have nothing to clear
      if (pixel.x >= Width || pixel.y >= Height)
      {
          return;
      }

      Output[pixel] = float4(0.0f, 0.0f, 0.0f, 1.0f);
  }
  49.  
  /// <summary>
  /// Resolve msaa image with tile structure around it into non-msaa image (one thread group resolves the one tile stored at Tiles[GI.x]).
  /// Tiles classified as non-msaa (classification == 0) copy sample 0 of every pixel. For all other tiles the threads of the group
  /// cooperatively walk the tile's sample list, accumulate the weighted samples into the group shared color buffer and then write
  /// one pixel per thread. Output alpha is always 1.0.
  /// </summary>
  /// <param name="GI">Dispatch group index; GI.x selects the tile in Tiles and its record in TilesRecords</param>
  /// <param name="DTid">Dispatch thread index (not used by this entry point)</param>
  /// <param name="GTid">Group thread index; GTid.xy is the pixel offset inside the 16x16 tile</param>
  [numthreads(16, 16, 1)]
  void Resolve(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
  {
      // Reset group shared counter (a single thread is enough)
      if (GTid.x == 0 && GTid.y == 0)
      {
          counter = 0;
      }

      // Image coordinates of the pixel owned by this thread and its slot in the group shared color buffer
      const uint2 coord = uint2(Tiles[GI.x].x + GTid.x, Tiles[GI.x].y + GTid.y);
      const uint slot = (GTid.x + GTid.y * 16) * 4;

      // Per-thread: is the owned pixel inside the image? (false for part of the threads of tiles on the image border)
      const bool insideImage = coord.x < Width && coord.y < Height;
      // Per-group: classification is indexed by GI.x only, so every thread of the group sees the same value
      const bool isMsaaTile = Tiles[GI.x].classification != 0;

      // Reset output pixel's color
      color[slot + 0] = 0;
      color[slot + 1] = 0;
      color[slot + 2] = 0;
      color[slot + 3] = 0;

      // Wait until all pixel colors are cleared and the group shared counter is reset
      GroupMemoryBarrierWithGroupSync();

      // First step: resolve all samples of the tile into the group shared color buffer.
      // Only threads that own a pixel inside the image pull samples. Threads that skip this block must still reach
      // the barrier below, which is why the barrier is NOT placed inside this (per-thread divergent) branch.
      if (isMsaaTile && insideImage)
      {
          // Range of this tile's samples in TilesSamples (invariant for the whole loop)
          const uint samplesCount = TilesRecords[GI.x].count;
          const uint samplesOffset = TilesRecords[GI.x].offset;

          // Claim the first sample index (atomic - every index is handed out to exactly one thread of the group)
          uint index = 0;
          InterlockedAdd(counter, 1, index);

          // Loop until the claimed index is past the last sample of the tile
          while (index < samplesCount)
          {
              const TileSample tileSample = TilesSamples[samplesOffset + index];

              // Image coordinates that this sample represents
              const uint2 sampleCoord = uint2(Tiles[GI.x].x + tileSample.x, Tiles[GI.x].y + tileSample.y);
              // Slot of the sample's pixel in the group shared color buffer
              const uint sampleSlot = (tileSample.x + tileSample.y * 16) * 4;

              // Load the specific sample from the input image and multiply by its weight
              const float4 c = Input.Load(sampleCoord, tileSample.sampleIndex) * tileSample.weight;

              // TODO: Tonemapping should happen here!

              // NOTE: Atomics only operate on integers - therefore the floats are scaled by 2^16 and cast to
              // unsigned integers. This should be large-enough scale for HDR displays.
              if (tileSample.weight < 1.0f)
              {
                  // Pixel has multiple samples - several threads may add to the same pixel, so accumulate atomically
                  uint previous;
                  InterlockedAdd(color[sampleSlot + 0], uint(c.x * 65536.0f), previous);
                  InterlockedAdd(color[sampleSlot + 1], uint(c.y * 65536.0f), previous);
                  InterlockedAdd(color[sampleSlot + 2], uint(c.z * 65536.0f), previous);
              }
              else
              {
                  // Pixel has a single sample - just store it
                  color[sampleSlot + 0] = uint(c.x * 65536.0f);
                  color[sampleSlot + 1] = uint(c.y * 65536.0f);
                  color[sampleSlot + 2] = uint(c.z * 65536.0f);
              }

              // Claim the next sample index
              InterlockedAdd(counter, 1, index);
          }
      }

      // Wait until all threads in the group finished adding values into the group shared color buffer.
      // This barrier is executed unconditionally by every thread of the group (uniform control flow).
      GroupMemoryBarrierWithGroupSync();

      // Second step: 1 thread = 1 pixel. Nothing to write for pixels outside of the image.
      if (!insideImage)
      {
          return;
      }

      float4 result;
      if (isMsaaTile)
      {
          // Convert the accumulated fixed point values back to float (undo the 2^16 scale)
          result = float4(float(color[slot + 0]) / 65536.0f, float(color[slot + 1]) / 65536.0f, float(color[slot + 2]) / 65536.0f, 1.0f);
      }
      else
      {
          // Non-msaa tile: input pixel is copied as-is
          result = Input.Load(coord, 0);
      }

      // DEBUG: red guide lines at 1/4 and 3/4 of the image width and height
      if (coord.x == (Width / 4) || coord.y == (Height / 4) || coord.x == (3 * Width / 4) || coord.y == (3 * Height / 4))
      {
          result = float4(1.0f, 0.0f, 0.0f, 1.0f);
      }

      // Store data in output image
      Output[coord] = result;
  }
  168.  
  /// <summary>
  /// Resolve msaa sample buffer into non-msaa image (one thread group resolves the one tile stored at Tiles[GI.x]).
  /// The threads of the group cooperatively walk the tile's sample list, accumulate the weighted samples from InputBuffer
  /// into the group shared color buffer and then write one pixel per thread. Output alpha is always 1.0.
  /// </summary>
  /// <param name="GI">Dispatch group index; GI.x selects the tile in Tiles and its record in TilesRecords</param>
  /// <param name="DTid">Dispatch thread index (not used by this entry point)</param>
  /// <param name="GTid">Group thread index; GTid.xy is the pixel offset inside the 16x16 tile</param>
  [numthreads(16, 16, 1)]
  void ResolveBuffer(uint3 GI : SV_GroupID, uint3 DTid : SV_DispatchThreadID, uint3 GTid : SV_GroupThreadID)
  {
      // Reset group shared counter (a single thread is enough)
      if (GTid.x == 0 && GTid.y == 0)
      {
          counter = 0;
      }

      // Image coordinates of the pixel owned by this thread and its slot in the group shared color buffer
      const uint2 coord = uint2(Tiles[GI.x].x + GTid.x, Tiles[GI.x].y + GTid.y);
      const uint slot = (GTid.x + GTid.y * 16) * 4;

      // Reset output pixel's color
      color[slot + 0] = 0;
      color[slot + 1] = 0;
      color[slot + 2] = 0;
      color[slot + 3] = 0;

      // Wait until all pixel colors are cleared and the group shared counter is reset
      GroupMemoryBarrierWithGroupSync();

      // Total number of samples in current tile
      const uint items = TilesRecords[GI.x].count;
      // Sample offset where current tile begins
      const uint offset = TilesRecords[GI.x].offset;

      // Claim the first sample index (atomic - every index is handed out to exactly one thread of the group)
      uint index = 0;
      InterlockedAdd(counter, 1, index);

      // Loop until the claimed index is past the last sample of the tile
      while (index < items)
      {
          const TileSample tileSample = TilesSamples[offset + index];

          // Slot of the sample's pixel in the group shared color buffer
          const uint sampleSlot = (tileSample.x + tileSample.y * 16) * 4;
          // Get sample from input buffer (same index as the sample record), multiply by weight
          const float4 c = InputBuffer[offset + index] * tileSample.weight;

          // TODO: Tonemapping should happen here!

          // NOTE: Atomics only operate on integers - therefore the floats are scaled by 2^16 and cast to
          // unsigned integers. This should be large-enough scale for HDR displays.
          if (tileSample.weight < 1.0f)
          {
              // Pixel has multiple samples - several threads may add to the same pixel, so accumulate atomically
              uint previous;
              InterlockedAdd(color[sampleSlot + 0], uint(c.x * 65536.0f), previous);
              InterlockedAdd(color[sampleSlot + 1], uint(c.y * 65536.0f), previous);
              InterlockedAdd(color[sampleSlot + 2], uint(c.z * 65536.0f), previous);
          }
          else
          {
              // Pixel has a single sample - just store it
              color[sampleSlot + 0] = uint(c.x * 65536.0f);
              color[sampleSlot + 1] = uint(c.y * 65536.0f);
              color[sampleSlot + 2] = uint(c.z * 65536.0f);
          }

          // Claim the next sample index
          InterlockedAdd(counter, 1, index);
      }

      // Wait until all threads in the group finished adding values into the group shared color buffer
      // (executed unconditionally by every thread of the group)
      GroupMemoryBarrierWithGroupSync();

      // 1 thread = 1 pixel. Like Clear and Resolve, only write pixels that are inside the image; the check comes
      // after the barrier so that every thread of the group still reaches it.
      if (coord.x < Width && coord.y < Height)
      {
          // Convert the accumulated fixed point values back to float (undo the 2^16 scale)
          Output[coord] = float4(float(color[slot + 0]) / 65536.0f, float(color[slot + 1]) / 65536.0f, float(color[slot + 2]) / 65536.0f, 1.0f);
      }
  }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement