Advertisement
desdemona

cuda img filtering - better

May 31st, 2015
686
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 22.36 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4. #include "helper_cuda.h"
  5.  
  6. #include <stdlib.h>
  7. #include <stdio.h>
  8. #include <memory>
  9. #include <iostream>
  10. #include <cstdio>
  11. #include <cstdlib>
  12. #include <ctime>
  13.  
  14. #include "bitmap_image.hpp"
  15. #include "csv_parser/csv_parser.hpp"
  16.  
  17. #ifndef __CUDACC__  
  18.     #define __CUDACC__
  19. #endif
  20.  
  21. const int Filter_Size = 7;
  22. const int Filter_Total_Size = 49;
  23.  
  24. __device__ __constant__ float g_filter[Filter_Total_Size];
  25. __device__ __constant__ float g_factor[2];
  26.  
  27. __device__ int buffadr(unsigned int x, unsigned int y, unsigned int imageWidth, unsigned int imageHeight)
  28. {
  29.     if (x >= imageWidth || y >= imageHeight)
  30.     {
  31.         return -1;
  32.     }
  33.     return y*imageWidth + x;
  34. }
  35.  
  36. __global__ void filterKernel(const unsigned char *inputColor, unsigned char *outputColor, unsigned int imageWidth, unsigned int imageHeight,
  37.     unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
  38. {
  39.     __shared__ int blockXstart, blockYstart;
  40.     __shared__ int blockXend, blockYend;
  41.  
  42.     if (threadIdx.x == 0 && threadIdx.y == 0)
  43.     {
  44.         //policz i zapisz do shared czym zajmuje sie dany blok
  45.         blockXstart = blockIdx.x * blockPartWidth;
  46.         blockYstart = blockIdx.y * blockPartHeight;
  47.         blockXend = blockXstart + blockPartWidth; //tego miejsca juz nie liczymy
  48.         blockYend = blockYstart + blockPartHeight; //to też już poza obliczeniami
  49.         if (blockXend > imageWidth)
  50.         {
  51.             blockXend = imageWidth;
  52.         }
  53.         if (blockYend > imageHeight)
  54.         {
  55.             blockYend = imageHeight;
  56.         }
  57.         //policzyliśmy czym zajmuje dany block i zapisalismy, ok.
  58.     }
  59.  
  60.     __syncthreads();
  61.     //teraz każdy wątek liczy sobie swój start i koniec
  62.     unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
  63.     unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
  64.     unsigned int threadXend = threadXstart + threadPartWidth;
  65.     unsigned int threadYend = threadYstart + threadPartHeight;
  66.     threadXend = (threadXend > blockXend) ? blockXend : threadXend;
  67.     threadYend = (threadYend > blockYend) ? blockYend : threadYend;
  68.  
  69.     int x_, y_, bufadr_;
  70.     int c;
  71.     for (int y = threadYstart; y < threadYend; y++)
  72.     {
  73.         for (int x = threadXstart; x < threadXend; x++)
  74.         {
  75.             bufadr_ = buffadr(x, y, imageWidth, imageHeight);
  76.             if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  77.             {
  78.                 continue;
  79.             }
  80.             c = 0;
  81.             for (int j = -3; j < 4; j++)
  82.             {
  83.                 for (int i = -3; i < 4; i++)
  84.                 {
  85.                     y_ = y + j;
  86.                     x_ = x + i;
  87.                     bufadr_ = buffadr(x_, y_, imageWidth, imageHeight);
  88.                     if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  89.                     {
  90.                         continue;
  91.                     }
  92.                     c += g_filter[((j + 3)*7) + i + 3] * (int)inputColor[bufadr_];
  93.                 }
  94.             }
  95.             c = g_factor[0] * c + g_factor[1];
  96.            
  97.             c = (c < 0) ? 0 : c;
  98.             c = (c > 255) ? 255 : c;
  99.             outputColor[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c;
  100.         }
  101.     }
  102.    
  103. }
  104.  
  105. __global__ void newFilterKernel(const unsigned char *inputR, unsigned char *outputR, const unsigned char *inputG,
  106.     unsigned char *outputG, const unsigned char *inputB, unsigned char *outputB, unsigned int imageWidth, unsigned int imageHeight,
  107.     unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
  108. {
  109.     __shared__ int blockXstart, blockYstart;
  110.     __shared__ int blockXend, blockYend;
  111.  
  112.     if (threadIdx.x == 0 && threadIdx.y == 0)
  113.     {
  114.         //policz i zapisz do shared czym zajmuje sie dany blok
  115.         blockXstart = blockIdx.x * blockPartWidth;
  116.         blockYstart = blockIdx.y * blockPartHeight;
  117.         blockXend = blockXstart + blockPartWidth; //tego miejsca juz nie liczymy
  118.         blockYend = blockYstart + blockPartHeight; //to też już poza obliczeniami
  119.         if (blockXend > imageWidth)
  120.         {
  121.             blockXend = imageWidth;
  122.         }
  123.         if (blockYend > imageHeight)
  124.         {
  125.             blockYend = imageHeight;
  126.         }
  127.         //policzyliśmy czym zajmuje dany block i zapisalismy, ok.
  128.     }
  129.  
  130.     __syncthreads();
  131.     //teraz każdy wątek liczy sobie swój start i koniec
  132.     unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
  133.     unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
  134.     unsigned int threadXend = threadXstart + threadPartWidth;
  135.     unsigned int threadYend = threadYstart + threadPartHeight;
  136.     threadXend = (threadXend > blockXend) ? blockXend : threadXend;
  137.     threadYend = (threadYend > blockYend) ? blockYend : threadYend;
  138.  
  139.     int x_, y_, bufadr_;
  140.     int c_r, c_g, c_b;
  141.     float fil;
  142.     for (int y = threadYstart; y < threadYend; y++)
  143.     {
  144.         for (int x = threadXstart; x < threadXend; x++)
  145.         {
  146.             bufadr_ = buffadr(x, y, imageWidth, imageHeight);
  147.             if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  148.             {
  149.                 continue;
  150.             }
  151.             c_r = c_g = c_b = 0;
  152.             for (int j = -3; j < 4; j++)
  153.             {
  154.                 for (int i = -3; i < 4; i++)
  155.                 {
  156.                     fil = g_filter[((j + 3)*7) + i + 3];
  157.                     if(0.0 == fil)
  158.                     { continue; }
  159.                     y_ = y + j;
  160.                     x_ = x + i;
  161.                     bufadr_ = buffadr(x_, y_, imageWidth, imageHeight);
  162.                     if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  163.                     {
  164.                         continue;
  165.                     }
  166.                     //c += g_filter[((j + 3)*7) + i + 3] * (int)inputColor[bufadr_];
  167.                     c_r += fil * (int)inputR[bufadr_];
  168.                     c_g += fil * (int)inputG[bufadr_];
  169.                     c_b += fil * (int)inputB[bufadr_];
  170.                 }
  171.             }
  172.             c_r = g_factor[0] * c_r + g_factor[1];
  173.             c_g = g_factor[0] * c_g + g_factor[1];
  174.             c_b = g_factor[0] * c_b + g_factor[1];
  175.  
  176.             c_r = (c_r < 0) ? 0 : c_r;
  177.             c_r = (c_r > 255) ? 255 : c_r;
  178.             outputR[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_r;
  179.  
  180.             c_g = (c_g < 0) ? 0 : c_g;
  181.             c_g = (c_g > 255) ? 255 : c_g;
  182.             outputG[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_g;
  183.  
  184.             c_b = (c_b < 0) ? 0 : c_b;
  185.             c_b = (c_b > 255) ? 255 : c_b;
  186.             outputB[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_b;
  187.         }
  188.     }
  189. }
  190.  
  191. void printHelp()
  192. {
  193.     std::cout << "HELP: \n";
  194.     std::cout << "exe filter_file_path image_file_path\n";
  195.     std::cout << "\n";
  196.     system("pause");
  197. }
  198.  
  199. void readDeviceAttributes(int devId, int &driverVersion, int &runtimeVersion, unsigned long long &totalGlobalMem,
  200.     int &multiProcessorCount, int &cudaCores, int &warpSize, unsigned int &totalConstantMem,
  201.     unsigned int &sharedMemPerBlock, int &maxThreadsPerBlock, int &maxThreadsPerMP)
  202. {
  203.     /* code from deviceQuery example*/
  204.     cudaSetDevice(devId);
  205.     cudaDeviceProp deviceProp;
  206.     cudaGetDeviceProperties(&deviceProp, devId);
  207.     printf("\nDevice %d: \"%s\"\n", devId, deviceProp.name);
  208.     cudaDriverGetVersion(&driverVersion);
  209.     cudaRuntimeGetVersion(&runtimeVersion);
  210.     printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
  211.     printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
  212.  
  213.     totalGlobalMem = (unsigned long long) deviceProp.totalGlobalMem;
  214.     printf("  Total amount of global memory: (%llu bytes)\n", totalGlobalMem);
  215.     printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
  216.         deviceProp.multiProcessorCount,
  217.         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
  218.         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
  219.  
  220.     printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
  221.     totalConstantMem = deviceProp.totalConstMem;
  222.     printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
  223.     sharedMemPerBlock = deviceProp.sharedMemPerBlock;
  224.     printf("  Warp size:                                     %d\n", deviceProp.warpSize);
  225.  
  226.     maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
  227.     maxThreadsPerMP = deviceProp.maxThreadsPerMultiProcessor;
  228.     multiProcessorCount = deviceProp.multiProcessorCount;
  229.     cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
  230.  
  231.     warpSize = deviceProp.warpSize;
  232.     printf("\n");
  233. }
  234.  
  235. int main(int argc, char **argv)
  236. {
  237.     clock_t start,end,kernelstart,kernelend;
  238.     start = clock();
  239.  
  240.     int devId = 0;
  241.     int warpSize, multiProcesorCount, cudaCores;
  242.     int runtimeVersion, driverVersion;
  243.     unsigned long long totalGlobalMem;
  244.     unsigned int sharedMemPerBlock, totalConstantMem;
  245.     int maxThreadsPerBlock, maxThreadsPerMP;
  246.  
  247.     cudaDeviceProp deviceProp;
  248.  
  249.     int status = EXIT_SUCCESS;
  250.     std::cout << argv[0] << " starting.. \n";
  251.     std::cout << "CUDA Image filtering - 03:40\n";
  252.  
  253.     unsigned char *g_r = 0; //graphics red
  254.     unsigned char *g_g = 0; //graphics green
  255.     unsigned char *g_b = 0; //graphics blue
  256.     unsigned char *g_or = 0; //graphics output red
  257.     unsigned char *g_og = 0; //graphics output green
  258.     unsigned char *g_ob = 0; //graphics output blue
  259.     cudaError_t cudaStatus;
  260.  
  261.     /* read cuda devices */
  262. #pragma region readDevices
  263.     int deviceCount = 0;
  264.     cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
  265.     if (error_id != cudaSuccess)
  266.     {
  267.         std::cout << "cudaGetDeviceCount returned " << (int)error_id << "->" << cudaGetErrorString(error_id) << "\n";
  268.         std::cout << "Result = FAIL\n";
  269.         exit(EXIT_FAILURE);
  270.     }
  271.     // This function call returns 0 if there are no CUDA capable devices.
  272.     if (deviceCount == 0)
  273.     {
  274.         std::cout << "There are no available device(s) that support CUDA\n";
  275.         std::cout << "Result = FAIL\n";
  276.         exit(EXIT_FAILURE);
  277.     }
  278.     else
  279.     {
  280.         std::cout << "Detected " << deviceCount << " CUDA Capable device(s)\n";
  281.         if (deviceCount > 1)
  282.         {
  283.             std::cout << "Application will run on device with id= " << devId <<"\n";
  284.         }
  285.     }
  286. #pragma endregion readDevices
  287.  
  288.     /* read input parameters */
  289. #pragma region readAndParseInput
  290.     if (argc < 3)
  291.     {
  292.         std::cout << "You passed " << argc << "arguments\n";
  293.         std::cout << "Not enough arguments passed to program\n";
  294.         printHelp();
  295.         std::cout << "Result = FAIL\n";
  296.         exit(EXIT_FAILURE);
  297.     }
  298.     float filter_value[49];
  299.     float factor[2];
  300.  
  301.     /* read filter file and load to constant mem */
  302.     const char field_terminator = ',';
  303.     const char line_terminator  = '\n';
  304.     const char enclosure_char   = '"';
  305.  
  306.     csv_parser file_parser;
  307.     file_parser.set_skip_lines(0);
  308.     file_parser.init(argv[1]);
  309.  
  310.     file_parser.set_enclosed_char(enclosure_char, ENCLOSURE_OPTIONAL); 
  311.     file_parser.set_field_term_char(field_terminator);
  312.     file_parser.set_line_term_char(line_terminator);
  313.  
  314.     std::string num;
  315.     float tmp;
  316.     for (int i = 0; i < 8; i++)
  317.     {
  318.         if(!file_parser.has_more_rows() && 7 != i)
  319.         {
  320.             std::cout << "Malformed csv filter\n";
  321.             std::cout << "line:" << i << std::endl;
  322.             printHelp();
  323.             std::cout << "Result = FAIL\n";
  324.             exit(EXIT_FAILURE);
  325.         }
  326.         csv_row row = file_parser.get_row();
  327.  
  328.         if (i < 7 && 7 == row.size())
  329.         {
  330.             for (int j = 0; j < 7; j++)
  331.             {
  332.                 num = std::string(row[j]);
  333.                 tmp = ::strtod(num.c_str(), 0);
  334.                 filter_value[i * 7 + j] = (float)tmp;
  335.             }
  336.         }
  337.         else if(7 == i && 2 == row.size())
  338.         {
  339.             num = std::string(row[0]);
  340.             tmp = ::strtod(num.c_str(), 0);
  341.             factor[0] = (float)tmp;
  342.             num = std::string(row[1]);
  343.             tmp = ::strtod(num.c_str(), 0);
  344.             factor[1] = (float)tmp;
  345.         }
  346.         else
  347.         {
  348.             std::cout << "Malformed csv filter\n";
  349.             std::cout << "line: " << i << std::endl;
  350.             std::cout << "row size: " << row.size() << endl;
  351.             printHelp();
  352.             std::cout << "Result = FAIL\n";
  353.             exit(EXIT_FAILURE);
  354.         }
  355.     }
  356.  
  357.     std::cout << "Filter from file " << argv[1] << " read succesfully \n";
  358.  
  359.     /* read bmp from file to gpu global mem */
  360.  
  361.     std::string file_name(argv[2]);
  362.     bitmap_image image(file_name);
  363.     if (!image)
  364.     {
  365.         std::cout << "Failed to open image " << file_name.c_str() << "\n";
  366.         printHelp();
  367.         std::cout << "Result = FAIL\n";
  368.         exit(EXIT_FAILURE);
  369.     }
  370.  
  371.     const unsigned int imgWidth = image.width();
  372.     const unsigned int imgHeight = image.height();
  373.     std::cout << "Loaded image " << imgWidth << "X" << imgHeight << " from file " << file_name.c_str() << " \n";
  374.  
  375.     unsigned int imgSize = imgWidth * imgHeight;
  376.     unsigned int imgSizeBytes = sizeof(unsigned char)* 3 * imgSize;
  377.  
  378.     /* check if filer fits into constant mem, load */
  379.     /* it will not, constant memory is too small*/
  380.     int neededConstMem = sizeof(float)* Filter_Total_Size;
  381.     /* check if image fits into global memory, alocate output tables */
  382.     int neededGlobalMem = 2 * imgSizeBytes;
  383.     std::cout << "Application will need total of " << neededGlobalMem <<  " bytes in global memory \n";
  384.     std::cout << "Application will need total of " << neededConstMem << " bytes in constant memory \n";
  385. #pragma endregion readAndParseInput
  386.  
  387. #pragma region readDeviceProperties
  388.     readDeviceAttributes(devId, driverVersion, runtimeVersion, totalGlobalMem, multiProcesorCount, cudaCores,
  389.         warpSize, totalConstantMem, sharedMemPerBlock, maxThreadsPerBlock, maxThreadsPerMP);
  390.  
  391.     if ((unsigned int)neededConstMem > totalConstantMem || neededGlobalMem > totalGlobalMem)
  392.     {
  393.         std::cout << "Not enough memory. Try smaller image or get better device. \n";
  394.         printHelp();
  395.         std::cout << "Result = FAIL\n";
  396.         exit(EXIT_FAILURE);
  397.     }
  398. #pragma endregion readDeviceProperties
  399.  
  400. #pragma region allocateAndCopyMem
  401.     unsigned char *r = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  402.     unsigned char *g = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  403.     unsigned char *b = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  404.  
  405.     unsigned char red, green, blue;
  406.     for (std::size_t i = 0; i < imgHeight; i++)
  407.     {
  408.         for (std::size_t j = 0; j < imgWidth; j++)
  409.         {
  410.             image.get_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
  411.             r[(int)i * (int)imgWidth + (int)j] = red;
  412.             g[(int)i * (int)imgWidth + (int)j] = green;
  413.             b[(int)i * (int)imgWidth + (int)j] = blue;
  414.         }
  415.     }
  416.  
  417.     std::cout << "Allocated mem for rgb tables. " << imgSizeBytes << " bytes in total \n";
  418.  
  419.     cudaStatus = cudaSetDevice(devId);
  420.     cudaGetDeviceProperties(&deviceProp, devId);
  421.     if (cudaStatus != cudaSuccess) {
  422.         std::cout << "cudaSetDevice failed! Do you have a CUDA - capable GPU installed ?\n";
  423.         status = EXIT_FAILURE;
  424.         goto Error;
  425.     }
  426.  
  427.     // Allocate GPU buffers for six vectors (3 input, 3 output).
  428.     cudaStatus = cudaMalloc((void**)&g_r, sizeof(unsigned char)* imgSize);
  429.     if (cudaStatus != cudaSuccess) {
  430.         std::cout << "cudaMalloc failed!\n";
  431.         status = EXIT_FAILURE;
  432.         goto Error;
  433.     }
  434.     cudaStatus = cudaMalloc((void**)&g_g, sizeof(unsigned char)* imgSize);
  435.     if (cudaStatus != cudaSuccess) {
  436.         std::cout << "cudaMalloc failed!\n";
  437.         status = EXIT_FAILURE;
  438.         goto Error;
  439.     }
  440.     cudaStatus = cudaMalloc((void**)&g_b, sizeof(unsigned char)* imgSize);
  441.     if (cudaStatus != cudaSuccess) {
  442.         std::cout << "cudaMalloc failed!\n";
  443.         status = EXIT_FAILURE;
  444.         goto Error;
  445.     }
  446.     cudaStatus = cudaMalloc((void**)&g_or, sizeof(unsigned char)* imgSize);
  447.     if (cudaStatus != cudaSuccess) {
  448.         std::cout << "cudaMalloc failed!\n";
  449.         status = EXIT_FAILURE;
  450.         goto Error;
  451.     }
  452.     cudaStatus = cudaMalloc((void**)&g_og, sizeof(unsigned char)* imgSize);
  453.     if (cudaStatus != cudaSuccess) {
  454.         std::cout << "cudaMalloc failed!\n";
  455.         status = EXIT_FAILURE;
  456.         goto Error;
  457.     }
  458.     cudaStatus = cudaMalloc((void**)&g_ob, sizeof(unsigned char)* imgSize);
  459.     if (cudaStatus != cudaSuccess) {
  460.         std::cout << "cudaMalloc failed!\n";
  461.         status = EXIT_FAILURE;
  462.         goto Error;
  463.     }
  464.  
  465.     std::cout << "Space for input and output images mallocked.\n";
  466.  
  467.     // Copy input vectors from host memory to GPU buffers.
  468.     cudaStatus = cudaMemcpy(g_r, r, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  469.     if (cudaStatus != cudaSuccess) {
  470.         std::cout << "cudaMalloc failed!\n";
  471.         status = EXIT_FAILURE;
  472.         goto Error;
  473.     }
  474.     cudaStatus = cudaMemcpy(g_g, g, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  475.     if (cudaStatus != cudaSuccess) {
  476.         std::cout << "cudaMalloc failed!\n";
  477.         status = EXIT_FAILURE;
  478.         goto Error;
  479.     }
  480.     cudaStatus = cudaMemcpy(g_b, b, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  481.     if (cudaStatus != cudaSuccess) {
  482.         std::cout << "cudaMalloc failed!\n";
  483.         status = EXIT_FAILURE;
  484.         goto Error;
  485.     }
  486.     std::cout << "Input data coppied into global mem.\n";
  487.     cudaStatus = cudaMemcpyToSymbol(g_filter, filter_value, sizeof(float)* Filter_Total_Size, 0, cudaMemcpyHostToDevice);
  488.     if (cudaStatus != cudaSuccess) {
  489.         std::cout << "cudaMemcpyToSymbol failed!\n";
  490.         status = EXIT_FAILURE;
  491.         goto Error;
  492.     }
  493.     cudaMemcpyToSymbol(g_factor, factor, sizeof(float)*2,0,cudaMemcpyHostToDevice);
  494.     getLastCudaError("Factor copy failed");
  495.     checkCudaErrors(cudaDeviceSynchronize());
  496.  
  497.     std::cout << "Filter data coppied into constant mem.\n";
  498.  
  499. #pragma endregion allocateAndCopyMem
  500.  
  501. #pragma region divideImage
  502.     /*
  503.     int warpSize, multiProcesorCount, cudaCores;
  504.     int runtimeVersion, driverVersion;
  505.     unsigned long long totalGlobalMem;
  506.     unsigned int sharedMemPerBlock, totalConstantMem;
  507.     int maxThreadsPerBlock, maxThreadsPerMP;
  508.     */
  509.  
  510.     float dev_score = (float)cudaCores / (float)warpSize;
  511.     float img_ratio = (float)imgWidth/(float)imgHeight;
  512.    
  513.     std::cout << "Image ratio: " << img_ratio << " cores/warp: " << dev_score << endl;
  514.  
  515.  
  516.     int blocksX = 64;
  517.     int blocksY = 64;
  518.     int threadsX = 8;// 8;
  519.     int threadsY = 8;// 8;
  520.  
  521.     float div_ratio = (float)threadsX / (float)threadsY;
  522.     //zaokrąglone w górę te podziały
  523.     unsigned int blockPartX = (imgWidth / blocksX) + (imgWidth%blocksX != 0);
  524.     unsigned int blockPartY = (imgHeight / blocksY) + (imgHeight%blocksY != 0);
  525.     //to wyjdzie zawsze conajmniej 1, chyba że wymiar obrazu to 0, a to już patola
  526.  
  527.     unsigned int threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  528.     unsigned int threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  529.     //tu też zawsze conajmniej 1
  530.     while(threadsX * threadsY < dev_score * warpSize && (threadsX * threadsY) < maxThreadsPerBlock)
  531.     {
  532.         if(img_ratio > div_ratio && 1 != threadPartX)
  533.         {
  534.             threadsX += 2;
  535.         }
  536.         else
  537.         {
  538.             threadsY += 2;
  539.         }
  540.         div_ratio = (float)threadsX / (float)threadsY;
  541.         threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  542.         threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  543.         if(1 == threadPartX && 1 == threadPartY)
  544.         { break; }
  545.     }
  546.  
  547.     //64x64
  548.     dim3 blocksPerGrid(blocksX, blocksY);
  549.     //and 8x8 threads per block
  550.     dim3 threadsPerBlock(threadsX, threadsY);
  551.  
  552.     std::cout << blocksX << "X" << blocksY << " blocks, each block procesing " << blockPartX << "X" << blockPartY << "pixels.\n";
  553.     std::cout << threadsX << "X" << threadsY << " threads per block, each procesing " << threadPartX << "X" << threadPartY << "pixels.\n";
  554. #pragma endregion divideImage
  555.  
  556. #pragma region runKernels
  557.     /* do magic - run kernel */
  558.     kernelstart = clock();
  559.  
  560.     /*
  561.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  562.     getLastCudaError("Kernel execution failed");
  563.     checkCudaErrors(cudaDeviceSynchronize());
  564.     std::cout << "Red.\n";
  565.  
  566.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_g, g_og, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  567.     getLastCudaError("Kernel execution failed");
  568.     checkCudaErrors(cudaDeviceSynchronize());
  569.     std::cout << "Green.\n";
  570.  
  571.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  572.     getLastCudaError("Kernel execution failed");
  573.     checkCudaErrors(cudaDeviceSynchronize());
  574.     std::cout << "Blue.\n";
  575.     getLastCudaError("Kernel execution failed");
  576.     checkCudaErrors(cudaDeviceSynchronize());
  577.     */
  578.  
  579.     newFilterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, g_g, g_og, g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  580.     getLastCudaError("Kernel execution failed");
  581.     checkCudaErrors(cudaDeviceSynchronize());
  582.     std::cout << "Red.\n";  std::cout << "Green.\n"; std::cout << "Blue.\n";
  583.  
  584.     kernelend = clock();
  585.     //sekwencyjnie, hehe.
  586.     // Check if kernel execution generated an error
  587.  
  588.     /* start clock */
  589.  
  590.     /*Keep in mind that there is some driver overhead the first time you call
  591.     a particular kernel in your program, so when doing timing studies, you
  592.     should warm up your kernels by calling them once before your timer
  593.     starts.*/
  594.  
  595.     /* end clock */
  596.  
  597. #pragma endregion runKernels
  598.  
  599. #pragma region copyResults
  600.     // Copy output vector from GPU buffer to host memory.
  601.     cudaStatus = cudaMemcpy(r, g_or, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  602.     if (cudaStatus != cudaSuccess) {
  603.         std::cout << "cudaMemcpy failed!\n";
  604.         status = EXIT_FAILURE;
  605.         goto Error;
  606.     }
  607.     cudaStatus = cudaMemcpy(g, g_og, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  608.     if (cudaStatus != cudaSuccess) {
  609.         std::cout << "cudaMemcpy failed!\n";
  610.         status = EXIT_FAILURE;
  611.         goto Error;
  612.     }
  613.     cudaStatus = cudaMemcpy(b, g_ob, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  614.     if (cudaStatus != cudaSuccess) {
  615.         std::cout << "cudaMemcpy failed!\n";
  616.         status = EXIT_FAILURE;
  617.         goto Error;
  618.     }
  619. #pragma endregion copyResults
  620.  
  621.     /* assemble and save output image */
  622. #pragma region saveOutput
  623.     for (std::size_t i = 0; i < imgHeight; i++)
  624.     {
  625.         for (std::size_t j = 0; j < imgWidth; j++)
  626.         {
  627.             red = r[(int)i * (int)imgWidth + (int)j];
  628.             green = g[(int)i * (int)imgWidth + (int)j];
  629.             blue = b[(int)i * (int)imgWidth + (int)j];
  630.             image.set_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
  631.         }
  632.     }
  633.     time_t timer = NULL;
  634.     time(&timer);
  635.     struct tm * timeinfo = localtime(&timer);
  636.     char matko[14];
  637.     strftime(matko, 14, "%y%m%d_%H%M%S", timeinfo);
  638.     std::cout << "Saving result image to " << file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(matko) + ".bmp" << " file.\n";
  639.     image.save_image(file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(matko) + ".bmp");
  640. #pragma endregion saveOutput
  641.  
  642.  
  643.     // cudaDeviceReset must be called before exiting in order for profiling and
  644.     // tracing tools such as Nsight and Visual Profiler to show complete traces.
  645.     cudaStatus = cudaDeviceReset();
  646.     if (cudaStatus != cudaSuccess) {
  647.         fprintf(stderr, "cudaDeviceReset failed!");
  648.         return 1;
  649.     }
  650.  
  651. /* free every mallocked space and exit */
  652. #pragma region freeAndExit
  653. Error:
  654.     cudaFree(g_r);
  655.     cudaFree(g_g);
  656.     cudaFree(g_b);
  657.     cudaFree(g_or);
  658.     cudaFree(g_og);
  659.     cudaFree(g_ob);
  660.     //cudaFree(g_filter);
  661.     free(r);
  662.     free(g);
  663.     free(b);
  664.     end = clock();
  665.     double total_diff = double(end - start) / CLOCKS_PER_SEC;
  666.     printf ("Total elapsed time is %.6lf seconds.\n", total_diff );
  667.     if(NULL != kernelstart && NULL != kernelend)
  668.     {
  669.         double kernel_diff = double(kernelend - kernelstart) / CLOCKS_PER_SEC;
  670.         printf ("Kernel functions elapsed time is %.6lf seconds.\n", kernel_diff );
  671.     }
  672.     system("pause");
  673.     exit(status);
  674. #pragma endregion freeAndExit
  675. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement