Advertisement
desdemona

cuda znów

Jun 2nd, 2015
691
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 22.67 KB | None | 0 0
  1.  
  2. #include "cuda_runtime.h"
  3. #include "device_launch_parameters.h"
  4. #include "helper_cuda.h"
  5.  
  6. #include <stdlib.h>
  7. #include <stdio.h>
  8. #include <memory>
  9. #include <iostream>
  10. #include <cstdio>
  11. #include <cstdlib>
  12. #include <ctime>
  13.  
  14. #include "bitmap_image.hpp"
  15. #include "csv_parser/csv_parser.hpp"
  16.  
  17. #ifndef __CUDACC__  
  18.     #define __CUDACC__
  19. #endif
  20.  
  21. const int Filter_Size = 7;
  22. const int Filter_Total_Size = 49;
  23.  
  24. __device__ __constant__ float g_filter[Filter_Total_Size];
  25. __device__ __constant__ float g_factor[2];
  26.  
  27. __device__ int buffadr(unsigned int x, unsigned int y, unsigned int imageWidth, unsigned int imageHeight)
  28. {
  29.     if (x >= imageWidth || y >= imageHeight)
  30.     {
  31.         return -1;
  32.     }
  33.     return y*imageWidth + x;
  34. }
  35.  
  36. __global__ void filterKernel(const unsigned char *inputColor, unsigned char *outputColor, unsigned int imageWidth, unsigned int imageHeight,
  37.     unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
  38. {
  39.     __shared__ int blockXstart, blockYstart;
  40.     __shared__ int blockXend, blockYend;
  41.  
  42.     if (threadIdx.x == 0 && threadIdx.y == 0)
  43.     {
  44.         //policz i zapisz do shared czym zajmuje sie dany blok
  45.         blockXstart = blockIdx.x * blockPartWidth;
  46.         blockYstart = blockIdx.y * blockPartHeight;
  47.         blockXend = blockXstart + blockPartWidth; //tego miejsca juz nie liczymy
  48.         blockYend = blockYstart + blockPartHeight; //to też już poza obliczeniami
  49.         if (blockXend > imageWidth)
  50.         {
  51.             blockXend = imageWidth;
  52.         }
  53.         if (blockYend > imageHeight)
  54.         {
  55.             blockYend = imageHeight;
  56.         }
  57.         //policzyliśmy czym zajmuje dany block i zapisalismy, ok.
  58.     }
  59.  
  60.     __syncthreads();
  61.     //teraz każdy wątek liczy sobie swój start i koniec
  62.     unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
  63.     unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
  64.     unsigned int threadXend = threadXstart + threadPartWidth;
  65.     unsigned int threadYend = threadYstart + threadPartHeight;
  66.     threadXend = (threadXend > blockXend) ? blockXend : threadXend;
  67.     threadYend = (threadYend > blockYend) ? blockYend : threadYend;
  68.  
  69.     int x_, y_, bufadr_;
  70.     int c;
  71.     for (int y = threadYstart; y < threadYend; y++)
  72.     {
  73.         for (int x = threadXstart; x < threadXend; x++)
  74.         {
  75.             bufadr_ = buffadr(x, y, imageWidth, imageHeight);
  76.             if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  77.             {
  78.                 continue;
  79.             }
  80.             c = 0;
  81.             for (int j = -3; j < 4; j++)
  82.             {
  83.                 for (int i = -3; i < 4; i++)
  84.                 {
  85.                     y_ = y + j;
  86.                     x_ = x + i;
  87.                     bufadr_ = buffadr(x_, y_, imageWidth, imageHeight);
  88.                     if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  89.                     {
  90.                         continue;
  91.                     }
  92.                     c += g_filter[((j + 3)*7) + i + 3] * (int)inputColor[bufadr_];
  93.                 }
  94.             }
  95.             c = g_factor[0] * c + g_factor[1];
  96.            
  97.             c = (c < 0) ? 0 : c;
  98.             c = (c > 255) ? 255 : c;
  99.             outputColor[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c;
  100.         }
  101.     }
  102.    
  103. }
  104.  
  105. __global__ void newFilterKernel(const unsigned char *inputR, unsigned char *outputR, const unsigned char *inputG,
  106.     unsigned char *outputG, const unsigned char *inputB, unsigned char *outputB, unsigned int imageWidth, unsigned int imageHeight,
  107.     unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
  108. {
  109.     __shared__ int blockXstart, blockYstart;
  110.     __shared__ int blockXend, blockYend;
  111.  
  112.     if (threadIdx.x == 0 && threadIdx.y == 0)
  113.     {
  114.         //policz i zapisz do shared czym zajmuje sie dany blok
  115.         blockXstart = blockIdx.x * blockPartWidth;
  116.         blockYstart = blockIdx.y * blockPartHeight;
  117.         blockXend = blockXstart + blockPartWidth; //tego miejsca juz nie liczymy
  118.         blockYend = blockYstart + blockPartHeight; //to też już poza obliczeniami
  119.         if (blockXend > imageWidth)
  120.         {
  121.             blockXend = imageWidth;
  122.         }
  123.         if (blockYend > imageHeight)
  124.         {
  125.             blockYend = imageHeight;
  126.         }
  127.         //policzyliśmy czym zajmuje dany block i zapisalismy, ok.
  128.     }
  129.  
  130.     __syncthreads();
  131.     //teraz każdy wątek liczy sobie swój start i koniec
  132.     unsigned int threadXstart = (threadIdx.x * threadPartWidth) + blockXstart;
  133.     unsigned int threadYstart = (threadIdx.y * threadPartHeight) + blockYstart;
  134.     unsigned int threadXend = threadXstart + threadPartWidth;
  135.     unsigned int threadYend = threadYstart + threadPartHeight;
  136.     threadXend = (threadXend > blockXend) ? blockXend : threadXend;
  137.     threadYend = (threadYend > blockYend) ? blockYend : threadYend;
  138.  
  139.     int x_, y_, bufadr_;
  140.     int c_r, c_g, c_b;
  141.     float fil;
  142.     for (int y = threadYstart; y < threadYend; y++)
  143.     {
  144.         for (int x = threadXstart; x < threadXend; x++)
  145.         {
  146.             bufadr_ = buffadr(x, y, imageWidth, imageHeight);
  147.             if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  148.             {
  149.                 continue;
  150.             }
  151.             c_r = c_g = c_b = 0;
  152.             for (int j = -3; j < 4; j++)
  153.             {
  154.                 for (int i = -3; i < 4; i++)
  155.                 {
  156.                     fil = g_filter[((j + 3)*7) + i + 3];
  157.                     if(0.0 == fil)
  158.                     { continue; }
  159.                     y_ = y + j;
  160.                     x_ = x + i;
  161.                     bufadr_ = buffadr(x_, y_, imageWidth, imageHeight);
  162.                     if (bufadr_ < 0 || bufadr_ >= imageWidth*imageHeight)
  163.                     { continue; }
  164.                     //c += g_filter[((j + 3)*7) + i + 3] * (int)inputColor[bufadr_];
  165.                     c_r += fil * (int)inputR[bufadr_];
  166.                     c_g += fil * (int)inputG[bufadr_];
  167.                     c_b += fil * (int)inputB[bufadr_];
  168.                 }
  169.             }
  170.             c_r = g_factor[0] * c_r + g_factor[1];
  171.             c_g = g_factor[0] * c_g + g_factor[1];
  172.             c_b = g_factor[0] * c_b + g_factor[1];
  173.  
  174.             c_r = (c_r < 0) ? 0 : c_r;
  175.             c_r = (c_r > 255) ? 255 : c_r;
  176.             outputR[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_r;
  177.  
  178.             c_g = (c_g < 0) ? 0 : c_g;
  179.             c_g = (c_g > 255) ? 255 : c_g;
  180.             outputG[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_g;
  181.  
  182.             c_b = (c_b < 0) ? 0 : c_b;
  183.             c_b = (c_b > 255) ? 255 : c_b;
  184.             outputB[buffadr(x, y, imageWidth, imageHeight)] = (unsigned char)c_b;
  185.         }
  186.     }
  187. }
  188.  
  189. void printHelp()
  190. {
  191.     std::cout << "HELP: \n";
  192.     std::cout << "exe filter_file_path image_file_path\n";
  193.     std::cout << "\n";
  194.     system("pause");
  195. }
  196.  
  197. void readDeviceAttributes(int devId, int &driverVersion, int &runtimeVersion, unsigned long long &totalGlobalMem,
  198.     int &multiProcessorCount, int &cudaCores, int &warpSize, unsigned int &totalConstantMem,
  199.     unsigned int &sharedMemPerBlock, int &maxThreadsPerBlock, int &maxThreadsPerMP)
  200. {
  201.     /* code from deviceQuery example*/
  202.     cudaSetDevice(devId);
  203.     cudaDeviceProp deviceProp;
  204.     cudaGetDeviceProperties(&deviceProp, devId);
  205.     printf("\nDevice %d: \"%s\"\n", devId, deviceProp.name);
  206.     cudaDriverGetVersion(&driverVersion);
  207.     cudaRuntimeGetVersion(&runtimeVersion);
  208.     printf("  CUDA Driver Version / Runtime Version          %d.%d / %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
  209.     printf("  CUDA Capability Major/Minor version number:    %d.%d\n", deviceProp.major, deviceProp.minor);
  210.  
  211.     totalGlobalMem = (unsigned long long) deviceProp.totalGlobalMem;
  212.     printf("  Total amount of global memory: (%llu bytes)\n", totalGlobalMem);
  213.     printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
  214.         deviceProp.multiProcessorCount,
  215.         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
  216.         _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) * deviceProp.multiProcessorCount);
  217.  
  218.     printf("  Total amount of constant memory:               %lu bytes\n", deviceProp.totalConstMem);
  219.     totalConstantMem = deviceProp.totalConstMem;
  220.     printf("  Total amount of shared memory per block:       %lu bytes\n", deviceProp.sharedMemPerBlock);
  221.     sharedMemPerBlock = deviceProp.sharedMemPerBlock;
  222.     printf("  Warp size:                                     %d\n", deviceProp.warpSize);
  223.  
  224.     maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
  225.     maxThreadsPerMP = deviceProp.maxThreadsPerMultiProcessor;
  226.     multiProcessorCount = deviceProp.multiProcessorCount;
  227.     cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
  228.  
  229.     warpSize = deviceProp.warpSize;
  230.     printf("\n");
  231. }
  232.  
  233. int main(int argc, char **argv)
  234. {
  235.     clock_t start,end,kernelstart,kernelend;
  236.     start = clock();
  237.  
  238.     int devId = 0;
  239.     int warpSize, multiProcesorCount, cudaCores;
  240.     int runtimeVersion, driverVersion;
  241.     unsigned long long totalGlobalMem;
  242.     unsigned int sharedMemPerBlock, totalConstantMem;
  243.     int maxThreadsPerBlock, maxThreadsPerMP;
  244.  
  245.     cudaDeviceProp deviceProp;
  246.  
  247.     int status = EXIT_SUCCESS;
  248.     std::cout << argv[0] << " starting.. \n";
  249.     std::cout << "CUDA Image filtering - 03:40\n";
  250.  
  251.     unsigned char *g_r = 0; //graphics red
  252.     unsigned char *g_g = 0; //graphics green
  253.     unsigned char *g_b = 0; //graphics blue
  254.     unsigned char *g_or = 0; //graphics output red
  255.     unsigned char *g_og = 0; //graphics output green
  256.     unsigned char *g_ob = 0; //graphics output blue
  257.     cudaError_t cudaStatus;
  258.  
  259.     /* read cuda devices */
  260. #pragma region readDevices
  261.     int deviceCount = 0;
  262.     cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
  263.     if (error_id != cudaSuccess)
  264.     {
  265.         std::cout << "cudaGetDeviceCount returned " << (int)error_id << "->" << cudaGetErrorString(error_id) << "\n";
  266.         std::cout << "Result = FAIL\n";
  267.         exit(EXIT_FAILURE);
  268.     }
  269.     // This function call returns 0 if there are no CUDA capable devices.
  270.     if (deviceCount == 0)
  271.     {
  272.         std::cout << "There are no available device(s) that support CUDA\n";
  273.         std::cout << "Result = FAIL\n";
  274.         exit(EXIT_FAILURE);
  275.     }
  276.     else
  277.     {
  278.         std::cout << "Detected " << deviceCount << " CUDA Capable device(s)\n";
  279.         if (deviceCount > 1)
  280.         {
  281.             std::cout << "Application will run on device with id= " << devId <<"\n";
  282.         }
  283.     }
  284. #pragma endregion readDevices
  285.  
  286.     /* read input parameters */
  287. #pragma region readAndParseInput
  288.     if (argc < 3)
  289.     {
  290.         std::cout << "You passed " << argc << "arguments\n";
  291.         std::cout << "Not enough arguments passed to program\n";
  292.         printHelp();
  293.         std::cout << "Result = FAIL\n";
  294.         exit(EXIT_FAILURE);
  295.     }
  296.     float filter_value[49];
  297.     float factor[2];
  298.  
  299.     /* read filter file and load to constant mem */
  300.     const char field_terminator = ',';
  301.     const char line_terminator  = '\n';
  302.     const char enclosure_char   = '"';
  303.  
  304.     csv_parser file_parser;
  305.     file_parser.set_skip_lines(0);
  306.     file_parser.init(argv[1]);
  307.  
  308.     file_parser.set_enclosed_char(enclosure_char, ENCLOSURE_OPTIONAL); 
  309.     file_parser.set_field_term_char(field_terminator);
  310.     file_parser.set_line_term_char(line_terminator);
  311.  
  312.     std::string num;
  313.     float tmp;
  314.     for (int i = 0; i < 8; i++)
  315.     {
  316.         if(!file_parser.has_more_rows() && 7 != i)
  317.         {
  318.             std::cout << "Malformed csv filter\n";
  319.             std::cout << "line:" << i << std::endl;
  320.             printHelp();
  321.             std::cout << "Result = FAIL\n";
  322.             exit(EXIT_FAILURE);
  323.         }
  324.         csv_row row = file_parser.get_row();
  325.  
  326.         if (i < 7 && 7 == row.size())
  327.         {
  328.             for (int j = 0; j < 7; j++)
  329.             {
  330.                 num = std::string(row[j]);
  331.                 tmp = ::strtod(num.c_str(), 0);
  332.                 filter_value[i * 7 + j] = (float)tmp;
  333.             }
  334.         }
  335.         else if(7 == i && 2 == row.size())
  336.         {
  337.             num = std::string(row[0]);
  338.             tmp = ::strtod(num.c_str(), 0);
  339.             factor[0] = (float)tmp;
  340.             num = std::string(row[1]);
  341.             tmp = ::strtod(num.c_str(), 0);
  342.             factor[1] = (float)tmp;
  343.         }
  344.         else
  345.         {
  346.             std::cout << "Malformed csv filter\n";
  347.             std::cout << "line: " << i << std::endl;
  348.             std::cout << "row size: " << row.size() << endl;
  349.             printHelp();
  350.             std::cout << "Result = FAIL\n";
  351.             exit(EXIT_FAILURE);
  352.         }
  353.     }
  354.  
  355.     std::cout << "Filter from file " << argv[1] << " read succesfully \n";
  356.  
  357.     /* read bmp from file to gpu global mem */
  358.  
  359.     std::string file_name(argv[2]);
  360.     bitmap_image image(file_name);
  361.     if (!image)
  362.     {
  363.         std::cout << "Failed to open image " << file_name.c_str() << "\n";
  364.         printHelp();
  365.         std::cout << "Result = FAIL\n";
  366.         exit(EXIT_FAILURE);
  367.     }
  368.  
  369.     const unsigned int imgWidth = image.width();
  370.     const unsigned int imgHeight = image.height();
  371.     std::cout << "Loaded image " << imgWidth << "X" << imgHeight << " from file " << file_name.c_str() << " \n";
  372.  
  373.     unsigned int imgSize = imgWidth * imgHeight;
  374.     unsigned int imgSizeBytes = sizeof(unsigned char)* 3 * imgSize;
  375.  
  376.     /* check if filer fits into constant mem, load */
  377.     /* it will not, constant memory is too small*/
  378.     int neededConstMem = sizeof(float)* Filter_Total_Size;
  379.     /* check if image fits into global memory, alocate output tables */
  380.     int neededGlobalMem = 2 * imgSizeBytes;
  381.     std::cout << "Application will need total of " << neededGlobalMem <<  " bytes in global memory \n";
  382.     std::cout << "Application will need total of " << neededConstMem << " bytes in constant memory \n";
  383. #pragma endregion readAndParseInput
  384.  
  385. #pragma region readDeviceProperties
  386.     readDeviceAttributes(devId, driverVersion, runtimeVersion, totalGlobalMem, multiProcesorCount, cudaCores,
  387.         warpSize, totalConstantMem, sharedMemPerBlock, maxThreadsPerBlock, maxThreadsPerMP);
  388.  
  389.     if ((unsigned int)neededConstMem > totalConstantMem || neededGlobalMem > totalGlobalMem)
  390.     {
  391.         std::cout << "Not enough memory. Try smaller image or get better device. \n";
  392.         printHelp();
  393.         std::cout << "Result = FAIL\n";
  394.         exit(EXIT_FAILURE);
  395.     }
  396. #pragma endregion readDeviceProperties
  397.  
  398. #pragma region allocateAndCopyMem
  399.     unsigned char *r = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  400.     unsigned char *g = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  401.     unsigned char *b = (unsigned char*)malloc(sizeof(unsigned char) * imgSize);
  402.  
  403.     unsigned char red, green, blue;
  404.     for (std::size_t i = 0; i < imgHeight; i++)
  405.     {
  406.         for (std::size_t j = 0; j < imgWidth; j++)
  407.         {
  408.             image.get_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
  409.             r[(int)i * (int)imgWidth + (int)j] = red;
  410.             g[(int)i * (int)imgWidth + (int)j] = green;
  411.             b[(int)i * (int)imgWidth + (int)j] = blue;
  412.         }
  413.     }
  414.  
  415.     std::cout << "Allocated mem for rgb tables. " << imgSizeBytes << " bytes in total \n";
  416.  
  417.     cudaStatus = cudaSetDevice(devId);
  418.     cudaGetDeviceProperties(&deviceProp, devId);
  419.     if (cudaStatus != cudaSuccess) {
  420.         std::cout << "cudaSetDevice failed! Do you have a CUDA - capable GPU installed ?\n";
  421.         status = EXIT_FAILURE;
  422.         goto Error;
  423.     }
  424.  
  425.     // Allocate GPU buffers for six vectors (3 input, 3 output).
  426.     cudaStatus = cudaMalloc((void**)&g_r, sizeof(unsigned char)* imgSize);
  427.     if (cudaStatus != cudaSuccess) {
  428.         std::cout << "cudaMalloc failed!\n";
  429.         status = EXIT_FAILURE;
  430.         goto Error;
  431.     }
  432.     cudaStatus = cudaMalloc((void**)&g_g, sizeof(unsigned char)* imgSize);
  433.     if (cudaStatus != cudaSuccess) {
  434.         std::cout << "cudaMalloc failed!\n";
  435.         status = EXIT_FAILURE;
  436.         goto Error;
  437.     }
  438.     cudaStatus = cudaMalloc((void**)&g_b, sizeof(unsigned char)* imgSize);
  439.     if (cudaStatus != cudaSuccess) {
  440.         std::cout << "cudaMalloc failed!\n";
  441.         status = EXIT_FAILURE;
  442.         goto Error;
  443.     }
  444.     cudaStatus = cudaMalloc((void**)&g_or, sizeof(unsigned char)* imgSize);
  445.     if (cudaStatus != cudaSuccess) {
  446.         std::cout << "cudaMalloc failed!\n";
  447.         status = EXIT_FAILURE;
  448.         goto Error;
  449.     }
  450.     cudaStatus = cudaMalloc((void**)&g_og, sizeof(unsigned char)* imgSize);
  451.     if (cudaStatus != cudaSuccess) {
  452.         std::cout << "cudaMalloc failed!\n";
  453.         status = EXIT_FAILURE;
  454.         goto Error;
  455.     }
  456.     cudaStatus = cudaMalloc((void**)&g_ob, sizeof(unsigned char)* imgSize);
  457.     if (cudaStatus != cudaSuccess) {
  458.         std::cout << "cudaMalloc failed!\n";
  459.         status = EXIT_FAILURE;
  460.         goto Error;
  461.     }
  462.  
  463.     std::cout << "Space for input and output images mallocked.\n";
  464.  
  465.     // Copy input vectors from host memory to GPU buffers.
  466.     cudaStatus = cudaMemcpy(g_r, r, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  467.     if (cudaStatus != cudaSuccess) {
  468.         std::cout << "cudaMemcpy failed!\n";
  469.         status = EXIT_FAILURE;
  470.         goto Error;
  471.     }
  472.     cudaStatus = cudaMemcpy(g_g, g, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  473.     if (cudaStatus != cudaSuccess) {
  474.         std::cout << "cudaMemcpy failed!\n";
  475.         status = EXIT_FAILURE;
  476.         goto Error;
  477.     }
  478.     cudaStatus = cudaMemcpy(g_b, b, sizeof(unsigned char)* imgSize, cudaMemcpyHostToDevice);
  479.     if (cudaStatus != cudaSuccess) {
  480.         std::cout << "cudaMemcpy failed!\n";
  481.         status = EXIT_FAILURE;
  482.         goto Error;
  483.     }
  484.     std::cout << "Input data coppied into global mem.\n";
  485.     cudaStatus = cudaMemcpyToSymbol(g_filter, filter_value, sizeof(float)* Filter_Total_Size, 0, cudaMemcpyHostToDevice);
  486.     if (cudaStatus != cudaSuccess) {
  487.         std::cout << "cudaMemcpyToSymbol failed!\n";
  488.         status = EXIT_FAILURE;
  489.         goto Error;
  490.     }
  491.     cudaMemcpyToSymbol(g_factor, factor, sizeof(float)*2,0,cudaMemcpyHostToDevice);
  492.     getLastCudaError("Factor copy failed");
  493.     checkCudaErrors(cudaDeviceSynchronize());
  494.  
  495.     std::cout << "Filter data coppied into constant mem.\n";
  496.  
  497. #pragma endregion allocateAndCopyMem
  498.  
  499. #pragma region divideImage
  500.     /*
  501.     int warpSize, multiProcesorCount, cudaCores;
  502.     int runtimeVersion, driverVersion;
  503.     unsigned long long totalGlobalMem;
  504.     unsigned int sharedMemPerBlock, totalConstantMem;
  505.     int maxThreadsPerBlock, maxThreadsPerMP;
  506.     */
  507.  
  508.     float dev_score = (float)cudaCores / (float)warpSize;
  509.     float img_ratio = (float)imgWidth/(float)imgHeight;
  510.    
  511.     std::cout << "Image ratio: " << img_ratio << " cores/warp: " << dev_score << endl;
  512.  
  513.  
  514.     int blocksX = 64;
  515.     int blocksY = 64;
  516.     int threadsX = 8;// 8;
  517.     int threadsY = 8;// 8;
  518.  
  519.     float div_ratio = (float)threadsX / (float)threadsY;
  520.     //zaokrąglone w górę te podziały
  521.     unsigned int blockPartX = (imgWidth / blocksX) + (imgWidth%blocksX != 0);
  522.     unsigned int blockPartY = (imgHeight / blocksY) + (imgHeight%blocksY != 0);
  523.     //to wyjdzie zawsze conajmniej 1, chyba że wymiar obrazu to 0, a to już patola
  524.  
  525.     unsigned int threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  526.     unsigned int threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  527.     //tu też zawsze conajmniej 1
  528.     while(threadsX * threadsY < dev_score * warpSize && (threadsX * threadsY) < maxThreadsPerBlock)
  529.     {
  530.         if(img_ratio > div_ratio && 1 != threadPartX)
  531.         {
  532.             threadsX += 2;
  533.         }
  534.         else if(1 != threadPartY)
  535.         {
  536.             threadsY += 2;
  537.         }
  538.         else
  539.         {
  540.             div_ratio = (float)threadsX / (float)threadsY;
  541.             threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  542.             threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  543.             break;
  544.         }
  545.         div_ratio = (float)threadsX / (float)threadsY;
  546.         threadPartX = (blockPartX / threadsX) + (blockPartX%threadsX != 0);
  547.         threadPartY = (blockPartY / threadsY) + (blockPartY%threadsY != 0);
  548.         if(1 == threadPartX && 1 == threadPartY)
  549.         { break; }
  550.     }
  551.  
  552.     //64x64
  553.     dim3 blocksPerGrid(blocksX, blocksY);
  554.     //and 8x8 threads per block
  555.     dim3 threadsPerBlock(threadsX, threadsY);
  556.  
  557.     std::cout << blocksX << "X" << blocksY << " blocks, each block procesing " << blockPartX << "X" << blockPartY << "pixels.\n";
  558.     std::cout << threadsX << "X" << threadsY << " threads per block, each procesing " << threadPartX << "X" << threadPartY << "pixels.\n";
  559. #pragma endregion divideImage
  560.  
  561. #pragma region runKernels
  562.     /* do magic - run kernel */
  563.     kernelstart = clock();
  564.  
  565.     /*
  566.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  567.     getLastCudaError("Kernel execution failed");
  568.     checkCudaErrors(cudaDeviceSynchronize());
  569.     std::cout << "Red.\n";
  570.  
  571.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_g, g_og, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  572.     getLastCudaError("Kernel execution failed");
  573.     checkCudaErrors(cudaDeviceSynchronize());
  574.     std::cout << "Green.\n";
  575.  
  576.     filterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  577.     getLastCudaError("Kernel execution failed");
  578.     checkCudaErrors(cudaDeviceSynchronize());
  579.     std::cout << "Blue.\n";
  580.     getLastCudaError("Kernel execution failed");
  581.     checkCudaErrors(cudaDeviceSynchronize());
  582.     */
  583.  
  584.     newFilterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, g_g, g_og, g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
  585.     getLastCudaError("Kernel execution failed");
  586.     checkCudaErrors(cudaDeviceSynchronize());
  587.     std::cout << "Red.\n";  std::cout << "Green.\n"; std::cout << "Blue.\n";
  588.  
  589.     kernelend = clock();
  590.     //sekwencyjnie, hehe.
  591.     // Check if kernel execution generated an error
  592.  
  593.     /* start clock */
  594.  
  595.     /*Keep in mind that there is some driver overhead the first time you call
  596.     a particular kernel in your program, so when doing timing studies, you
  597.     should warm up your kernels by calling them once before your timer
  598.     starts.*/
  599.  
  600.     /* end clock */
  601.  
  602. #pragma endregion runKernels
  603.  
  604. #pragma region copyResults
  605.     // Copy output vector from GPU buffer to host memory.
  606.  
  607.     cudaStatus = cudaMemcpy(r, g_or, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  608.     if (cudaStatus != cudaSuccess) {
  609.         std::cout << "cudaMemcpy failed! error: " << cudaStatus << endl;
  610.         status = EXIT_FAILURE;
  611.         goto Error;
  612.     }
  613.     cudaStatus = cudaMemcpy(g, g_og, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  614.     if (cudaStatus != cudaSuccess) {
  615.         std::cout << "cudaMemcpy failed! error: " << cudaStatus << endl;
  616.         status = EXIT_FAILURE;
  617.         goto Error;
  618.     }
  619.     cudaStatus = cudaMemcpy(b, g_ob, sizeof(unsigned char)* imgSize, cudaMemcpyDeviceToHost);
  620.     if (cudaStatus != cudaSuccess) {
  621.         std::cout << "cudaMemcpy failed! error: " << cudaStatus << endl;
  622.         status = EXIT_FAILURE;
  623.         goto Error;
  624.     }
  625. #pragma endregion copyResults
  626.  
  627.     /* assemble and save output image */
  628. #pragma region saveOutput
  629.     for (std::size_t i = 0; i < imgHeight; i++)
  630.     {
  631.         for (std::size_t j = 0; j < imgWidth; j++)
  632.         {
  633.             red = r[(int)i * (int)imgWidth + (int)j];
  634.             green = g[(int)i * (int)imgWidth + (int)j];
  635.             blue = b[(int)i * (int)imgWidth + (int)j];
  636.             image.set_pixel((unsigned int)j, (unsigned int)i, red, green, blue);
  637.         }
  638.     }
  639.     time_t timer = NULL;
  640.     time(&timer);
  641.     struct tm * timeinfo = localtime(&timer);
  642.     char matko[14];
  643.     strftime(matko, 14, "%y%m%d_%H%M%S", timeinfo);
  644.     std::cout << "Saving result image to " << file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(matko) + ".bmp" << " file.\n";
  645.     image.save_image(file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(matko) + ".bmp");
  646. #pragma endregion saveOutput
  647.  
  648.  
  649.     // cudaDeviceReset must be called before exiting in order for profiling and
  650.     // tracing tools such as Nsight and Visual Profiler to show complete traces.
  651.     cudaStatus = cudaDeviceReset();
  652.     if (cudaStatus != cudaSuccess) {
  653.         fprintf(stderr, "cudaDeviceReset failed!");
  654.         return 1;
  655.     }
  656.  
  657. /* free every mallocked space and exit */
  658. #pragma region freeAndExit
  659. Error:
  660.     cudaFree(g_r);
  661.     cudaFree(g_g);
  662.     cudaFree(g_b);
  663.     cudaFree(g_or);
  664.     cudaFree(g_og);
  665.     cudaFree(g_ob);
  666.     //cudaFree(g_filter);
  667.     free(r);
  668.     free(g);
  669.     free(b);
  670.     end = clock();
  671.     double total_diff = double(end - start) / CLOCKS_PER_SEC;
  672.     printf ("Total elapsed time is %.6lf seconds.\n", total_diff );
  673.     if(NULL != kernelstart && NULL != kernelend)
  674.     {
  675.         double kernel_diff = double(kernelend - kernelstart) / CLOCKS_PER_SEC;
  676.         printf ("Kernel functions elapsed time is %.6lf seconds.\n", kernel_diff );
  677.     }
  678.     system("pause");
  679.     exit(status);
  680. #pragma endregion freeAndExit
  681. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement