Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #include "cuda_runtime.h"
- #include "device_launch_parameters.h"
- #include "helper_cuda.h"
- #include <stdlib.h>
- #include <stdio.h>
- #include <memory>
- #include <iostream>
- #include <cstdio>
- #include <cstdlib>
- #include <ctime>
- #include "bitmap_image.hpp"
- #include "csv_parser/csv_parser.hpp"
- #ifndef __CUDACC__
- #define __CUDACC__
- #endif
// Side length of the square convolution window.
const int Filter_Size = 7;
// Number of coefficients in the window (Filter_Size x Filter_Size = 49).
const int Filter_Total_Size = Filter_Size * Filter_Size;

// Convolution coefficients in constant memory, stored row by row:
// the kernels read entry (row, col) at g_filter[row * 7 + col].
__constant__ float g_filter[Filter_Total_Size];
// Post-processing pair applied by the kernels to each accumulated sum:
// result = g_factor[0] * sum + g_factor[1].
__constant__ float g_factor[2];
/**
 * Maps a pixel coordinate to its offset in a row-major image plane.
 *
 * Callers pass signed coordinates such as (x - 3); the conversion to
 * unsigned turns a negative value into a very large one, so it is rejected
 * by the same range test that rejects coordinates past the right/bottom edge.
 *
 * @param x            column of the pixel
 * @param y            row of the pixel
 * @param imageWidth   number of columns in the image
 * @param imageHeight  number of rows in the image
 * @return y * imageWidth + x, or -1 when (x, y) lies outside the image or
 *         the offset cannot be represented in an int.
 */
__host__ __device__ int buffadr(unsigned int x, unsigned int y, unsigned int imageWidth, unsigned int imageHeight)
{
    if (x >= imageWidth || y >= imageHeight)
    {
        return -1;
    }
    // Widen before multiplying: in 32-bit arithmetic the product wraps on
    // very large images and would yield a wrong but plausible offset.
    const unsigned long long offset = (unsigned long long)y * imageWidth + x;
    if (offset > 0x7fffffffULL)
    {
        return -1;
    }
    return (int)offset;
}
/**
 * Convolves one 8-bit image plane with the 7x7 window held in g_filter.
 *
 * Work distribution (2-D grid, 2-D block; only the .x/.y components are used):
 *   - block (blockIdx.x, blockIdx.y) owns the tile that starts at
 *     (blockIdx.x * blockPartWidth, blockIdx.y * blockPartHeight) and is
 *     blockPartWidth x blockPartHeight pixels large, clipped to the image;
 *   - inside the tile, thread (threadIdx.x, threadIdx.y) owns a
 *     threadPartWidth x threadPartHeight rectangle, clipped to the tile.
 *
 * For every owned pixel the window taps that fall outside the image are
 * skipped. The accumulated sum is transformed as
 * g_factor[0] * sum + g_factor[1], clamped to [0, 255] and stored.
 *
 * The sum is accumulated in float. Accumulating in an int would truncate
 * after every tap and silently discard fractional filter coefficients.
 *
 * @param inputColor   source plane, imageWidth * imageHeight bytes, row-major
 * @param outputColor  destination plane of the same size
 */
__global__ void filterKernel(const unsigned char *inputColor, unsigned char *outputColor, unsigned int imageWidth, unsigned int imageHeight,
    unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
{
    // Tile handled by this block; the end coordinates are exclusive.
    const unsigned int blockXstart = blockIdx.x * blockPartWidth;
    const unsigned int blockYstart = blockIdx.y * blockPartHeight;
    unsigned int blockXend = blockXstart + blockPartWidth;
    unsigned int blockYend = blockYstart + blockPartHeight;
    blockXend = (blockXend > imageWidth) ? imageWidth : blockXend;
    blockYend = (blockYend > imageHeight) ? imageHeight : blockYend;

    // Rectangle handled by this thread, clipped to the block's tile.
    const unsigned int threadXstart = threadIdx.x * threadPartWidth + blockXstart;
    const unsigned int threadYstart = threadIdx.y * threadPartHeight + blockYstart;
    unsigned int threadXend = threadXstart + threadPartWidth;
    unsigned int threadYend = threadYstart + threadPartHeight;
    threadXend = (threadXend > blockXend) ? blockXend : threadXend;
    threadYend = (threadYend > blockYend) ? blockYend : threadYend;

    const int radius = Filter_Size / 2;

    for (unsigned int y = threadYstart; y < threadYend; y++)
    {
        for (unsigned int x = threadXstart; x < threadXend; x++)
        {
            const int center = buffadr(x, y, imageWidth, imageHeight);
            if (center < 0)
            {
                continue;
            }

            float sum = 0.0f;
            for (int j = -radius; j <= radius; j++)
            {
                for (int i = -radius; i <= radius; i++)
                {
                    // x + i / y + j wrap to huge unsigned values when they
                    // would be negative; buffadr rejects those with -1.
                    const int tap = buffadr(x + i, y + j, imageWidth, imageHeight);
                    if (tap < 0)
                    {
                        continue;
                    }
                    sum += g_filter[(j + radius) * Filter_Size + (i + radius)] * (float)inputColor[tap];
                }
            }

            // Scale and offset, then clamp before the narrowing cast.
            float value = g_factor[0] * sum + g_factor[1];
            value = fminf(fmaxf(value, 0.0f), 255.0f);
            outputColor[center] = (unsigned char)value;
        }
    }
}
// Applies g_factor (scale, offset) to an accumulated sum and clamps the
// result to the 8-bit range.
__device__ static unsigned char scaleAndClampToByte(float sum)
{
    const float value = g_factor[0] * sum + g_factor[1];
    return (unsigned char)fminf(fmaxf(value, 0.0f), 255.0f);
}

/**
 * Convolves the red, green and blue planes of an image with the 7x7 window
 * held in g_filter in a single pass.
 *
 * Work distribution (2-D grid, 2-D block; only the .x/.y components are used):
 *   - block (blockIdx.x, blockIdx.y) owns the tile that starts at
 *     (blockIdx.x * blockPartWidth, blockIdx.y * blockPartHeight) and is
 *     blockPartWidth x blockPartHeight pixels large, clipped to the image;
 *   - inside the tile, thread (threadIdx.x, threadIdx.y) owns a
 *     threadPartWidth x threadPartHeight rectangle, clipped to the tile.
 *
 * Window taps outside the image and taps whose coefficient is exactly zero
 * are skipped. Each channel's sum is transformed as
 * g_factor[0] * sum + g_factor[1], clamped to [0, 255] and stored.
 *
 * The sums are accumulated in float. Accumulating in an int would truncate
 * after every tap and silently discard fractional filter coefficients.
 *
 * @param inputR/G/B   source planes, imageWidth * imageHeight bytes each, row-major
 * @param outputR/G/B  destination planes of the same size
 */
__global__ void newFilterKernel(const unsigned char *inputR, unsigned char *outputR, const unsigned char *inputG,
    unsigned char *outputG, const unsigned char *inputB, unsigned char *outputB, unsigned int imageWidth, unsigned int imageHeight,
    unsigned int blockPartWidth, unsigned int blockPartHeight, unsigned int threadPartWidth, unsigned int threadPartHeight)
{
    // Tile handled by this block; the end coordinates are exclusive.
    const unsigned int blockXstart = blockIdx.x * blockPartWidth;
    const unsigned int blockYstart = blockIdx.y * blockPartHeight;
    unsigned int blockXend = blockXstart + blockPartWidth;
    unsigned int blockYend = blockYstart + blockPartHeight;
    blockXend = (blockXend > imageWidth) ? imageWidth : blockXend;
    blockYend = (blockYend > imageHeight) ? imageHeight : blockYend;

    // Rectangle handled by this thread, clipped to the block's tile.
    const unsigned int threadXstart = threadIdx.x * threadPartWidth + blockXstart;
    const unsigned int threadYstart = threadIdx.y * threadPartHeight + blockYstart;
    unsigned int threadXend = threadXstart + threadPartWidth;
    unsigned int threadYend = threadYstart + threadPartHeight;
    threadXend = (threadXend > blockXend) ? blockXend : threadXend;
    threadYend = (threadYend > blockYend) ? blockYend : threadYend;

    const int radius = Filter_Size / 2;

    for (unsigned int y = threadYstart; y < threadYend; y++)
    {
        for (unsigned int x = threadXstart; x < threadXend; x++)
        {
            const int center = buffadr(x, y, imageWidth, imageHeight);
            if (center < 0)
            {
                continue;
            }

            float sumR = 0.0f;
            float sumG = 0.0f;
            float sumB = 0.0f;
            for (int j = -radius; j <= radius; j++)
            {
                for (int i = -radius; i <= radius; i++)
                {
                    const float fil = g_filter[(j + radius) * Filter_Size + (i + radius)];
                    if (fil == 0.0f)
                    {
                        continue; // zero coefficient: no need to touch global memory
                    }
                    // x + i / y + j wrap to huge unsigned values when they
                    // would be negative; buffadr rejects those with -1.
                    const int tap = buffadr(x + i, y + j, imageWidth, imageHeight);
                    if (tap < 0)
                    {
                        continue;
                    }
                    sumR += fil * (float)inputR[tap];
                    sumG += fil * (float)inputG[tap];
                    sumB += fil * (float)inputB[tap];
                }
            }

            outputR[center] = scaleAndClampToByte(sumR);
            outputG[center] = scaleAndClampToByte(sumG);
            outputB[center] = scaleAndClampToByte(sumB);
        }
    }
}
/**
 * Writes the command-line usage summary to standard output and then runs the
 * shell command "pause" (on Windows this waits for a key press).
 */
void printHelp()
{
    std::cout << "HELP: \n"
              << "exe filter_file_path image_file_path\n"
              << "\n";
    system("pause");
}
/**
 * Selects CUDA device devId, prints a summary of its properties and returns
 * the values the caller needs through the reference parameters.
 * (Adapted from the deviceQuery sample.)
 *
 * The process is terminated by checkCudaErrors if the device cannot be
 * selected or queried, so the outputs are never filled from an
 * uninitialised cudaDeviceProp.
 *
 * @param devId                ordinal of the device to select and query
 * @param driverVersion        [out] value reported by cudaDriverGetVersion
 * @param runtimeVersion       [out] value reported by cudaRuntimeGetVersion
 * @param totalGlobalMem       [out] global memory size in bytes
 * @param multiProcessorCount  [out] number of multiprocessors
 * @param cudaCores            [out] cores per multiprocessor, as given by _ConvertSMVer2Cores
 * @param warpSize             [out] warp size in threads
 * @param totalConstantMem     [out] constant memory size in bytes
 * @param sharedMemPerBlock    [out] shared memory per block in bytes
 * @param maxThreadsPerBlock   [out] maximum threads per block
 * @param maxThreadsPerMP      [out] maximum resident threads per multiprocessor
 */
void readDeviceAttributes(int devId, int &driverVersion, int &runtimeVersion, unsigned long long &totalGlobalMem,
    int &multiProcessorCount, int &cudaCores, int &warpSize, unsigned int &totalConstantMem,
    unsigned int &sharedMemPerBlock, int &maxThreadsPerBlock, int &maxThreadsPerMP)
{
    checkCudaErrors(cudaSetDevice(devId));
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devId));
    printf("\nDevice %d: \"%s\"\n", devId, deviceProp.name);

    checkCudaErrors(cudaDriverGetVersion(&driverVersion));
    checkCudaErrors(cudaRuntimeGetVersion(&runtimeVersion));
    // Same major/minor decoding of the version numbers as the deviceQuery sample.
    printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion / 1000, (driverVersion % 100) / 10, runtimeVersion / 1000, (runtimeVersion % 100) / 10);
    printf(" CUDA Capability Major/Minor version number: %d.%d\n", deviceProp.major, deviceProp.minor);

    totalGlobalMem = (unsigned long long) deviceProp.totalGlobalMem;
    printf(" Total amount of global memory: (%llu bytes)\n", totalGlobalMem);

    const int coresPerMP = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
    printf(" (%2d) Multiprocessors, (%3d) CUDA Cores/MP: %d CUDA Cores\n",
        deviceProp.multiProcessorCount,
        coresPerMP,
        coresPerMP * deviceProp.multiProcessorCount);

    // size_t has no portable pre-C99 printf length modifier; widen to
    // unsigned long long so the format matches on both LP64 and LLP64.
    printf(" Total amount of constant memory: %llu bytes\n", (unsigned long long)deviceProp.totalConstMem);
    totalConstantMem = (unsigned int)deviceProp.totalConstMem;
    printf(" Total amount of shared memory per block: %llu bytes\n", (unsigned long long)deviceProp.sharedMemPerBlock);
    sharedMemPerBlock = (unsigned int)deviceProp.sharedMemPerBlock;
    printf(" Warp size: %d\n", deviceProp.warpSize);

    maxThreadsPerBlock = deviceProp.maxThreadsPerBlock;
    maxThreadsPerMP = deviceProp.maxThreadsPerMultiProcessor;
    multiProcessorCount = deviceProp.multiProcessorCount;
    cudaCores = coresPerMP;
    warpSize = deviceProp.warpSize;
    printf("\n");
}
// Integer division rounded up; the divisor must be non-zero.
static unsigned int ceilDivUnsigned(unsigned int value, unsigned int divisor)
{
    return (value / divisor) + (value % divisor != 0);
}

// Allocates `bytes` bytes of device memory into *devPtr; reports a failure on stdout.
static bool allocDevicePlane(unsigned char **devPtr, size_t bytes)
{
    if (cudaMalloc((void**)devPtr, bytes) != cudaSuccess)
    {
        std::cout << "cudaMalloc failed!\n";
        return false;
    }
    return true;
}

// Copies one image plane from host to device; reports a failure on stdout.
static bool uploadPlane(unsigned char *devDst, const unsigned char *hostSrc, size_t bytes)
{
    if (cudaMemcpy(devDst, hostSrc, bytes, cudaMemcpyHostToDevice) != cudaSuccess)
    {
        std::cout << "cudaMemcpy failed!\n";
        return false;
    }
    return true;
}

// Copies one image plane from device to host; reports a failure on stdout.
static bool downloadPlane(unsigned char *hostDst, const unsigned char *devSrc, size_t bytes)
{
    const cudaError_t copyStatus = cudaMemcpy(hostDst, devSrc, bytes, cudaMemcpyDeviceToHost);
    if (copyStatus != cudaSuccess)
    {
        std::cout << "cudaMemcpy failed! error: " << copyStatus << std::endl;
        return false;
    }
    return true;
}

/**
 * Entry point: exe filter_file_path image_file_path [device_id]
 *
 * Reads a 7x7 filter plus a (scale, offset) pair from a CSV file, loads a
 * bitmap, convolves its three colour planes on the GPU with newFilterKernel
 * and saves the result next to the input as
 * <name>_output_<yymmdd_HHMMSS>.bmp.
 *
 * Argument and input-file errors terminate immediately with EXIT_FAILURE.
 * Errors after the host buffers are allocated leave the processing section
 * early and fall through to a single clean-up path that releases device and
 * host memory before the device is reset.
 */
int main(int argc, char **argv)
{
    const clock_t start = clock();
    clock_t kernelstart = 0, kernelend = 0;
    bool kernelTimed = false; // set once both kernel timestamps are valid

    int devId = 0;
    int warpSize = 0, multiProcesorCount = 0, cudaCores = 0;
    int runtimeVersion = 0, driverVersion = 0;
    unsigned long long totalGlobalMem = 0;
    unsigned int sharedMemPerBlock = 0, totalConstantMem = 0;
    int maxThreadsPerBlock = 0, maxThreadsPerMP = 0;
    int status = EXIT_SUCCESS;

    std::cout << argv[0] << " starting.. \n";
    std::cout << "CUDA Image filtering - 03:40\n";

    unsigned char *g_r = 0;  // device input, red
    unsigned char *g_g = 0;  // device input, green
    unsigned char *g_b = 0;  // device input, blue
    unsigned char *g_or = 0; // device output, red
    unsigned char *g_og = 0; // device output, green
    unsigned char *g_ob = 0; // device output, blue
    unsigned char *r = 0;    // host red plane
    unsigned char *g = 0;    // host green plane
    unsigned char *b = 0;    // host blue plane
    cudaError_t cudaStatus;

    /* ---- enumerate CUDA devices ---- */
    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        std::cout << "cudaGetDeviceCount returned " << (int)error_id << "->" << cudaGetErrorString(error_id) << "\n";
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    // cudaGetDeviceCount reports 0 when there is no CUDA capable device.
    if (deviceCount == 0)
    {
        std::cout << "There are no available device(s) that support CUDA\n";
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    std::cout << "Detected " << deviceCount << " CUDA Capable device(s)\n";

    /* ---- parse the command line ---- */
    if (argc < 3)
    {
        std::cout << "You passed " << argc << " arguments\n";
        std::cout << "Not enough arguments passed to program\n";
        printHelp();
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    else if (argc == 4 && deviceCount > 1)
    {
        std::cout << "You passed " << argc << " arguments and there are multiple devices to choose from\n";
        const int newDev = atoi(argv[3]);
        if (newDev >= deviceCount || newDev < 0)
        {
            std::cout << "No such device, program will run on first device found." << std::endl;
        }
        else
        {
            devId = newDev;
            std::cout << "Device " << devId << " chosen." << std::endl;
        }
    }

    /* ---- read the filter: 7 rows of 7 coefficients, then one row with 2 factors ---- */
    float filter_value[Filter_Total_Size];
    float factor[2];
    const char field_terminator = ',';
    const char line_terminator = '\n';
    const char enclosure_char = '"';
    csv_parser file_parser;
    file_parser.set_skip_lines(0);
    file_parser.init(argv[1]);
    file_parser.set_enclosed_char(enclosure_char, ENCLOSURE_OPTIONAL);
    file_parser.set_field_term_char(field_terminator);
    file_parser.set_line_term_char(line_terminator);
    for (int i = 0; i < Filter_Size + 1; i++)
    {
        if (!file_parser.has_more_rows() && Filter_Size != i)
        {
            std::cout << "Malformed csv filter\n";
            std::cout << "line:" << i << std::endl;
            printHelp();
            std::cout << "Result = FAIL\n";
            exit(EXIT_FAILURE);
        }
        csv_row row = file_parser.get_row();
        if (i < Filter_Size && Filter_Size == row.size())
        {
            for (int j = 0; j < Filter_Size; j++)
            {
                filter_value[i * Filter_Size + j] = (float)::strtod(std::string(row[j]).c_str(), 0);
            }
        }
        else if (Filter_Size == i && 2 == row.size())
        {
            factor[0] = (float)::strtod(std::string(row[0]).c_str(), 0);
            factor[1] = (float)::strtod(std::string(row[1]).c_str(), 0);
        }
        else
        {
            std::cout << "Malformed csv filter\n";
            std::cout << "line: " << i << std::endl;
            std::cout << "row size: " << row.size() << std::endl;
            printHelp();
            std::cout << "Result = FAIL\n";
            exit(EXIT_FAILURE);
        }
    }
    std::cout << "Filter from file " << argv[1] << " read succesfully \n";

    /* ---- load the bitmap ---- */
    std::string file_name(argv[2]);
    bitmap_image image(file_name);
    if (!image)
    {
        std::cout << "Failed to open image " << file_name.c_str() << "\n";
        printHelp();
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }
    const unsigned int imgWidth = image.width();
    const unsigned int imgHeight = image.height();
    std::cout << "Loaded image " << imgWidth << "X" << imgHeight << " from file " << file_name.c_str() << " \n";

    // Sizes are kept in wide types so that large images cannot overflow them.
    const size_t imgSize = (size_t)imgWidth * imgHeight;               // pixels per plane
    const size_t planeBytes = sizeof(unsigned char) * imgSize;         // bytes per plane
    const size_t imgSizeBytes = 3 * planeBytes;                        // three planes
    const size_t neededConstMem = sizeof(float) * Filter_Total_Size;
    // Three input planes plus three output planes live in global memory.
    const unsigned long long neededGlobalMem = 2ULL * imgSizeBytes;
    std::cout << "Application will need total of " << neededGlobalMem << " bytes in global memory \n";
    std::cout << "Application will need total of " << neededConstMem << " bytes in constant memory \n";

    /* ---- query the device and check that the data fits ---- */
    readDeviceAttributes(devId, driverVersion, runtimeVersion, totalGlobalMem, multiProcesorCount, cudaCores,
        warpSize, totalConstantMem, sharedMemPerBlock, maxThreadsPerBlock, maxThreadsPerMP);
    if (neededConstMem > totalConstantMem || neededGlobalMem > totalGlobalMem)
    {
        std::cout << "Not enough memory. Try smaller image or get better device. \n";
        printHelp();
        std::cout << "Result = FAIL\n";
        exit(EXIT_FAILURE);
    }

    // Single-pass block: any failure sets status and breaks out to the
    // clean-up code below. No jump crosses an initialised declaration.
    do
    {
        /* ---- split the image into three host planes ---- */
        r = (unsigned char*)malloc(planeBytes);
        g = (unsigned char*)malloc(planeBytes);
        b = (unsigned char*)malloc(planeBytes);
        if (0 == r || 0 == g || 0 == b)
        {
            std::cout << "Host memory allocation failed!\n";
            status = EXIT_FAILURE;
            break;
        }
        unsigned char red, green, blue;
        for (unsigned int row = 0; row < imgHeight; row++)
        {
            for (unsigned int col = 0; col < imgWidth; col++)
            {
                const size_t idx = (size_t)row * imgWidth + col;
                image.get_pixel(col, row, red, green, blue);
                r[idx] = red;
                g[idx] = green;
                b[idx] = blue;
            }
        }
        std::cout << "Allocated mem for rgb tables. " << imgSizeBytes << " bytes in total \n";

        cudaStatus = cudaSetDevice(devId);
        if (cudaStatus != cudaSuccess)
        {
            std::cout << "cudaSetDevice failed! Do you have a CUDA - capable GPU installed ?\n";
            status = EXIT_FAILURE;
            break;
        }

        /* ---- device buffers: three input and three output planes ---- */
        if (!allocDevicePlane(&g_r, planeBytes) || !allocDevicePlane(&g_g, planeBytes) ||
            !allocDevicePlane(&g_b, planeBytes) || !allocDevicePlane(&g_or, planeBytes) ||
            !allocDevicePlane(&g_og, planeBytes) || !allocDevicePlane(&g_ob, planeBytes))
        {
            status = EXIT_FAILURE;
            break;
        }
        std::cout << "Space for input and output images mallocked.\n";

        if (!uploadPlane(g_r, r, planeBytes) || !uploadPlane(g_g, g, planeBytes) || !uploadPlane(g_b, b, planeBytes))
        {
            status = EXIT_FAILURE;
            break;
        }
        std::cout << "Input data coppied into global mem.\n";

        /* ---- filter coefficients and factors go to constant memory ---- */
        cudaStatus = cudaMemcpyToSymbol(g_filter, filter_value, sizeof(float) * Filter_Total_Size, 0, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess)
        {
            std::cout << "cudaMemcpyToSymbol failed!\n";
            status = EXIT_FAILURE;
            break;
        }
        cudaStatus = cudaMemcpyToSymbol(g_factor, factor, sizeof(float) * 2, 0, cudaMemcpyHostToDevice);
        if (cudaStatus != cudaSuccess)
        {
            std::cout << "cudaMemcpyToSymbol failed!\n";
            status = EXIT_FAILURE;
            break;
        }
        checkCudaErrors(cudaDeviceSynchronize());
        std::cout << "Filter data coppied into constant mem.\n";

        /* ---- choose how the image is divided among blocks and threads ---- */
        const float dev_score = (float)cudaCores / (float)warpSize;
        const float img_ratio = (float)imgWidth / (float)imgHeight;
        std::cout << "Image ratio: " << img_ratio << " cores/warp: " << dev_score << std::endl;
        const int blocksX = 64;
        const int blocksY = 64;
        int threadsX = 8;
        int threadsY = 8;
        float div_ratio = (float)threadsX / (float)threadsY;
        // Tile sizes are rounded up so the grid always covers the whole image.
        // They are at least 1 unless an image dimension is 0.
        const unsigned int blockPartX = ceilDivUnsigned(imgWidth, blocksX);
        const unsigned int blockPartY = ceilDivUnsigned(imgHeight, blocksY);
        unsigned int threadPartX = ceilDivUnsigned(blockPartX, threadsX);
        unsigned int threadPartY = ceilDivUnsigned(blockPartY, threadsY);
        // Grow the block two threads at a time, in the direction suggested by
        // the image aspect ratio, until it has as many threads as
        // dev_score * warpSize, reaches the per-block limit, or every thread
        // is down to a single pixel.
        while (threadsX * threadsY < dev_score * warpSize && (threadsX * threadsY) < maxThreadsPerBlock)
        {
            if (img_ratio > div_ratio && 1 != threadPartX)
            {
                threadsX += 2;
            }
            else if (1 != threadPartY)
            {
                threadsY += 2;
            }
            else
            {
                break; // nothing changed, so the per-thread sizes are still current
            }
            div_ratio = (float)threadsX / (float)threadsY;
            threadPartX = ceilDivUnsigned(blockPartX, threadsX);
            threadPartY = ceilDivUnsigned(blockPartY, threadsY);
            if (1 == threadPartX && 1 == threadPartY)
            {
                break;
            }
        }
        dim3 blocksPerGrid(blocksX, blocksY);
        dim3 threadsPerBlock(threadsX, threadsY);
        std::cout << blocksX << "X" << blocksY << " blocks, each block procesing " << blockPartX << "X" << blockPartY << "pixels.\n";
        std::cout << threadsX << "X" << threadsY << " threads per block, each procesing " << threadPartX << "X" << threadPartY << "pixels.\n";

        /* ---- run the kernel: all three planes in one launch ---- */
        // (filterKernel offers the same filter for a single plane per launch.)
        // NOTE: the first launch of a kernel carries extra driver overhead;
        // no warm-up launch is done here, so that overhead is included.
        kernelstart = clock();
        newFilterKernel <<<blocksPerGrid, threadsPerBlock >>>(g_r, g_or, g_g, g_og, g_b, g_ob, imgWidth, imgHeight, blockPartX, blockPartY, threadPartX, threadPartY);
        getLastCudaError("Kernel execution failed");
        checkCudaErrors(cudaDeviceSynchronize());
        std::cout << "Red.\n"; std::cout << "Green.\n"; std::cout << "Blue.\n";
        kernelend = clock();
        kernelTimed = true;

        /* ---- copy the filtered planes back ---- */
        if (!downloadPlane(r, g_or, planeBytes) || !downloadPlane(g, g_og, planeBytes) || !downloadPlane(b, g_ob, planeBytes))
        {
            status = EXIT_FAILURE;
            break;
        }

        /* ---- assemble and save the output image ---- */
        for (unsigned int row = 0; row < imgHeight; row++)
        {
            for (unsigned int col = 0; col < imgWidth; col++)
            {
                const size_t idx = (size_t)row * imgWidth + col;
                image.set_pixel(col, row, r[idx], g[idx], b[idx]);
            }
        }
        time_t timer = 0;
        time(&timer);
        struct tm *timeinfo = localtime(&timer);
        char stamp[14]; // "yymmdd_HHMMSS" is 13 characters plus the terminator
        strftime(stamp, sizeof(stamp), "%y%m%d_%H%M%S", timeinfo);
        // The last four characters of the input name are replaced.
        const std::string outputName = file_name.substr(0, file_name.size() - 4) + "_output_" + std::string(stamp) + ".bmp";
        std::cout << "Saving result image to " << outputName << " file.\n";
        image.save_image(outputName);
    } while (false);

    /* ---- release everything, then reset the device ---- */
    cudaFree(g_r);
    cudaFree(g_g);
    cudaFree(g_b);
    cudaFree(g_or);
    cudaFree(g_og);
    cudaFree(g_ob);
    free(r);
    free(g);
    free(b);
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    // It comes after cudaFree because the reset invalidates the allocations.
    if (cudaDeviceReset() != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceReset failed!");
        status = EXIT_FAILURE;
    }

    const clock_t end = clock();
    const double total_diff = double(end - start) / CLOCKS_PER_SEC;
    printf("Total elapsed time is %.6lf seconds.\n", total_diff);
    if (kernelTimed)
    {
        const double kernel_diff = double(kernelend - kernelstart) / CLOCKS_PER_SEC;
        printf("Kernel functions elapsed time is %.6lf seconds.\n", kernel_diff);
    }
    system("pause");
    return status;
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement