Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
/**
 * Merge kernel: each thread merges the two adjacent halves of its own segment.
 *
 * Thread t of block b owns the segment starting at (b * tc + t) * size. The
 * first size/2 elements and the following size/2 elements of that segment are
 * merged from vfrom into the same position range of vto. On ties the element
 * from the first half is taken first.
 *
 * The kernel performs no bounds checking: the launch configuration must be
 * such that every thread's segment lies inside both arrays.
 *
 * @param vfrom  input array; both halves of each segment are expected to be
 *               already sorted for the output to be sorted
 * @param vto    output array, written over the same index range as the segment
 * @param size   number of elements per segment (size/2 is integer division)
 * @param from   unused by the kernel; retained for interface compatibility
 * @param tc     number of segments handled per block (used as the block stride)
 */
__global__ void merdzs(int* vfrom, int* vto, int size, int from, int tc) {
    const int half = size / 2;
    const int base = (blockIdx.x * tc + threadIdx.x) * size;

    int left = base;
    int right = base + half;
    const int leftEnd = right;
    const int rightEnd = right + half;

    // Emit one output element per iteration until both halves are consumed.
    for (int out = base; left < leftEnd || right < rightEnd; ++out) {
        // Take from the left half when the right one is exhausted, or when
        // both still have elements and the left head is not greater.
        const bool takeLeft =
            right >= rightEnd || (left < leftEnd && vfrom[left] <= vfrom[right]);
        vto[out] = takeLeft ? vfrom[left++] : vfrom[right++];
    }
}
/**
 * Runs a bottom-up merge sort on the device and copies the result to the host.
 *
 * The two device buffers are used in ping-pong fashion: each pass launches
 * merdzs to merge pairs of adjacent runs from one buffer into the other, and
 * the roles are swapped afterwards. The segment size passed to the kernel is
 * size / tc, where tc (the number of merges in the pass) starts at size / 2
 * and is halved every pass.
 *
 * Precondition: this schedule relies on integer divisions (size / tc,
 * tc / 1024) being exact, which holds when size is a power of two. Other
 * sizes are not handled correctly by this pass layout.
 *
 * @param result         host destination for `size` sorted ints
 * @param size           number of elements; values <= 0 are a no-op
 * @param bufferValues   device buffer holding the unsorted input on entry
 * @param bufferValues2  device scratch buffer of the same length
 */
void mergeSortGPU(int* result, int size, int* bufferValues, int* bufferValues2) {
    // Nothing to sort. This also keeps a negative size from turning into a
    // huge unsigned byte count in the copy below.
    if (size <= 0) {
        return;
    }

    const int maxThreadsPerBlock = 1024;

    int* src = bufferValues;
    int* dst = bufferValues2;
    int pass = 0;

    for (int tc = size / 2; tc >= 1; tc >>= 1) {
        const int threads = tc > maxThreadsPerBlock ? maxThreadsPerBlock : tc;
        const int blocks = tc > maxThreadsPerBlock ? tc / maxThreadsPerBlock : 1;

        merdzs<<<blocks, threads>>>(src, dst, size / tc, pass % 2, threads);

        // The buffer just written becomes the input of the next pass.
        int* justWritten = dst;
        dst = src;
        src = justWritten;
        ++pass;
    }

    cudaDeviceSynchronize();

    // After the final swap `src` is the buffer written by the last pass (or
    // the untouched input when no pass ran). Tracking it directly replaces
    // the floating-point log2(size) parity computation.
    cudaMemcpy(result, src, size * sizeof(int), cudaMemcpyDeviceToHost);
}
/**
 * Sorts `size` ints on the GPU and reports how long the sort took.
 *
 * Allocates two device buffers, uploads `values` into the first, runs
 * mergeSortGPU (which writes the sorted data to `result`), and releases the
 * device buffers before returning. Only the mergeSortGPU call is timed;
 * allocation, upload and deallocation are excluded.
 *
 * @param values  host input array of `size` ints
 * @param result  host output array of `size` ints
 * @param size    number of elements
 * @return elapsed time of the mergeSortGPU call in units of millisecs_t,
 *         converted to int; 0 when size <= 0 (nothing to do); -1 when device
 *         allocation or the upload fails
 */
int GPUMergeSort(int* values, int* result, int size) {
    if (size <= 0) {
        return 0;
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);
    int* bufferValues = nullptr;
    int* bufferValues2 = nullptr;

    if (cudaMalloc((void**)&bufferValues, bytes) != cudaSuccess) {
        return -1;
    }
    if (cudaMalloc((void**)&bufferValues2, bytes) != cudaSuccess) {
        cudaFree(bufferValues);
        return -1;
    }
    if (cudaMemcpy(bufferValues, values, bytes, cudaMemcpyHostToDevice) != cudaSuccess) {
        cudaFree(bufferValues);
        cudaFree(bufferValues2);
        return -1;
    }

    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
    mergeSortGPU(result, size, bufferValues, bufferValues2);
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
    millisecs_t duration(std::chrono::duration_cast<millisecs_t>(end - start));

    // Release the device buffers; previously they leaked on every call.
    cudaFree(bufferValues);
    cudaFree(bufferValues2);

    return duration.count();
}
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement