// Merge sort on the GPU.

/**
 * Merge kernel: every thread merges two adjacent sorted runs into one.
 *
 * A thread owns the `size` consecutive elements that start at
 *     blockIdx.x * tc * size + threadIdx.x * size.
 * The first size/2 elements form the left run and the remaining
 * size - size/2 elements form the right run. Both runs are read from `vfrom`
 * and must already be sorted ascending; the merged sequence is written to the
 * same index range of `vto`.
 *
 * @param vfrom  device pointer to the input values (read only here)
 * @param vto    device pointer to the output values; must not alias `vfrom`,
 *               because the output range is written while the input is read
 * @param size   number of elements in one thread's segment (both runs together)
 * @param from   unused by this kernel; kept so existing launches stay valid
 * @param tc     number of segments covered by one block; used as the
 *               per-block stride when computing a thread's segment start
 *
 * The kernel receives no total array length, so it cannot bounds-check:
 * the launch must guarantee that every thread's segment lies inside both
 * buffers.
 */
__global__ void merdzs(int* vfrom, int* vto, int size, int from, int tc) {
    const int u   = blockIdx.x*tc*size + threadIdx.x*size;  // segment start
    const int mid = u + size/2;  // end of left run / start of right run
    const int end = u + size;    // one past the segment; covers odd `size` too

    int a = u;    // read cursor in the left run
    int b = mid;  // read cursor in the right run
    int k = u;    // write cursor in vto

    // Standard two-way merge; `<=` takes from the left run on ties.
    while (a < mid && b < end) {
        if (vfrom[a] <= vfrom[b]) {
            vto[k++] = vfrom[a++];
        } else {
            vto[k++] = vfrom[b++];
        }
    }

    // At most one of the runs still has elements; copy whatever is left.
    while (a < mid) {
        vto[k++] = vfrom[a++];
    }
    while (b < end) {
        vto[k++] = vfrom[b++];
    }
}

/**
 * Sorts `size` ints that already reside in `bufferValues` (device memory) and
 * copies the sorted sequence to `result` (host memory).
 *
 * The sort is a sequence of merge passes. Pass i launches `tc` threads, each
 * merging one segment of size/tc elements, with tc = size/2, size/4, ..., 1.
 * The two device buffers are used alternately as source and destination of
 * consecutive passes.
 *
 * @param result         host buffer receiving `size` sorted ints
 * @param size           number of elements; the tc / run-length arithmetic
 *                       below only tiles the array exactly when `size` is a
 *                       power of two
 * @param bufferValues   device buffer of `size` ints holding the input
 * @param bufferValues2  device scratch buffer of `size` ints
 *
 * Both device buffers are overwritten. CUDA errors are not reported: the
 * function returns void and the launch/copy status codes are not inspected.
 */
void mergeSortGPU(int* result, int size, int* bufferValues, int* bufferValues2) {
    // Nothing to sort or copy. This also avoids evaluating anything like
    // log2(size) for non-positive sizes.
    if (size <= 0) {
        return;
    }

    const int mtc = 1024;  // upper limit on threads per block used below

    int* src = bufferValues;   // holds the current, partially sorted data
    int* dst = bufferValues2;  // receives the output of the next pass
    int p = 0;                 // number of passes launched so far

    for (int tc = size/2; tc >= 1; tc >>= 1) {
        const int threads = tc > mtc ? mtc : tc;
        const int blocks  = tc > mtc ? tc/mtc : 1;

        // Arguments: source, destination, segment length per thread,
        // pass parity (unused by the kernel), segments per block.
        merdzs<<<blocks, threads>>>(src, dst, size/tc, p % 2, threads);

        // The destination of this pass is the source of the next one.
        int* tmp = src;
        src = dst;
        dst = tmp;
        ++p;
    }

    cudaDeviceSynchronize();

    // After the final swap `src` is the buffer written by the last pass (or
    // the untouched input when no pass ran), so no separate parity or
    // logarithm computation is needed to find the result.
    cudaMemcpy(result, src, size * sizeof(int), cudaMemcpyDeviceToHost);
}

/**
 * Sorts `size` ints from host array `values` on the GPU, stores the sorted
 * sequence in host array `result`, and returns the elapsed time.
 *
 * Two device buffers of `size` ints are allocated, the input is uploaded to
 * the first one, and mergeSortGPU() performs the sort and the download into
 * `result`. The timed interval is exactly the mergeSortGPU() call; allocation,
 * upload and deallocation are outside it.
 *
 * @param values  host input array of `size` ints
 * @param result  host output array with room for `size` ints
 * @param size    number of elements
 * @return elapsed time of the mergeSortGPU() call as `millisecs_t` ticks
 *         narrowed to int; 0 when `size <= 0` (nothing is done); -1 when
 *         device allocation or the upload fails (`result` is left untouched).
 */
int GPUMergeSort(int* values, int* result, int size) {
    if (size <= 0) {
        return 0;
    }

    const size_t bytes = static_cast<size_t>(size) * sizeof(int);

    int* bufferValues = nullptr;
    int* bufferValues2 = nullptr;

    const bool ready =
        cudaMalloc((void**)&bufferValues, bytes) == cudaSuccess &&
        cudaMalloc((void**)&bufferValues2, bytes) == cudaSuccess &&
        cudaMemcpy(bufferValues, values, bytes, cudaMemcpyHostToDevice) == cudaSuccess;

    if (!ready) {
        // cudaFree accepts a null pointer, so this is safe whichever step failed.
        cudaFree(bufferValues);
        cudaFree(bufferValues2);
        return -1;
    }

    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
    mergeSortGPU(result, size, bufferValues, bufferValues2);
    std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();

    // Release the device buffers; previously they were leaked on every call.
    cudaFree(bufferValues);
    cudaFree(bufferValues2);

    millisecs_t duration(std::chrono::duration_cast<millisecs_t>(end - start));
    return static_cast<int>(duration.count());
}