- 1 #define _CUDA_NDARRAY_C
- 2
- 3 #include <Python.h>
- 4 #include <structmember.h>
- 5 #include "theano_mod_helper.h"
- 6
- 7 #include <numpy/arrayobject.h>
- 8 #include <iostream>
- 9
- 10 #include "cuda_ndarray.cuh"
- 11
- 12 #ifndef CNMEM_DLLEXPORT
- 13 #define CNMEM_DLLEXPORT
- 14 #endif
- 15
- 16 #include "cnmem.h"
- 17 #include "cnmem.cpp"
- 18
- 19 //If true, when there is a gpu malloc or free error, we print the size of allocated memory on the device.
- 20 #define COMPUTE_GPU_MEM_USED 0
- 21
- 22 //If true, we fill allocated device memory with NaN.
- 23 #define ALLOC_MEMSET 0
- 24
- 25 //If true, we print out when we free a device pointer, uninitialize a
- 26 //CudaNdarray, or allocate a device pointer
- 27 #define PRINT_FREE_MALLOC 0
- 28
- 29 //If true, we do error checking at the start of functions, to make sure there
- 30 //is not a pre-existing error when the function is called.
- 31 //You probably need to set the environment variable
- 32 //CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
- 33 //preprocessor macro in cuda_ndarray.cuh
- 34 //if you want this to work.
- 35 #define PRECHECK_ERROR 0
- 36
- 37 cublasHandle_t handle = NULL;
- 38 int* err_var = NULL;
- 39
- 40 /////////////////////////
- 41 // Alloc and Free
- 42 /////////////////////////
- 43
- 44 static int g_gpu_context_active = 0;
- 45
- 46
- 47 PyObject *
- 48 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args);
- 49 static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
- 50
- 51
- 52 /**
- 53 *
- 54 * In the test program I'm using, the _outstanding_mallocs decreases with every call.
- 55 * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
- 56 *
- 57 */
- 58 int _outstanding_mallocs[] = {0,0};
- 59
- 60 #if COMPUTE_GPU_MEM_USED
- 61 size_t _allocated_size = 0;
- 62 size_t _max_allocated_size = 0;
- 63
- 64 const int TABLE_SIZE = 10000;
- 65 struct table_struct{
- 66 void* ptr;
- 67 size_t size;
- 68 };
- 69 table_struct _alloc_size_table[TABLE_SIZE];
- 70 #endif
- 71
- 72 void * device_malloc(size_t size)
- 73 {
- 74 return device_malloc(size, VERBOSE_DEVICE_MALLOC);
- 75 }
- 76
- 77 ///@TODO: thejaswi: link this option to a theano config variable?
- 78 static bool g_use_cnmem = false;
- 79 static const int g_max_devices = 8;
- 80 int initCnmem(int card_number_provided, int card_nb, size_t mem) {
- 81 static bool cnmemInitialized = false;
- 82 if(cnmemInitialized) {
- 83 return 0;
- 84 }
- 85 // On stderr to be at the same place as "Using gpu device..."
- 86 int numDevices = 0;
- 87 cnmemDevice_t devices[g_max_devices];
- 88 if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
- 89 PyErr_Format(PyExc_RuntimeError,
- 90 "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
- 91 cudaGetErrorString(cudaGetLastError()));
- 92 return -1;
- 93 }
- 94 if(card_number_provided){
- 95 numDevices = 1;
- 96 int i = 0;
- 97 devices[i].device = card_nb;
- 98 devices[i].size = mem;
- 99 ///@TODO: thejaswi: add support for multiple streams
- 100 devices[i].numStreams = 0;
- 101 devices[i].streams = NULL;
- 102 devices[i].streamSizes = NULL;
- 103 }else{
- 104 for(int i=0;i<numDevices;++i) {
- 105 devices[i].device = i;
- 106 devices[i].size = mem;
- 107 ///@TODO: thejaswi: add support for multiple streams
- 108 devices[i].numStreams = 0;
- 109                 devices[i].streams = NULL; devices[i].streamSizes = NULL; // match the single-device branch above
- 110 }
- 111 }
- 112
- 113 ///@TODO: thejaswi: passing custom cnmem flags?
- 114 cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
- 115 if(status != CNMEM_STATUS_SUCCESS) {
- 116 PyErr_Format(PyExc_RuntimeError,
- 117 "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
- 118 cnmemGetErrorString(status), numDevices);
- 119 return -1;
- 120 }
- 121 cnmemInitialized = true;
- 122 return 0;
- 123 }
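
// [Editor's illustrative sketch, not part of the original file] initCnmem is
// intended to be called once, before any device_malloc, when the CNMeM pool is
// enabled. A minimal sketch for a single, already-selected device (the device
// id and byte budget below are placeholders):
static int example_enable_cnmem_pool(int device_id, size_t pool_bytes)
{
    // card_number_provided=1 means "use exactly this card" rather than all devices.
    if (initCnmem(1, device_id, pool_bytes) != 0)
        return -1;  // Python error already set by initCnmem
    g_use_cnmem = true;  // from now on device_malloc/device_free go through cnmem
    return 0;
}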
- 124
- 125 void * device_malloc(size_t size, int verbose)
- 126 {
- 127 #if PRECHECK_ERROR
- 128 cudaThreadSynchronize();
- 129 cudaError_t prevError = cudaGetLastError();
- 130 if (cudaSuccess != prevError)
- 131 {
- 132 fprintf(stderr,
- 133 "Error existed before calling device_malloc. %s\n",
- 134 cudaGetErrorString(prevError)
- 135 );
- 136 }
- 137 #endif
- 138 void * rval=NULL;
- 139 ///@TODO: thejaswi: support for multiple-streams?
- 140 if(g_use_cnmem) {
- 141 cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
- 142 status = cnmemMalloc(&rval, size, NULL);
- 143 if(status != CNMEM_STATUS_SUCCESS) {
- 144 PyErr_Format(PyExc_MemoryError,
- 145 "Error allocating %llu bytes of device memory (%s).",
- 146 (unsigned long long)size, cnmemGetErrorString(status));
- 147 return NULL;
- 148 }
- 149 }
- 150 else {
- 151 cudaError_t err = cudaMalloc(&rval, size);
- 152 if (cudaSuccess != err)
- 153 {
- 154 // Clear the error flag, cudaMalloc doesn't do it.
- 155 // Currently this returns the same thing as err, but if in future
- 156 // it returns something else I still don't see why we should ignore
- 157 // it. All we want to do here is reset the flag.
- 158 cudaGetLastError();
- 159 if (verbose)
- 160 {
- 161 size_t free = 0, total = 0;
- 162 cudaError_t err2 = cudaMemGetInfo(&free, &total);
- 163 if (err2 != cudaSuccess){
- 164 cudaGetLastError();
- 165 fprintf(stderr,
- 166 "Error when trying to find the memory information"
- 167 " on the GPU: %s\n", cudaGetErrorString(err2));
- 168 }
- 169 #if COMPUTE_GPU_MEM_USED
- 170 fprintf(stderr,
- 171 "Error allocating %llu bytes of device memory (%s)."
- 172 " new total bytes allocated: %llu."
- 173 " Driver report %llu bytes free and %llu bytes total \n",
- 174 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)_allocated_size,
- 175 (unsigned long long)free, (unsigned long long)total);
- 176 #else
- 177 fprintf(stderr,
- 178 "Error allocating %llu bytes of device memory (%s)."
- 179 " Driver report %llu bytes free and %llu bytes total \n",
- 180 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
- 181 #endif
- 182 }
- 183 PyErr_Format(PyExc_MemoryError,
- 184 "Error allocating %llu bytes of device memory (%s).",
- 185 (unsigned long long)size, cudaGetErrorString(err));
- 186 return NULL;
- 187 }
- 188 }
- 189 if (rval != NULL){
- 190         // Can it happen that cudaMalloc returns cudaSuccess, but returns a NULL ptr?
- 191         // Could this be what happens if size is 0?
- 192 _outstanding_mallocs[0] += 1;
- 193
- 194 #if COMPUTE_GPU_MEM_USED
- 195 _allocated_size += size;
- 196 _max_allocated_size = std::max(_max_allocated_size, _allocated_size);
- 197 int i = 0;
- 198 for(;i<TABLE_SIZE;i++){
- 199 if(NULL==_alloc_size_table[i].ptr){
- 200 _alloc_size_table[i].ptr=rval;
- 201 _alloc_size_table[i].size=size;
- 202 break;
- 203 }
- 204 }
- 205 if (i == TABLE_SIZE){
- 206 fprintf(stderr,
- 207 "When tracking GPU malloc, our table size wasn't big enough."
- 208                     " So we lose some tracking. Raise the value of TABLE_SIZE in the file cuda_ndarray.cu");
- 209 }
- 210 #endif
- 211 }
- 212 //fprintf(stderr,
- 213 //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
- 214 //(long)size, cudaGetErrorString(err),_allocated_size,rval);
- 215
- 216 if(ALLOC_MEMSET){
- 217         //We init the memory to NaN to make it easier to catch bugs.
- 218 cudaMemset(rval, 0xFF, size);
- 219 //printf("MEMSET\n");
- 220 }
- 221 #if PRINT_FREE_MALLOC
- 222 fprintf(stderr, "device malloc %p of size %d\n", rval, size);
- 223 #endif
- 224 return rval;
- 225 }
- 226
- 227 int device_free(void *ptr)
- 228 {
- 229 #if PRECHECK_ERROR
- 230 cudaThreadSynchronize();
- 231 cudaError_t prevError = cudaGetLastError();
- 232 if (cudaSuccess != prevError)
- 233 {
- 234 fprintf(stderr,
- 235 "Error existed before calling device_free. %s\n",
- 236 cudaGetErrorString(prevError)
- 237 );
- 238 }
- 239 #endif
- 240 #if PRINT_FREE_MALLOC
- 241 size_t free = 0, total = 0;
- 242 cudaError_t err2 = cudaMemGetInfo(&free, &total);
- 243 if (err2 != cudaSuccess){
- 244 cudaGetLastError();
- 245 fprintf(stderr,
- 246             "Error when trying to find the memory information"
- 247 " on the GPU: %s\n", cudaGetErrorString(err2));
- 248 }
- 249 #if COMPUTE_GPU_MEM_USED
- 250 {
- 251 int i = 0;
- 252 for(;i<TABLE_SIZE;i++)
- 253 if(_alloc_size_table[i].ptr==ptr){
- 254 break;
- 255 }
- 256 assert(i<TABLE_SIZE);
- 257 fprintf(stderr, "device_free %p of size %d."
- 258 " Driver report %d bytes free and %d bytes total \n",
- 259 ptr, _alloc_size_table[i].size, free, total);
- 260 }
- 261 #else
- 262 fprintf(stderr, "device_free %p."
- 263 " Driver report %d bytes free and %d bytes total \n",
- 264 ptr, free, total);
- 265 #endif
- 266 #endif
- 267
- 268 // if there is no gpu context, the call to cudaFree will fail; skip it entirely
- 269 if(!g_gpu_context_active) {
- 270 return 0;
- 271 }
- 272
- 273 ///@TODO: thejaswi: multi-stream support
- 274 if(g_use_cnmem) {
- 275 cnmemStatus_t status = cnmemFree(ptr, NULL);
- 276 if(status != CNMEM_STATUS_SUCCESS) {
- 277 fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
- 278 cnmemGetErrorString(status));
- 279 }
- 280 }
- 281 else {
- 282         // We need to sync because Theano's GC could remove intermediate variables that
- 283         // are still needed while the gpu kernels are running or queued.
- 284 CNDA_BEGIN_ALLOW_THREADS
- 285 cudaThreadSynchronize();
- 286 CNDA_END_ALLOW_THREADS
- 287
- 288 cudaError_t err = cudaFree(ptr);
- 289 if (cudaSuccess != err)
- 290 {
- 291 // Clear the error flag, cudaFree doesn't do it.
- 292 // Currently this returns the same thing as err, but if in future
- 293 // it returns something else I still don't see why we should ignore
- 294 // it. All we want to do here is reset the flag.
- 295 cudaGetLastError();
- 296 size_t free = 0, total = 0;
- 297 cudaError_t err2 = cudaMemGetInfo(&free, &total);
- 298 if (err2 != cudaSuccess){
- 299 cudaGetLastError();
- 300 fprintf(stderr,
- 301                     "Error when trying to find the memory information"
- 302 " on the GPU: %s\n", cudaGetErrorString(err2));
- 303 }
- 304 #if COMPUTE_GPU_MEM_USED
- 305 {
- 306 int i = 0;
- 307 for(;i<TABLE_SIZE;i++)
- 308 if(_alloc_size_table[i].ptr==ptr){
- 309 break;
- 310 }
- 311 assert(i<TABLE_SIZE);
- 312 fprintf(stderr,
- 313 "Error freeing device pointer %p (%s) of size %llu. %llu byte already allocated."
- 314 " Driver report %llu bytes free and %llu bytes total \n",
- 315 ptr, cudaGetErrorString(err),
- 316 (unsigned long long)_alloc_size_table[i].size, (unsigned long long)_allocated_size, (unsigned long long)free, (unsigned long long)total);
- 317 }
- 318 #else
- 319 fprintf(stderr,
- 320 "Error freeing device pointer %p (%s)."
- 321 " Driver report %llu bytes free and %llu bytes total \n",
- 322 ptr,
- 323 cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
- 324 #endif
- 325 if (NULL != PyErr_Occurred()){
- 326 fprintf(stderr,
- 327                        "device_free: cudaFree() returned an error, but there is already a"
- 328                        " Python error set. This happens during cleanup when there is an"
- 329                        " earlier error and the CUDA driver is in such a bad state that it"
- 330                        " no longer works. We keep the previous error set to help debug it.");
- 331 return -1;
- 332 }
- 333 PyErr_Format(PyExc_MemoryError,
- 334 "error freeing device pointer %p (%s)",
- 335 ptr,
- 336 cudaGetErrorString(err));
- 337 return -1;
- 338 }
- 339 }
- 340 _outstanding_mallocs[0] -= (ptr != NULL);
- 341 #if COMPUTE_GPU_MEM_USED
- 342 int i=0;
- 343 size_t total_freed = 0;
- 344 for(;i<TABLE_SIZE;i++)
- 345 if(_alloc_size_table[i].ptr==ptr){
- 346 _allocated_size -= _alloc_size_table[i].size;
- 347 total_freed += _alloc_size_table[i].size;
- 348 _alloc_size_table[i].ptr=0;
- 349 _alloc_size_table[i].size=0;
- 350
- 351 break;
- 352 }
- 353 //if(i==TABLE_SIZE)
- 354 // printf("Unallocated unknow size!\n");
- 355 //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
- 356 #endif
- 357 return 0;
- 358 }
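
// [Editor's illustrative sketch, not part of the original file] The usual
// pairing of the two helpers above: device_malloc returns NULL and sets a
// Python MemoryError on failure, device_free returns -1 (with a Python error
// set) on failure, so callers only need to propagate the error.
static int example_roundtrip_device_buffer(size_t n_floats)
{
    void * buf = device_malloc(n_floats * sizeof(real));
    if (!buf)
        return -1;  // Python error already set by device_malloc
    // ... launch kernels that read/write buf here ...
    if (device_free(buf))
        return -1;  // Python error already set by device_free
    return 0;
}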
- 359
- 360 static PyObject *
- 361 outstanding_mallocs(PyObject* self, PyObject * args)
- 362 {
- 363 return PyInt_FromLong(_outstanding_mallocs[0]);
- 364 }
- 365
- 366
- 367 static void *work_mem = NULL;
- 368 static size_t work_size = 0;
- 369
- 370 /*
- 371 * Returns a chunk of memory for temporary work inside of an op. You can only
- 372 * request a single chunk of memory at a time since it is reused.
- 373 */
- 374 void *get_work_mem(size_t sz) {
- 375 if (sz <= work_size)
- 376 return work_mem;
- 377 device_free(work_mem);
- 378 work_mem = device_malloc(sz);
- 379 work_size = sz;
- 380 if (work_mem == NULL)
- 381 work_size = 0;
- 382 return work_mem;
- 383 }
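
// [Editor's illustrative sketch, not part of the original file] Typical use of
// get_work_mem inside an op: ask for a scratch buffer sized for this call, and
// never free or cache the pointer, since the single chunk is reused by the
// next caller and may be reallocated.
static float * example_get_scratch(size_t n_elements)
{
    // Returns NULL with a Python error set if the allocation fails.
    return (float *) get_work_mem(n_elements * sizeof(float));
}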
- 384
- 385 /////////////////////////
- 386 // Static helper methods
- 387 /////////////////////////
- 388
- 389 static void
- 390 CudaNdarray_null_init(CudaNdarray*self)
- 391 {
- 392 self->base = NULL;
- 393 self->nd = -1;
- 394 self->host_structure = NULL;
- 395 self->data_allocated = 0;
- 396 self->dev_structure_fresh = 1;
- 397 self->dev_structure = NULL;
- 398 self->devdata = NULL;
- 399 }
- 400
- 401 static int
- 402 CudaNdarray_uninit(CudaNdarray*self)
- 403 {
- 404 #if PRINT_FREE_MALLOC
- 405 fprintf(stderr, "CudaNdarray_uninit %p\n", self);
- 406 #endif
- 407 int rval = 0;
- 408 if (self->data_allocated) {
- 409 assert(self->devdata);
- 410 if (device_free(self->devdata))
- 411 {
- 412 fprintf(stderr,
- 413 "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
- 414 self, self->devdata);
- 415 rval = -1;
- 416 }
- 417 self->devdata = NULL;
- 418 self->data_allocated = 0;
- 419 }
- 420 if (self->dev_structure)
- 421 {
- 422 if (device_free(self->dev_structure))
- 423 {
- 424 fprintf(stderr,
- 425 "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
- 426 self->dev_structure, self);
- 427 rval = -1;
- 428 }
- 429 self->dev_structure = NULL;
- 430 }
- 431 if (self->host_structure)
- 432 {
- 433 free(self->host_structure);
- 434 self->host_structure = NULL;
- 435 }
- 436 self->nd = -1;
- 437 Py_XDECREF(self->base);
- 438 self->base = NULL;
- 439 return rval;
- 440 }
- 441
- 442
- 443 //make the rightmost coords change fastest
- 444 //TODO: why does a downward for-loop not work????
- 445 //TODO: use the log2_dims and driver code to remove / and %
- 446 //TODO: skip the last division (when d == 0)
- 447 #define decl_k_elemwise_unary_rowmajor(name, F) \
- 448 __global__ void name (unsigned int numEls, \
- 449 unsigned int nd, \
- 450 const int * dim, \
- 451 const float * a_data, const int * a_str, \
- 452 float * z_data, const int * z_str) \
- 453 { \
- 454 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
- 455 const unsigned int numThreads = blockDim.x * gridDim.x; \
- 456 \
- 457 for (unsigned int i = idx; i < numEls; i += numThreads) \
- 458 { \
- 459 unsigned int ii = i; \
- 460 const float * a_i = a_data; \
- 461 float * z_i = z_data; \
- 462 for (unsigned int _d = 0; _d < nd; ++_d) \
- 463 { \
- 464 unsigned int d = nd - _d-1; \
- 465 int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
- 466 ii = ii / dim[d]; \
- 467 a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
- 468 z_i += i_d * z_str[d]; \
- 469 } \
- 470 z_i[0] = F(a_i[0]); \
- 471 } \
- 472 }
- 473
- 474 template<typename T> __device__ T unary_copy(T a) { return a; }
- 475 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy<float>)
- 476
- 477 template<typename T> __device__ T unary_exp(T a) { return exp(a); }
- 478 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp<float>)
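
// [Editor's illustrative sketch, not part of the original file] The macro above
// makes adding another unary elemwise kernel a two-line job: define a device
// functor and instantiate the macro once. For example, a hypothetical log kernel:
template<typename T> __device__ T unary_log(T a) { return log(a); }
decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_log, unary_log<float>)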
- 479
- 480 /////////////////////////////
- 481 // Satisfying reqs to be Type
- 482 /////////////////////////////
- 483
- 484 //DON'T use this directly (if there are other CudaNdarrays that point to it, it will cause problems)! Use Py_DECREF() instead
- 485 static void
- 486 CudaNdarray_dealloc(CudaNdarray* self)
- 487 {
- 488 if (0) std::cerr << "CudaNdarray dealloc " << self << " " << self->devdata << '\n';
- 489 if(Py_REFCNT(self) > 1)
- 490         printf("WARNING: CudaNdarray_dealloc called when there are still active references to it.\n");
- 491 CudaNdarray_uninit(self);
- 492 Py_TYPE(self)->tp_free((PyObject*)self);
- 493 --_outstanding_mallocs[1];
- 494 if (0)
- 495 {
- 496 fprintf(stderr, "device_malloc_counts: (device) %i (obj) %i\n",
- 497 _outstanding_mallocs[0],
- 498 _outstanding_mallocs[1]);
- 499 }
- 500 }
- 501
- 502 static PyObject *
- 503 CudaNdarray_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
- 504 {
- 505 CudaNdarray *self;
- 506
- 507 self = (CudaNdarray *)type->tp_alloc(type, 0);
- 508 if (self != NULL)
- 509 {
- 510 CudaNdarray_null_init(self);
- 511 ++_outstanding_mallocs[1];
- 512 }
- 513 return (PyObject *)self;
- 514 }
- 515 static int
- 516 CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
- 517 {
- 518 PyObject *arr=NULL;
- 519
- 520 if (! PyArg_ParseTuple(args, "O", &arr))
- 521 return -1;
- 522 if (! PyArray_Check(arr))
- 523 {
- 524 PyErr_SetString(PyExc_TypeError, "PyArray arg required");
- 525 return -1;
- 526 }
- 527 int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
- 528 return rval;
- 529 }
- 530 static PyMemberDef CudaNdarray_members[] =
- 531 {
- 532 /*
- 533 {"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
- 534 "first name"},
- 535 {"last", T_OBJECT_EX, offsetof(CudaNdarray, last), 0,
- 536 "last name"},
- 537 {"number", T_INT, offsetof(CudaNdarray, number), 0,
- 538 "noddy number"},
- 539 */
- 540 {NULL} /* Sentinel */
- 541 };
- 542
- 543 PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
- 544 {
- 545 PyObject * dtype = NULL;
- 546 if (args && !PyArg_ParseTuple(args, "|O", &dtype))
- 547 return NULL;
- 548 if (dtype) {
- 549 PyArray_Descr* dtype2;
- 550         // PyArray_DescrConverter tries to convert anything to a PyArray_Descr.
- 551 if(!PyArray_DescrConverter(dtype, &dtype2))
- 552 {
- 553 PyObject * str = PyObject_Repr(dtype);
- 554 PyErr_Format(PyExc_TypeError,
- 555 "CudaNdarray dtype parameter not understood: %s",
- 556 PyString_AsString(str)
- 557 );
- 558 Py_CLEAR(str);
- 559 return NULL;
- 560 }
- 561 int typeNum = dtype2->type_num;
- 562 Py_DECREF(dtype2);
- 563 if (typeNum != NPY_FLOAT32)
- 564 {
- 565 PyObject * str = PyObject_Repr(dtype);
- 566 PyErr_Format(PyExc_TypeError,
- 567                          "CudaNdarray only supports the float32 dtype, provided: %d",
- 568 typeNum
- 569 );
- 570 Py_CLEAR(str);
- 571 return NULL;
- 572 }
- 573 }
- 574
- 575 int verbose = 0;
- 576 if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
- 577 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
- 578 assert (npydims);
- 579 for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
- 580 PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
- 581 free(npydims);
- 582 if (!rval){
- 583 return NULL;
- 584 }
- 585 assert (PyArray_ITEMSIZE((PyArrayObject *)rval) == sizeof(real));
- 586 return rval;
- 587 }
- 588 if ((self->nd < 0) || (self->devdata == 0))
- 589 {
- 590 PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized CudaNdarray");
- 591 return NULL;
- 592 }
- 593 CudaNdarray * contiguous_self = NULL;
- 594 if (CudaNdarray_is_c_contiguous(self))
- 595 {
- 596 contiguous_self = self;
- 597 Py_INCREF(contiguous_self);
- 598 if (verbose) std::cerr << "CreateArrayObj already contiguous" << contiguous_self << '\n';
- 599 }
- 600 else
- 601 {
- 602 contiguous_self = (CudaNdarray*)CudaNdarray_Copy(self);
- 603 if (verbose) std::cerr << "CreateArrayObj created contiguous" << contiguous_self << '\n';
- 604 }
- 605 if (!contiguous_self)
- 606 {
- 607 return NULL;
- 608 }
- 609
- 610 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
- 611 assert (npydims);
- 612 for (int i = 0; i < self->nd; ++i)
- 613 npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
- 614 PyArrayObject * rval = (PyArrayObject *) PyArray_SimpleNew(self->nd,
- 615 npydims,
- 616 REAL_TYPENUM);
- 617 free(npydims);
- 618 if (!rval)
- 619 {
- 620 Py_DECREF(contiguous_self);
- 621 return NULL;
- 622 }
- 623
- 624 assert (PyArray_ITEMSIZE(rval) == sizeof(real));
- 625
- 626 npy_intp rval_size = PyArray_SIZE(rval);
- 627 void *rval_data = PyArray_DATA(rval);
- 628 cudaError_t err;
- 629 CNDA_BEGIN_ALLOW_THREADS;
- 630
- 631 err = cudaMemcpy(rval_data, contiguous_self->devdata,
- 632 rval_size * sizeof(real),
- 633 cudaMemcpyDeviceToHost
- 634 );
- 635 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
- 636 CNDA_END_ALLOW_THREADS;
- 637
- 638 if (cudaSuccess != err)
- 639 {
- 640 PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
- 641 cudaGetErrorString(err));
- 642 Py_DECREF(rval);
- 643 rval = NULL;
- 644 }
- 645
- 646 Py_DECREF(contiguous_self);
- 647 return (PyObject *)rval;
- 648 }
- 649
- 650 // TODO-- we have two functions here, ZEROS and Zeros.
- 651 // ZEROS is meant to be called just from C code (you don't need to pass it PyObject * s)
- 652 // but this naming is very weird, makes it look like a macro
- 653 // we should figure out the correct convention and change to that
- 654 PyObject* CudaNdarray_ZEROS(int n, int * dims)
- 655 {
- 656
- 657 size_t total_elements = 1;
- 658
- 659 for(size_t i=0;i<n;i++){
- 660 // Detect overflow on unsigned integer
- 661 if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
- 662 PyErr_Format(PyExc_RuntimeError,
- 663 "Can't store in size_t for the bytes requested %llu * %llu",
- 664 (unsigned long long)total_elements,
- 665 (unsigned long long)dims[i]);
- 666 return NULL;
- 667 }
- 668 total_elements*=dims[i];
- 669 }
- 670
- 671 // total_elements now contains the size of the array, in reals
- 672 if (total_elements > (SIZE_MAX / sizeof(real))){
- 673 PyErr_Format(PyExc_RuntimeError,
- 674 "Can't store in size_t for the bytes requested %llu * 4",
- 675 (unsigned long long)total_elements);
- 676 return NULL;
- 677 }
- 678 size_t total_size = total_elements * sizeof(real);
- 679
- 680 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
- 681 if (!rval)
- 682 {
- 683 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
- 684 return NULL;
- 685 }
- 686
- 687 if (CudaNdarray_alloc_contiguous(rval, n, dims))
- 688 {
- 689 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
- 690 Py_DECREF(rval);
- 691 return NULL;
- 692 }
- 693
- 694 // Fill with zeros
- 695 //fprintf(stdout, "Sizeof: %d\n", total_size);
- 696 if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
- 697 {
- 698 PyErr_Format(PyExc_MemoryError,
- 699 "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
- 700 (unsigned long long)total_size);
- 701 Py_DECREF(rval);
- 702 return NULL;
- 703 }
- 704
- 705 if (cnda_copy_structure_to_device(rval))
- 706 {
- 707 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
- 708 Py_DECREF(rval);
- 709 return NULL;
- 710 }
- 711 return (PyObject*) rval;
- 712 }
- 713
- 714 // declared as a static method (hence 1st parameter is not used)
- 715 // Based on _Copy and _dimshuffle
- 716 PyObject* CudaNdarray_Zeros(PyObject* _unused, PyObject* shape)
- 717 {
- 718 if(!shape)
- 719 {
- 720 PyErr_SetString(PyExc_TypeError, "CudaNdarray_Zeros: function takes at least 1 argument (0 given)");
- 721 return NULL;
- 722 }
- 723 if(!PySequence_Check(shape))
- 724 {
- 725 PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence");
- 726 return NULL;
- 727 }
- 728
- 729 int shplen = PySequence_Length(shape);
- 730
- 731 if (shplen == 0)
- 732 {
- 733 return CudaNdarray_ZEROS(0, NULL);
- 734 }
- 735
- 736 int* newdims = (int *)malloc(sizeof(int) * shplen);
- 737
- 738 if (!newdims)
- 739 {
- 740 PyErr_SetString(PyExc_MemoryError,
- 741 "CudaNdarray_Zeros: Failed to allocate temporary space");
- 742 return NULL;
- 743 }
- 744
- 745     // read the shape elements, iterating from the end
- 746 for (int i = shplen-1; i >= 0; --i)
- 747 {
- 748 PyObject* shp_el_obj = PySequence_GetItem(shape, i);
- 749 if(shp_el_obj == NULL)
- 750 {
- 751 // shouldn't happen since we checked length before...
- 752 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: Index out of bound in sequence");
- 753 free(newdims);
- 754 return NULL;
- 755 }
- 756
- 757 int shp_el = PyInt_AsLong(shp_el_obj);
- 758 Py_DECREF(shp_el_obj);
- 759
- 760 if (shp_el < 0)
- 761 {
- 762 PyErr_SetString(PyExc_ValueError, "CudaNdarray_Zeros: shape must contain only non-negative values for size of a dimension");
- 763 free(newdims);
- 764 return NULL;
- 765 }
- 766
- 767 newdims[i] = shp_el;
- 768 }
- 769
- 770 PyObject* rval = CudaNdarray_ZEROS(shplen,newdims);
- 771
- 772 free(newdims);
- 773
- 774 return (PyObject*)rval;
- 775 }
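
// [Editor's illustrative sketch, not part of the original file] The split
// between the two functions above: CudaNdarray_ZEROS is the C-level entry
// point taking plain ints, while CudaNdarray_Zeros only parses the Python
// shape sequence and forwards to it. From C one would call it directly:
static PyObject * example_make_zero_matrix(int rows, int cols)
{
    int dims[2] = {rows, cols};
    // Returns a new reference, or NULL with a Python error set.
    return CudaNdarray_ZEROS(2, dims);
}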
- 776
- 777
- 778
- 779
- 780
- 781 PyObject * CudaNdarray_Copy(const CudaNdarray * self)
- 782 {
- 783 PyObject * rval = CudaNdarray_New();
- 784 if ((!rval) || (-1 == self->nd))
- 785 {
- 786 return rval;
- 787 }
- 788 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, self->nd, CudaNdarray_HOST_DIMS(self)))
- 789 {
- 790 Py_DECREF(rval);
- 791 return NULL;
- 792 }
- 793 if (CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)rval, self))
- 794 {
- 795 Py_DECREF(rval);
- 796 return NULL;
- 797 }
- 798 return rval;
- 799 }
- 800 PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo)
- 801 {
- 802 assert(PyDict_Check(memo));
- 803 PyObject * selfkey = PyInt_FromLong((long)self);
- 804 assert(selfkey);
- 805 if (PyDict_Contains(memo, selfkey))
- 806 {
- 807 PyObject * rval = PyDict_GetItem(memo, selfkey);
- 808 Py_DECREF(selfkey);
- 809 Py_XINCREF(rval);
- 810 return rval;
- 811 }
- 812 else
- 813 {
- 814 PyObject * rval = CudaNdarray_Copy(self);
- 815 if (0) std::cerr << "DeepCopy created " << rval << " devdata " << ((CudaNdarray*)rval)->devdata << "\n";
- 816 if (NULL == rval)
- 817 {
- 818 Py_DECREF(selfkey);
- 819 return NULL;
- 820 }
- 821 if (PyDict_SetItem(memo, selfkey, rval))
- 822 {
- 823 Py_DECREF(rval);
- 824 Py_DECREF(selfkey);
- 825 return NULL;
- 826 }
- 827 Py_DECREF(selfkey);
- 828 return rval;
- 829 }
- 830 }
- 831 PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
- 832 {
- 833 if (!PySequence_Check(py_reduce_mask))
- 834 {
- 835 PyErr_SetString(PyExc_TypeError, "reduce_mask must be sequence of ints");
- 836 return NULL;
- 837 }
- 838 int len = PySequence_Length(py_reduce_mask);
- 839 if (len != self->nd)
- 840 {
- 841 PyErr_SetString(PyExc_TypeError, "length of reduce_mask must match self->nd");
- 842 return NULL;
- 843 }
- 844 CudaNdarray * self_sum = (CudaNdarray*)CudaNdarray_New();
- 845 if (!self_sum)
- 846 {
- 847 return NULL;
- 848 }
- 849 //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
- 850 // and use it if it is big enough.
- 851 int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
- 852 int * sum_dims = dimshuffle_pattern + len;
- 853 int n_remaining_dims = 0;
- 854 if (!dimshuffle_pattern)
- 855 {
- 856 Py_DECREF(self_sum);
- 857 PyErr_SetString(PyExc_MemoryError, "failed to alloc internal storage");
- 858 return NULL;
- 859 }
- 860 for (int i = 0; i < len; ++i)
- 861 {
- 862 PyObject *o_i = PySequence_GetItem(py_reduce_mask, i);
- 863 int o_i_int = PyInt_AsLong(o_i);
- 864 Py_XDECREF(o_i);
- 865 if (PyErr_Occurred())
- 866 {
- 867 Py_DECREF(self_sum);
- 868 free(dimshuffle_pattern);
- 869 return NULL;
- 870 }
- 871 if (o_i_int) // this is a dimension over which we are reducing
- 872 {
- 873 sum_dims[i] = 1;
- 874 }
- 875 else
- 876 {
- 877 sum_dims[i] = CudaNdarray_HOST_DIMS(self)[i];
- 878 dimshuffle_pattern[n_remaining_dims++] = i;
- 879 }
- 880 }
- 881 if (0 || CudaNdarray_alloc_contiguous(self_sum, len, sum_dims)
- 882 || CudaNdarray_reduce_sum(self_sum, self)
- 883 || CudaNdarray_dimshuffle(self_sum, n_remaining_dims, dimshuffle_pattern))
- 884 {
- 885 Py_DECREF(self_sum);
- 886 free(dimshuffle_pattern);
- 887 return NULL;
- 888 }
- 889 free(dimshuffle_pattern);
- 890 return (PyObject*)self_sum;
- 891 }
- 892
- 893 // Reshape self to the new shape given by the tuple shape.
- 894 //
- 895 // If self is c contiguous, it returns a view. Otherwise it always does a copy.
- 896 // TODO: make it return a view when the strides allow it even if it is not
- 897 // c contiguous
- 898 PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
- 899 {
- 900 if(!CudaNdarray_is_c_contiguous(self))
- 901 {
- 902 // allocate new space
- 903 //TODO: test to see if we can re-use old one and take a new param to
- 904 // use this
- 905 CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
- 906 if (!rval)
- 907 {
- 908 return NULL;
- 909 }
- 910
- 911 CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
- 912 Py_XDECREF(rval);
- 913 return (PyObject*)ret;
- 914 }
- 915
- 916 // check shape tuple
- 917 unsigned int rval_nd;
- 918 unsigned int * rval_dims;
- 919 size_t rval_size = 1;
- 920
- 921 if (PyTuple_Check(shape)){
- 922 // copy shape to integer array
- 923 rval_nd = PyTuple_Size(shape);
- 924 }else if (PyInt_Check(shape)){
- 925 rval_nd = 1;
- 926 }else{
- 927 PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
- 928 return NULL;
- 929 }
- 930 rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));
- 931
- 932 if(PyTuple_Check(shape)){
- 933 for (int i = 0; i < rval_nd; ++i)
- 934 {
- 935             long shp_el = PyInt_AsLong(PyTuple_GetItem(shape, i)); //GetItem returns borrowed reference
- 936             if (PyErr_Occurred()) //error in AsLong
- 937             {
- 938                 free(rval_dims);
- 939                 return NULL;
- 940             }
- 941             if(shp_el < 0){ //rval_dims is unsigned, so the test must be done on the signed value
- 942                 PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %ld (must be >=0)", shp_el);
- 943                 free(rval_dims);
- 944                 return NULL;
- 945             }
- 946             rval_dims[i] = shp_el; rval_size = rval_size * rval_dims[i];
- 947 }
- 948 }else{
- 949 rval_size = PyInt_AsLong(shape);
- 950 rval_dims[0] = rval_size;
- 951 }
- 952 // calculate new size, assert same as old size
- 953 if (rval_size != CudaNdarray_SIZE(self))
- 954 {
- 955 PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
- 956 free(rval_dims);
- 957 return NULL;
- 958 }
- 959 if (rval_size==0)
- 960 {
- 961 PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
- 962 free(rval_dims);
- 963 return rval;
- 964 }
- 965
- 966 //return a view, not a copy
- 967 //we can do this as we checked self is c_contiguous
- 968 CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
- 969
- 970 if (!rval || 0 != rval->data_allocated
- 971 ||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
- 972 {
- 973 Py_XDECREF(rval);
- 974 free(rval_dims);
- 975 return NULL;
- 976 }
- 977 //set dim and stride
- 978 int size = 1;
- 979 for (int i = rval_nd-1; i >= 0; --i)
- 980 {
- 981 CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
- 982 CudaNdarray_set_dim(rval, i, rval_dims[i]);
- 983 size = size * rval_dims[i];
- 984 }
- 985 free(rval_dims);
- 986 return (PyObject*)rval;
- 987 }
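
// [Editor's illustrative sketch, not part of the original file] Calling the
// reshape entry point from C: the shape argument may be a tuple of ints or a
// single int, and the result is a view when self is c-contiguous.
static PyObject * example_flatten(CudaNdarray * x)
{
    PyObject * shp = PyInt_FromLong(CudaNdarray_SIZE(x));  // 1-d target shape
    if (!shp)
        return NULL;
    PyObject * flat = CudaNdarray_Reshape(x, shp);
    Py_DECREF(shp);
    return flat;
}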
- 988
- 989 PyObject * CudaNdarray_View(const CudaNdarray * self)
- 990 {
- 991 CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
- 992 if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
- 993 {
- 994 Py_XDECREF(rval);
- 995 rval = NULL;
- 996 }
- 997 else
- 998 {
- 999 for (int i = 0; i < self->nd; ++i)
- 1000 {
- 1001 CudaNdarray_set_dim(rval, i, CudaNdarray_HOST_DIMS(self)[i]);
- 1002 CudaNdarray_set_stride(rval, i, CudaNdarray_HOST_STRIDES(self)[i]);
- 1003 }
- 1004 }
- 1005 return (PyObject*)rval;
- 1006 }
- 1007
- 1008 /*
- 1009 * d0,... are the output dims
- 1010  * indices are a list of indices to operate on
- 1011  * They are int64 viewed as float32.
- 1012  * a is the output
- 1013  * b is the input
- 1014  * dB0 is the size of the source's leading dimension
- 1015 */
- 1016 template <int operator_num>
- 1017 __global__ void k_take_3(const int d0, const int d1, const int d2,
- 1018 const npy_int64* indices,
- 1019 float* a,
- 1020 const int sA0, const int sA1, const int sA2,
- 1021 const float* b, const int dB0,
- 1022 const int sB0, const int sB1, const int sB2,
- 1023 int* err){
- 1024 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
- 1025 npy_int64 idx = indices[i0];
- 1026 if (idx<0)
- 1027 idx += dB0; // To allow negative indexing.
- 1028 if ((idx < 0) || (idx >= dB0)){
- 1029            // Any value other than 0 would probably work. But to be safer, I want
- 1030            // to change all bits to prevent problems with concurrent writes that
- 1031            // could cross a cache line. This should not happen with the
- 1032            // current code and driver.
- 1033 *err = 0xFFFF;
- 1034 continue;
- 1035 }
- 1036 for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
- 1037 for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
- 1038 int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
- 1039 int b_idx = idx*sB0 + i1*sB1 + i2*sB2;
- 1040 a[a_idx] = b[b_idx];
- 1041 }
- 1042 }
- 1043 }
- 1044 }
- 1045
- 1046 // We try to be similar to the PyArray_TakeFrom function
- 1047 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
- 1048 //TODO: support other clip modes than raise (clip, wrap)
- 1049 //self is the input that we copy data from.
- 1050 //The indices that we receive MUST be a CudaNdarray(float32)
- 1051 // that is in fact a view to int64 indices
- 1052 PyObject*
- 1053 CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
- 1054 int verbose = 0;
- 1055 PyObject * indices_obj = NULL;
- 1056 //int axis; Default None, that mean the flattened array.
- 1057 PyObject * axis_obj = Py_None;
- 1058 PyObject * out_obj = Py_None;
- 1059 PyObject * clipmode_obj = NULL;
- 1060 int max_threads = 1; // max threads per blocks
- 1061
- 1062 if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
- 1063 &out_obj, &clipmode_obj, &max_threads))
- 1064 return NULL;
- 1065
- 1066 //Check argument indices
- 1067 //TODO: if not a numpy.ndarray, convert to numpy.ndarray
- 1068 //TODO: If a CudaNdarray, accept it and suppose the data is int32? is float32 number of int?
- 1069 //TODO: Support ndarray of other dtype then int32
- 1070 //TODO: support list of indices that are not c_contiguous
- 1071 CudaNdarray * indices = NULL;
- 1072 if (CudaNdarray_Check(indices_obj)) {
- 1073 if (verbose) printf("cudandarray indices\n");
- 1074 indices = (CudaNdarray*) indices_obj;
- 1075 Py_INCREF(indices);
- 1076 } else if (PyArray_Check(indices_obj)) {
- 1077 if (verbose) printf("ndarray indices\n");
- 1078 if (PyArray_TYPE((PyArrayObject *)indices_obj) != NPY_INT64) {
- 1079 PyErr_SetString(PyExc_TypeError,
- 1080 "CudaNdarray_TakeFrom: need a ndarray for indices"
- 1081 " with dtype int64");
- 1082 return NULL;
- 1083 }
- 1084 if (PyArray_NDIM(((PyArrayObject*)indices_obj)) != 1) {
- 1085 PyErr_SetString(PyExc_TypeError,
- 1086                             "CudaNdarray_TakeFrom: need an ndarray of"
- 1087                             " indices with only 1 dimension");
- 1088 return NULL;
- 1089 }
- 1090 // We need indices_obj to be contiguous, in order to take a view
- 1091 // with a different dtype.
- 1092 if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*) indices_obj)) {
- 1093 PyObject* indices_obj_contig = PyArray_NewCopy((PyArrayObject*) indices_obj, NPY_CORDER);
- 1094 if (!indices_obj_contig)
- 1095 return NULL;
- 1096 indices_obj = indices_obj_contig;
- 1097 } else {
- 1098 // Keep the refcount consistent
- 1099 Py_INCREF(indices_obj);
- 1100 }
- 1101 PyArray_Descr* float32_descr = PyArray_DescrFromType(NPY_FLOAT32);
- 1102 PyObject * indices_float32 = NULL;
- 1103 indices_float32 = PyArray_View((PyArrayObject*)indices_obj,
- 1104 float32_descr, NULL);
- 1105 if (verbose) printf("ndarray indices\n");
- 1106 if (!indices_float32) {
- 1107 Py_DECREF(indices_obj);
- 1108 return NULL;
- 1109 }
- 1110
- 1111 indices = (CudaNdarray*) CudaNdarray_New();
- 1112 if (verbose) printf("\nndarray after new\n");
- 1113 if (! indices){
- 1114 Py_DECREF(indices_obj);
- 1115 Py_DECREF(indices_float32);
- 1116 return NULL;
- 1117 }
- 1118 if (CudaNdarray_CopyFromArray(indices,
- 1119 (PyArrayObject *)indices_float32)){
- 1120 Py_DECREF(indices_obj);
- 1121 Py_DECREF(indices_float32);
- 1122 return NULL;
- 1123 }
- 1124 Py_DECREF(indices_obj);
- 1125 Py_DECREF(indices_float32);
- 1126 } else {
- 1127 PyObject* py_s = PyObject_Str(indices_obj);
- 1128 const char* s = PyString_AsString(py_s);
- 1129 Py_DECREF(py_s);
- 1130 PyErr_Format(PyExc_TypeError,
- 1131 "CudaNdarray_TakeFrom: need an ndarray of int64 or a"
- 1132 " CudaNdarray(float32) that is a view from int64 data"
- 1133 " for indices. Got %s", s);
- 1134 return NULL;
- 1135 }
- 1136
- 1137 if (verbose) {
- 1138 printf("indices used on the gpu\n");
- 1139 fprint_CudaNdarray(stdout, indices);
- 1140 PyObject * used_indices = CudaNdarray_CreateArrayObj(indices);
- 1141 PyObject_Print(used_indices, stdout, 0);
- 1142 Py_DECREF(used_indices);
- 1143 }
- 1144 if (verbose) printf("after print of object\n");
- 1145     if(!CudaNdarray_is_c_contiguous(indices)) {
- 1146 PyErr_SetString(PyExc_NotImplementedError,
- 1147 "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
- 1148 Py_DECREF(indices);
- 1149 return NULL;
- 1150 }
- 1151 int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes
- 1152
- 1153 //Check argument axis
- 1154 //TODO: implement the default and other axis
- 1155 long axis = PyInt_AsLong(axis_obj);
- 1156
- 1157 if (axis != 0) {
- 1158 PyErr_Format(PyExc_NotImplementedError,
- 1159 "CudaNdarray_TakeFrom: only axis=0 is currently supported."
- 1160 " Got %ld.", axis);
- 1161 Py_DECREF(indices);
- 1162 return NULL;
- 1163 }
- 1164
- 1165 //Check argument out_obj
- 1166 CudaNdarray * out = NULL;
- 1167 if (out_obj && CudaNdarray_Check(out_obj))
- 1168 out = (CudaNdarray*) out_obj;
- 1169 if (out && (out->nd != self->nd ||
- 1170 CudaNdarray_HOST_DIMS(out)[0] != nb_indices))
- 1171 out = NULL;
- 1172 int * dims = (int *)malloc(sizeof(int) * self->nd);
- 1173 dims[0] = nb_indices;
- 1174
- 1175 for (int i=1 ; i<self->nd ; i++) {
- 1176 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
- 1177 if (out && CudaNdarray_HOST_DIMS(out)[i] != dims[i]) {
- 1178 out = NULL;
- 1179 }
- 1180 }
- 1181 if (!out) {
- 1182 out = (CudaNdarray*)CudaNdarray_New();
- 1183 if (!out){
- 1184 Py_DECREF(indices);
- 1185 free(dims);
- 1186 return NULL;
- 1187 }
- 1188 if (CudaNdarray_alloc_contiguous(out, self->nd, dims)) {
- 1189 Py_DECREF(out);
- 1190 Py_DECREF(indices);
- 1191 free(dims);
- 1192 return NULL;
- 1193 }
- 1194 }else {
- 1195 Py_INCREF(out);
- 1196 }
- 1197
- 1198 //Check argument clipmode
- 1199 if (clipmode_obj) {
- 1200 char * clipmode = PyString_AsString(clipmode_obj);
- 1201 if (! clipmode){
- 1202 Py_DECREF(indices);
- 1203 Py_DECREF(out);
- 1204 free(dims);
- 1205 return NULL;
- 1206 }
- 1207 if (strcmp(clipmode, "raise") != 0) {
- 1208 PyErr_Format(PyExc_NotImplementedError,
- 1209 "CudaNdarray_TakeFrom: only the raise mode is currently supported. Got '%s'",
- 1210 clipmode);
- 1211 Py_DECREF(indices);
- 1212 Py_DECREF(out);
- 1213 free(dims);
- 1214 return NULL;
- 1215 }
- 1216 }
- 1217 void (*k3)(const int, const int, const int,
- 1218 const npy_int64*,
- 1219 float*, const int, const int, const int,
- 1220 const float*, const int,
- 1221 const int, const int, const int,
- 1222 int*);
- 1223 k3 = k_take_3<CPY>;
- 1224
- 1225 // Create the memory place that will store the error information.
- 1226 if(init_err_var() != 0) return NULL;
- 1227
- 1228 dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
- 1229 if(CudaNdarray_HOST_DIMS(out)[0] == 0){
- 1230 // We take 0 elements, so no need for the rest of the code.
- 1231         // This speeds up that case AND fixes a crash otherwise.
- 1232 free(dims);
- 1233 Py_DECREF(indices);
- 1234 return (PyObject *)out;
- 1235 }
- 1236
- 1237 switch (self->nd) {
- 1238 case 1:
- 1239 {
- 1240 dim3 n_threads(1, 1, 1);
- 1241 if (verbose)
- 1242 printf("cudaGetLastError=%d, nd=%d"
- 1243 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
- 1244 " n_threads.x=%i, n_threads.y=%i)\n",
- 1245 cudaGetLastError(), self->nd,
- 1246 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
- 1247 k3<<<n_blocks, n_threads>>>(
- 1248 dims[0],
- 1249 1,
- 1250 1,
- 1251 (npy_int64*) CudaNdarray_DEV_DATA(indices),
- 1252 CudaNdarray_DEV_DATA(out),
- 1253 CudaNdarray_HOST_STRIDES(out)[0], //strides
- 1254 1,
- 1255 1,
- 1256 CudaNdarray_DEV_DATA(self),
- 1257 CudaNdarray_HOST_DIMS(self)[0], //For indices check
- 1258 CudaNdarray_HOST_STRIDES(self)[0], //strides
- 1259 1,
- 1260 1,
- 1261 err_var);
- 1262 }
- 1263 break;
- 1264 case 2:
- 1265 {
- 1266 dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
- 1267
- 1268 if (verbose)
- 1269 printf("cudaGetLastError=%d, nd=%d"
- 1270 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
- 1271 " n_threads.x=%i, n_threads.y=%i)\n",
- 1272 cudaGetLastError(), self->nd,
- 1273 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
- 1274
- 1275 k3<<<n_blocks, n_threads>>>(
- 1276 dims[0], //dimensions
- 1277 dims[1],
- 1278 1,
- 1279 (npy_int64*) CudaNdarray_DEV_DATA(indices),
- 1280 CudaNdarray_DEV_DATA(out),
- 1281 CudaNdarray_HOST_STRIDES(out)[0], //strides
- 1282 CudaNdarray_HOST_STRIDES(out)[1],
- 1283 1,
- 1284 CudaNdarray_DEV_DATA(self),
- 1285 CudaNdarray_HOST_DIMS(self)[0], //For indices check
- 1286 CudaNdarray_HOST_STRIDES(self)[0], //strides
- 1287 CudaNdarray_HOST_STRIDES(self)[1],
- 1288 1,
- 1289 err_var);
- 1290 }
- 1291 break;
- 1292 case 3:
- 1293 {
- 1294 int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
- 1295 int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
- 1296 dim3 n_threads(tx, ty, 1);
- 1297 if (verbose)
- 1298 printf("cudaGetLastError=%d, nd=%d"
- 1299 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
- 1300 " n_threads.x=%i, n_threads.y=%i)\n",
- 1301 cudaGetLastError(), self->nd,
- 1302 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
- 1303 k3<<<n_blocks, n_threads>>>(
- 1304 dims[0], //dimensions
- 1305 dims[1],
- 1306 dims[2],
- 1307 (npy_int64*) CudaNdarray_DEV_DATA(indices),
- 1308 CudaNdarray_DEV_DATA(out),
- 1309 CudaNdarray_HOST_STRIDES(out)[0], //strides
- 1310 CudaNdarray_HOST_STRIDES(out)[1],
- 1311 CudaNdarray_HOST_STRIDES(out)[2],
- 1312 CudaNdarray_DEV_DATA(self),
- 1313 CudaNdarray_HOST_DIMS(self)[0], //For indices check
- 1314 CudaNdarray_HOST_STRIDES(self)[0], //strides
- 1315 CudaNdarray_HOST_STRIDES(self)[1],
- 1316 CudaNdarray_HOST_STRIDES(self)[2],
- 1317 err_var);
- 1318 }
- 1319 break;
- 1320 default:
- 1321 PyErr_SetString(PyExc_NotImplementedError,
- 1322 "CudaNdarray_TakeFrom: only input with 1, 2 or 3"
- 1323 " dimensions are currently supported");
- 1324
- 1325 }
- 1326 free(dims);
- 1327 CNDA_THREAD_SYNC;
- 1328 cudaError_t err = cudaGetLastError();
- 1329 if (cudaSuccess != err) {
- 1330 PyErr_Format(PyExc_RuntimeError,
- 1331 "Cuda error: %s: %s.\n",
- 1332 "CudaNdarray_TakeFrom",
- 1333 cudaGetErrorString(err));
- 1334 Py_DECREF(indices);
- 1335 Py_DECREF(out);
- 1336 return NULL;
- 1337 }
- 1338
- 1339 int index_err = check_err_var();
- 1340 Py_DECREF(indices);
- 1341 if (index_err != 0) {
- 1342 Py_DECREF(out);
- 1343 return NULL;
- 1344 }
- 1345
- 1346     if (verbose) printf("TAKE SUCCEEDED\n");
- 1347 return (PyObject *)out;
- 1348 }
- 1349
- 1350
- 1351 PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
- 1352 {
- 1353 int pos, stride;
- 1354 if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
- 1355 return NULL;
- 1356 if ((pos < 0) || (pos >= self->nd))
- 1357 {
- 1358 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
- 1359 return NULL;
- 1360 }
- 1361 CudaNdarray_set_stride(self, pos, stride);
- 1362 if (cnda_copy_structure_to_device(self))
- 1363 {
- 1364 return NULL;
- 1365 }
- 1366 Py_INCREF(Py_None);
- 1367 return Py_None;
- 1368 }
- 1369 PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
- 1370 {
- 1371 int pos, dim;
- 1372 if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
- 1373 return NULL;
- 1374 if ((pos < 0) || (pos >= self->nd))
- 1375 {
- 1376 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
- 1377 return NULL;
- 1378 }
- 1379 CudaNdarray_set_dim(self, pos, dim);
- 1380 if (cnda_copy_structure_to_device(self))
- 1381 {
- 1382 return NULL;
- 1383 }
- 1384 Py_INCREF(Py_None);
- 1385 return Py_None;
- 1386 }
- 1387
- 1388 static PyObject *
- 1389 CudaNdarray_exp(CudaNdarray* self)
- 1390 {
- 1391 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
- 1392 if ((NULL == rval) || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
- 1393 {
- 1394 Py_XDECREF(rval);
- 1395 return NULL;
- 1396 }
- 1397 unsigned int size = 1;
- 1398 for (int i = 0; i < self->nd; i++)
- 1399 {
- 1400 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
- 1401 }
- 1402 unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
- 1403 unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
- 1404 k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
- 1405 CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
- 1406 CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));
- 1407
- 1408 //TODO: don't do this right away, do it when we need the result
- 1409 CNDA_THREAD_SYNC;
- 1410 cudaError_t err = cudaGetLastError();
- 1411 if( cudaSuccess != err)
- 1412 {
- 1413 Py_DECREF(rval);
- 1414 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
- 1415 return NULL;
- 1416 }
- 1417
- 1418 return (PyObject*)rval;
- 1419 }
- 1420
- 1421 static PyMethodDef CudaNdarray_methods[] =
- 1422 {
- 1423 {"__array__",
- 1424 (PyCFunction)CudaNdarray_CreateArrayObj, METH_VARARGS,
- 1425 "Copy from the device to a numpy ndarray"},
- 1426 {"__copy__",
- 1427 (PyCFunction)CudaNdarray_View, METH_NOARGS,
- 1428      "Create a shallow copy of this object. Used by the copy module."},
- 1429 {"__deepcopy__",
- 1430 (PyCFunction)CudaNdarray_DeepCopy, METH_O,
- 1431 "Create a copy of this object"},
- 1432 {"zeros",
- 1433 (PyCFunction)CudaNdarray_Zeros, METH_STATIC | METH_O,
- 1434 "Create a new CudaNdarray with specified shape, filled with zeros."},
- 1435 {"copy",
- 1436 (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
- 1437 "Create a copy of this object"},
- 1438 {"is_c_contiguous",
- 1439 (PyCFunction)CudaNdarray_IS_C_Contiguous, METH_NOARGS,
- 1440      "Return True if the object is c contiguous, False otherwise."},
- 1441 {"reduce_sum",
- 1442 (PyCFunction)CudaNdarray_ReduceSum, METH_O,
- 1443 "Reduce over the given dimensions by summation"},
- 1444 {"exp",
- 1445 (PyCFunction)CudaNdarray_exp, METH_NOARGS,
- 1446 "Return the exponential of all elements"},
- 1447 {"reshape",
- 1448 (PyCFunction)CudaNdarray_Reshape, METH_O,
- 1449 "Return a reshaped view (or copy) of this ndarray\n\
- 1450 The required argument is a tuple of integers specifying the shape of the new ndarray."},
- 1451 {"view",
- 1452 (PyCFunction)CudaNdarray_View, METH_NOARGS,
- 1453 "Return an alias of this ndarray"},
- 1454 {"_set_stride",
- 1455 (PyCFunction)CudaNdarray_SetStride, METH_VARARGS,
- 1456 "For integer arguments (i, s), set the 'i'th stride to 's'"},
- 1457 {"take",
- 1458 (PyCFunction)CudaNdarray_TakeFrom, METH_VARARGS,
- 1459 "Equivalent of numpy.take"},
- 1460 {"_set_shape_i",
- 1461 (PyCFunction)CudaNdarray_SetShapeI, METH_VARARGS,
- 1462 "For integer arguments (i, s), set the 'i'th shape to 's'"},
- 1463 {NULL, NULL, NULL, NULL} /* Sentinel */
- 1464 };
- 1465
- 1466
- 1467 ////////////////////
- 1468 // Number protocol
- 1469 ////////////////////
- 1470
- 1471 __global__ void kAdd_contiguous(float* a, float* b, float* dest, unsigned int numEls) {
- 1472 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
- 1473 const unsigned int numThreads = blockDim.x * gridDim.x;
- 1474
- 1475 for (unsigned int i = idx; i < numEls; i += numThreads) {
- 1476 dest[i] = a[i] + b[i];
- 1477 }
- 1478 }
- 1479
- 1480 // Will be called by __add__ in Python
- 1481 static PyObject *
- 1482 CudaNdarray_add(PyObject* py_self, PyObject * py_other)
- 1483 {
- 1484 if (! CudaNdarray_Check(py_self)) {
- 1485 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on left");
- 1486 return NULL;
- 1487 }
- 1488 if (! CudaNdarray_Check(py_other)) {
- 1489 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on right");
- 1490 return NULL;
- 1491 }
- 1492 CudaNdarray * self = (CudaNdarray *)py_self;
- 1493 CudaNdarray * other = (CudaNdarray *)py_other;
- 1494 if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
- 1495         PyErr_SetString(PyExc_TypeError, "We have implemented only the c_contiguous version for now.");
- 1496 return NULL;
- 1497 }
- 1498
- 1499 //standard elemwise size checks
- 1500 if (self->nd != other->nd)
- 1501 {
- 1502 PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
- 1503 return NULL;
- 1504 }
- 1505 //standard elemwise dim checks
- 1506 unsigned int size = 1;
- 1507 for (int i = 0; i< self->nd; ++i)
- 1508 {
- 1509 if (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
- 1510 {
- 1511 PyErr_SetString(PyExc_TypeError, "need same dimensions");
- 1512 return NULL;
- 1513 }
- 1514 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
- 1515 }
- 1516 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
- 1517 if (!rval || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
- 1518 {
- 1519 Py_XDECREF(rval);
- 1520 return NULL;
- 1521 }
- 1522
- 1523 if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
- 1524 return (PyObject *) rval;
- 1525 }
- 1526
- 1527 int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
- 1528 int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
- 1529 kAdd_contiguous<<<n_blocks,threads_per_block>>>(
- 1530 self->devdata, other->devdata, rval->devdata, size);
- 1531 CNDA_THREAD_SYNC;
- 1532 cudaError_t err = cudaGetLastError();
- 1533 if( cudaSuccess != err)
- 1534 {
- 1535 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
- 1536 Py_DECREF(rval);
- 1537 return NULL;
- 1538 }
- 1539 return (PyObject *) rval;
- 1540 }
- 1541
- 1542 template <int operator_num>
- 1543 __global__ void k_ielem_3(const int d0, const int d1, const int d2,
- 1544 float* a, const int sA0, const int sA1, const int sA2,
- 1545 const float* b, const int sB0, const int sB1, const int sB2){
- 1546 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
- 1547 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
- 1548 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
- 1549 switch (operator_num)
- 1550 {
- 1551 case IADD:
- 1552 a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
- 1553 break;
- 1554 case IDIV:
- 1555 a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
- 1556 break;
- 1557 case CPY:
- 1558 a[i0*sA0 + i1*sA1 + i2*sA2] = b[i0*sB0 + i1*sB1 + i2*sB2];
- 1559 break;
- 1560 }
- 1561 }
- 1562 }
- 1563 }
- 1564 }
- 1565
- 1566 template <int operator_num>
- 1567 __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3,
- 1568 float* a, const int sA0, const int sA1,
- 1569 const int sA2, const int sA3,
- 1570 const float* b, const int sB0, const int sB1,
- 1571 const int sB2, const int sB3){
- 1572 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
- 1573 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
- 1574 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
- 1575 for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){
- 1576 switch (operator_num) {
- 1577 case IADD:
- 1578 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
- 1579 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
- 1580 break;
- 1581 case IDIV:
- 1582 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
- 1583 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
- 1584 break;
- 1585 case CPY:
- 1586 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
- 1587 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
- 1588 break;
- 1589 }
- 1590 }
- 1591 }
- 1592 }
- 1593 }
- 1594 }
- 1595
- 1596 template <int operator_num>
- 1597 __global__ void k_ielem_6(const int d0, const int d1,
- 1598 const int d2, const int d3,
- 1599 const int d4, const int d5,
- 1600 float* a, const int sA0, const int sA1,
- 1601 const int sA2, const int sA3,
- 1602 const int sA4, const int sA5,
- 1603 const float* b, const int sB0, const int sB1,
- 1604 const int sB2, const int sB3,
- 1605 const int sB4, const int sB5
- 1606 ){
- 1607 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
- 1608 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
- 1609 for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
- 1610 for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
- 1611 for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
- 1612 for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
- 1613 switch (operator_num) {
- 1614 case IADD:
- 1615 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
- 1616 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
- 1617 break;
- 1618 case IDIV:
- 1619 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
- 1620 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
- 1621 break;
- 1622 case CPY:
- 1623 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
- 1624 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
- 1625 break;
- 1626 }
- 1627 }
- 1628 }
- 1629 }
- 1630 }
- 1631 }
- 1632 }
- 1633 }
- 1634
- 1635 /*
- 1636 CudaNdarray_inplace_elemwise
- 1637 Compute elemwise, working inplace on A.
- 1638     Currently implemented: A / B, A + B and A = B
- 1639 (the last is not tested and not used!)
- 1640
- 1641 py_self - the CudaNdarray that we'll modify (A)
- 1642 py_other - the other argument (B)
- 1643 fct_nb - which operation to perform (operator_t)
- 1644
- 1645 Returns 0 on success.
- 1646 Returns -1 on failure, and sets Python exception.
- 1647
- 1648 */
- 1649 int
- 1650 CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb)
- 1651 {
- 1652 int verbose = 0;
- 1653 void (*k3)(const int, const int, const int,
- 1654 float*, const int, const int, const int,
- 1655 const float*, const int, const int, const int);
- 1656 void (*k4)(const int, const int, const int, const int,
- 1657 float*, const int, const int,
- 1658 const int, const int,
- 1659 const float*, const int, const int,
- 1660 const int, const int);
- 1661 void (*k6)(const int, const int,
- 1662 const int, const int,
- 1663 const int, const int,
- 1664 float*, const int, const int,
- 1665 const int, const int,
- 1666 const int, const int,
- 1667 const float*, const int, const int,
- 1668 const int, const int,
- 1669 const int, const int);
- 1670 switch (fct_nb)
- 1671 {
- 1672 case IADD:
- 1673 k3 = k_ielem_3<IADD>;
- 1674 k4 = k_ielem_4<IADD>;
- 1675 k6 = k_ielem_6<IADD>;
- 1676 break;
- 1677 case IDIV:
- 1678 k3 = k_ielem_3<IDIV>;
- 1679 k4 = k_ielem_4<IDIV>;
- 1680 k6 = k_ielem_6<IDIV>;
- 1681 break;
- 1682 case CPY:
- 1683 k3 = k_ielem_3<CPY>;
- 1684 k4 = k_ielem_4<CPY>;
- 1685 k6 = k_ielem_6<CPY>;
- 1686 break;
- 1687 default:
- 1688 assert (0);
- 1689 PyErr_Format(
- 1690 PyExc_TypeError,
- 1691 "CudaNdarray_inplace_elemwise invalid fct_nb (%i).",
- 1692 (int)fct_nb);
- 1693 return -1;
- 1694 }
- 1695 if (!CudaNdarray_Check(py_self)) {
- 1696 PyErr_SetString(
- 1697 PyExc_TypeError,
- 1698 "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
- 1699 return -1;
- 1700 }
- 1701 CudaNdarray * new_other = NULL;
- 1702 if (!CudaNdarray_Check(py_other)) {
- 1703 new_other = (CudaNdarray*) CudaNdarray_New();
- 1704 if(!new_other)
- 1705 {
- 1706 return -1;
- 1707 }
- 1708 if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
- 1709 {
- 1710 Py_XDECREF(new_other);
- 1711 return -1;
- 1712 }
- 1713 py_other = (PyObject *) new_other;
- 1714 }
- 1715
- 1716 CudaNdarray * self = (CudaNdarray *)py_self;
- 1717 CudaNdarray * other = (CudaNdarray *)py_other;
- 1718
- 1719 if (verbose)
- 1720 {
- 1721 fprintf(stderr,
- 1722 "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
- 1723 self->nd, other->nd);
- 1724 }
- 1725
- 1726 //standard elemwise nb dim checks
- 1727 if (self->nd < other->nd)
- 1728 {
- 1729 PyErr_Format(
- 1730 PyExc_TypeError,
- 1731             "CudaNdarray_inplace_elemwise: The destination needs at least as"
- 1732             " many dimensions as the source. Got %d and %d.",
- 1733 self->nd, other->nd);
- 1734 Py_XDECREF(new_other);
- 1735 return -1;
- 1736 }
- 1737
- 1738 //broadcast to the same number of dimensions.
- 1739 int* other_dims = (int*) alloca(self->nd * sizeof(int));
- 1740 int* other_strides = (int*) alloca(self->nd * sizeof(int));
- 1741 int added_dims = self->nd - other->nd;
- 1742 // Add the added broadcasted dimensions
- 1743 for (int i = 0; i< added_dims; ++i)
- 1744 {
- 1745 other_dims[i] = 1;
- 1746 other_strides[i] = 0;
- 1747 }
- 1748 // Copy the existing dimensions
- 1749 for (int i = 0; i< other->nd; ++i)
- 1750 {
- 1751 other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
- 1752 other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
- 1753 }
- 1754
- 1755 //standard elemwise dim checks
- 1756 unsigned int size = 1;
- 1757 for (int i = 0; i< self->nd; ++i)
- 1758 {
- 1759 if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
- 1760 && (other_dims[i] != 1))
- 1761 {
- 1762 PyErr_SetString(
- 1763 PyExc_ValueError,
- 1764                 "CudaNdarray_inplace_elemwise needs the same dimensions (or a broadcastable dimension)");
- 1765 Py_XDECREF(new_other);
- 1766 return -1;
- 1767 }
- 1768 // if we're broadcasting other, then make sure it has stride 0
- 1769 assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
- 1770 || (other_strides[i] == 0));
- 1771 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
- 1772 }
- 1773
- 1774 if (size==0)
- 1775 {
- 1776 int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
- 1777 if (!(other_size == 0 || other_size == 1))
- 1778 {
- 1779 PyErr_SetString(
- 1780 PyExc_ValueError,
- 1781                 "CudaNdarray_inplace_elemwise cannot work inplace on"
- 1782                 " an array with 0 elements when the new value has"
- 1783                 " more than 1 element");
- 1784             Py_XDECREF(new_other);
- 1785             return -1;
- 1786 }
- 1787 Py_XDECREF(new_other);
- 1788 return 0;
- 1789 }
- 1790
- 1791 switch(self->nd)
- 1792 {
- 1793 case 0:
- 1794 {
- 1795 dim3 n_blocks(1, 1, 1);
- 1796 dim3 n_threads(1);
- 1797 k3<<<n_blocks, n_threads>>>(
- 1798 1, //d0
- 1799 1, //d1
- 1800 1, //d2
- 1801 CudaNdarray_DEV_DATA(self),
- 1802 1, //strides
- 1803 1,
- 1804 1,
- 1805 CudaNdarray_DEV_DATA(other),
- 1806 1, //strides
- 1807 1,
- 1808 1);
- 1809 CNDA_THREAD_SYNC;
- 1810 cudaError_t err = cudaGetLastError();
- 1811 if (cudaSuccess != err)
- 1812 {
- 1813 PyErr_Format(
- 1814 PyExc_RuntimeError,
- 1815 "CudaNdarray_inplace_elemwise case0: Cuda error: %s: %s.\n",
- 1816 "k3",
- 1817 cudaGetErrorString(err));
- 1818 Py_XDECREF(new_other);
- 1819 return -1;
- 1820 }
- 1821 }
- 1822 break;
- 1823 case 1:
- 1824 {
- 1825 dim3 n_blocks(1, 1, 1);
- 1826 dim3 n_threads(
- 1827 std::min(
- 1828 CudaNdarray_HOST_DIMS(self)[0],
- 1829 NUM_VECTOR_OP_THREADS_PER_BLOCK));
- 1830 k3<<<n_blocks, n_threads>>>(
- 1831 1, //dimensions
- 1832 1,
- 1833 CudaNdarray_HOST_DIMS(self)[0],
- 1834 CudaNdarray_DEV_DATA(self),
- 1835 1, //strides
- 1836 1,
- 1837 CudaNdarray_HOST_STRIDES(self)[0],
- 1838 CudaNdarray_DEV_DATA(other),
- 1839 1, //strides
- 1840 1,
- 1841 other_strides[0]);
- 1842 CNDA_THREAD_SYNC;
- 1843 cudaError_t err = cudaGetLastError();
- 1844 if (cudaSuccess != err)
- 1845 {
- 1846 PyErr_Format(
- 1847 PyExc_RuntimeError,
- 1848 "CudaNdarray_inplace_elemwise case1: Cuda error: %s: %s.\n",
- 1849 "k3",
- 1850 cudaGetErrorString(err));
- 1851 Py_XDECREF(new_other);
- 1852 return -1;
- 1853 }
- 1854 }
- 1855 break;
- 1856 case 2:
- 1857 {
- 1858 //TODO: if both self and other are f-contiguous
- 1859 // Then flip the block and thread dimensions
- 1860 // to make contiguous reads & writes
- 1861 dim3 n_blocks(1,
- 1862 std::min(
- 1863 CudaNdarray_HOST_DIMS(self)[0],
- 1864 NUM_VECTOR_OP_BLOCKS));
- 1865 dim3 n_threads(
- 1866 std::min(
- 1867 CudaNdarray_HOST_DIMS(self)[1],
- 1868 NUM_VECTOR_OP_THREADS_PER_BLOCK));
- 1869 k3<<<n_blocks, n_threads>>>(1,
- 1870 CudaNdarray_HOST_DIMS(self)[0],
- 1871 CudaNdarray_HOST_DIMS(self)[1],
- 1872 CudaNdarray_DEV_DATA(self),
- 1873 1,
- 1874 CudaNdarray_HOST_STRIDES(self)[0],
- 1875 CudaNdarray_HOST_STRIDES(self)[1],
- 1876 CudaNdarray_DEV_DATA(other),
- 1877 1,
- 1878 other_strides[0],
- 1879 other_strides[1]);
- 1880 CNDA_THREAD_SYNC;
- 1881 cudaError_t err = cudaGetLastError();
- 1882 if (cudaSuccess != err)
- 1883 {
- 1884 PyErr_Format(
- 1885 PyExc_RuntimeError,
- 1886 "CudaNdarray_inplace_elemwise case2: Cuda error: %s: %s.\n",
- 1887 "k3",
- 1888 cudaGetErrorString(err));
- 1889 Py_XDECREF(new_other);
- 1890 return -1;
- 1891 }
- 1892 }
- 1893 break;
- 1894 case 3:
- 1895 {
- 1896 //TODO: Dimshuffle so that at least one of the arrays
- 1897 // has a contiguous dimension on the thread idx.
- 1898 dim3 n_blocks(
- 1899 std::min(
- 1900 CudaNdarray_HOST_DIMS(self)[0],
- 1901 NUM_VECTOR_OP_BLOCKS),
- 1902 CudaNdarray_HOST_DIMS(self)[1]);
- 1903 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
- 1904 n_blocks.y /= 2;
- 1905 dim3 n_threads(
- 1906 std::min(
- 1907 CudaNdarray_HOST_DIMS(self)[2],
- 1908 NUM_VECTOR_OP_THREADS_PER_BLOCK));
- 1909 k3<<<n_blocks, n_threads>>>(
- 1910 CudaNdarray_HOST_DIMS(self)[0],
- 1911 CudaNdarray_HOST_DIMS(self)[1],
- 1912 CudaNdarray_HOST_DIMS(self)[2],
- 1913 CudaNdarray_DEV_DATA(self),
- 1914 CudaNdarray_HOST_STRIDES(self)[0],
- 1915 CudaNdarray_HOST_STRIDES(self)[1],
- 1916 CudaNdarray_HOST_STRIDES(self)[2],
- 1917 CudaNdarray_DEV_DATA(other),
- 1918 other_strides[0],
- 1919 other_strides[1],
- 1920 other_strides[2]);
- 1921 CNDA_THREAD_SYNC;
- 1922 cudaError_t err = cudaGetLastError();
- 1923 if (cudaSuccess != err)
- 1924 {
- 1925 PyErr_Format(
- 1926 PyExc_RuntimeError,
- 1927 "CudaNdarray_inplace_elemwise case3: Cuda error: %s: %s.\n",
- 1928 "k3",
- 1929 cudaGetErrorString(err));
- 1930 Py_XDECREF(new_other);
- 1931 return -1;
- 1932 }
- 1933 }
- 1934 break;
- 1935 case 4:
- 1936 {
- 1937 dim3 n_blocks(
- 1938 std::min(
- 1939 CudaNdarray_HOST_DIMS(self)[0],
- 1940 NUM_VECTOR_OP_BLOCKS),
- 1941 CudaNdarray_HOST_DIMS(self)[1]
- 1942 );
- 1943 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
- 1944 n_blocks.y /= 2;
- 1945 dim3 n_threads(
- 1946 std::min(
- 1947 CudaNdarray_HOST_DIMS(self)[2],
- 1948 NUM_VECTOR_OP_THREADS_PER_BLOCK)
- 1949                         //TODO: DON'T YOU NEED TO PUT DIMS[3] in here???
- 1950 );
- 1951 k4<<<n_blocks, n_threads>>>(
- 1952 CudaNdarray_HOST_DIMS(self)[0],
- 1953 CudaNdarray_HOST_DIMS(self)[1],
- 1954 CudaNdarray_HOST_DIMS(self)[2],
- 1955 CudaNdarray_HOST_DIMS(self)[3],
- 1956 CudaNdarray_DEV_DATA(self),
- 1957 CudaNdarray_HOST_STRIDES(self)[0],
- 1958 CudaNdarray_HOST_STRIDES(self)[1],
- 1959 CudaNdarray_HOST_STRIDES(self)[2],
- 1960 CudaNdarray_HOST_STRIDES(self)[3],
- 1961 CudaNdarray_DEV_DATA(other),
- 1962 other_strides[0],
- 1963 other_strides[1],
- 1964 other_strides[2],
- 1965 other_strides[3]);
- 1966 CNDA_THREAD_SYNC;
- 1967 cudaError_t err = cudaGetLastError();
- 1968 if (cudaSuccess != err)
- 1969 {
- 1970 PyErr_Format(
- 1971 PyExc_RuntimeError,
- 1972 "CudaNdarray_inplace_elemwise case4: Cuda error: %s: %s.\n",
- 1973 "k4",
- 1974 cudaGetErrorString(err));
- 1975 Py_XDECREF(new_other);
- 1976 return -1;
- 1977 }
- 1978 }
- 1979 break;
- 1980 case 5:
- 1981 {
- 1982 dim3 n_blocks(
- 1983 std::min(
- 1984 CudaNdarray_HOST_DIMS(self)[1],
- 1985 NUM_VECTOR_OP_BLOCKS),
- 1986 CudaNdarray_HOST_DIMS(self)[2]);
- 1987 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
- 1988 n_blocks.y /= 2;
- 1989 dim3 n_threads(
- 1990 std::min(
- 1991 CudaNdarray_HOST_DIMS(self)[3],
- 1992 NUM_VECTOR_OP_THREADS_PER_BLOCK)
- 1993                         //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
- 1994 );
- 1995 for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
- 1996 {
- 1997 k4<<<n_blocks, n_threads>>>(
- 1998 CudaNdarray_HOST_DIMS(self)[1],
- 1999 CudaNdarray_HOST_DIMS(self)[2],
- 2000 CudaNdarray_HOST_DIMS(self)[3],
- 2001 CudaNdarray_HOST_DIMS(self)[4],
- 2002 CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
- 2003 CudaNdarray_HOST_STRIDES(self)[1],
- 2004 CudaNdarray_HOST_STRIDES(self)[2],
- 2005 CudaNdarray_HOST_STRIDES(self)[3],
- 2006 CudaNdarray_HOST_STRIDES(self)[4],
- 2007 CudaNdarray_DEV_DATA(other) + i * other_strides[0],
- 2008 other_strides[1],
- 2009 other_strides[2],
- 2010 other_strides[3],
- 2011 other_strides[4]);
- 2012 CNDA_THREAD_SYNC;
- 2013 cudaError_t err = cudaGetLastError();
- 2014 if( cudaSuccess != err)
- 2015 {
- 2016 PyErr_Format(
- 2017 PyExc_RuntimeError,
- 2018 "CudaNdarray_inplace_elemwise case5: Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
- 2019 "k5 with loop over k4",
- 2020 cudaGetErrorString(err),
- 2021 (long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
- 2022 Py_XDECREF(new_other);
- 2023 return -1;
- 2024 }
- 2025 }
- 2026 }
- 2027 break;
- 2028 case 6:
- 2029 {
- 2030 dim3 n_blocks(
- 2031 std::min(
- 2032 CudaNdarray_HOST_DIMS(self)[0],
- 2033 NUM_VECTOR_OP_BLOCKS),
- 2034 CudaNdarray_HOST_DIMS(self)[1],
- 2035 CudaNdarray_HOST_DIMS(self)[2]
- 2036 );
- 2037 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
- 2038 n_blocks.y /= 2;
- 2039             // The GTX 285 (compute capability 1.3) doesn't support n_blocks.z > 1
- 2040             // (compute capability 2.0 supports up to 65535 for n_blocks.z)
- 2041 //while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
- 2042 // n_blocks.z /= 2;
- 2043 n_blocks.z = 1;
- 2044 dim3 n_threads(
- 2045 std::min(
- 2046 CudaNdarray_HOST_DIMS(self)[3],
- 2047 NUM_VECTOR_OP_THREADS_PER_BLOCK)
- 2048 //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
- 2049 //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
- 2050 );
- 2051 k6<<<n_blocks, n_threads>>>(
- 2052 CudaNdarray_HOST_DIMS(self)[0],
- 2053 CudaNdarray_HOST_DIMS(self)[1],
- 2054 CudaNdarray_HOST_DIMS(self)[2],
- 2055 CudaNdarray_HOST_DIMS(self)[3],
- 2056 CudaNdarray_HOST_DIMS(self)[4],
- 2057 CudaNdarray_HOST_DIMS(self)[5],
- 2058 CudaNdarray_DEV_DATA(self),
- 2059 CudaNdarray_HOST_STRIDES(self)[0],
- 2060 CudaNdarray_HOST_STRIDES(self)[1],
- 2061 CudaNdarray_HOST_STRIDES(self)[2],
- 2062 CudaNdarray_HOST_STRIDES(self)[3],
- 2063 CudaNdarray_HOST_STRIDES(self)[4],
- 2064 CudaNdarray_HOST_STRIDES(self)[5],
- 2065 CudaNdarray_DEV_DATA(other),
- 2066 other_strides[0],
- 2067 other_strides[1],
- 2068 other_strides[2],
- 2069 other_strides[3],
- 2070 other_strides[4],
- 2071 other_strides[5]);
- 2072 CNDA_THREAD_SYNC;
- 2073 cudaError_t err = cudaGetLastError();
- 2074 if (cudaSuccess != err)
- 2075 {
- 2076 PyErr_Format(
- 2077 PyExc_RuntimeError,
- 2078 "CudaNdarray_inplace_elemwise case6: Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
- 2079 "k6",
- 2080 cudaGetErrorString(err),
- 2081 (long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
- 2082 (long) n_threads.x);
- 2083 Py_XDECREF(new_other);
- 2084 return -1;
- 2085 }
- 2086 }
- 2087 break;
- 2088 default:
- 2089 {
- 2090 PyErr_Format(
- 2091 PyExc_NotImplementedError,
- 2092                 "inplace_elemwise with nd=%i not implemented\n",
- 2093 self->nd);
- 2094 Py_XDECREF(new_other);
- 2095 return -1;
- 2096 }
- 2097 }
- 2098 if (verbose)
- 2099 fprintf(stderr, "INPLACE ADD/DIV end\n");
- 2100 Py_XDECREF(new_other);
- 2101 return 0;
- 2102 }
- 2103
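// Editor's sketch (not part of the original file): how the function above
// aligns the operand B to the destination's rank before launching a kernel.
// Missing leading dimensions become size 1 with stride 0, and any size-1
// dimension keeps a 0 stride so the same element is re-read while iterating.
// The names below are illustrative only; the block is disabled.
#if 0
static void broadcast_align(int dst_nd,
                            int src_nd, const int* src_dims, const int* src_strides,
                            int* out_dims, int* out_strides)
{
    int added = dst_nd - src_nd;          // assumed >= 0, as checked above
    for (int i = 0; i < added; ++i)
    {
        out_dims[i] = 1;                  // new leading broadcast dimension
        out_strides[i] = 0;               // stride 0 => the element is reused
    }
    for (int i = 0; i < src_nd; ++i)
    {
        out_dims[i + added] = src_dims[i];
        out_strides[i + added] = src_strides[i];
    }
}
#endif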
- 2104 /*
- 2105 * We need this inplace Add to support IncSubTensor
- 2106 * It returns py_self on success with an additional reference. Else NULL.
- 2107 */
- 2108 // Will be called by __iadd__ in Python
- 2109 PyObject *
- 2110 CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
- 2111 {
- 2112 if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
- 2113 {
- 2114 return NULL;
- 2115 }
- 2116 Py_INCREF(py_self);
- 2117 return py_self;
- 2118 }
- 2119
- 2120 /*
- 2121 * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options
- 2122 * It returns py_self on success with an additional reference. Else NULL.
- 2123 */
- 2124 // Will be called by __idiv__ in Python
- 2125 static PyObject *
- 2126 CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other)
- 2127 {
- 2128 if (CudaNdarray_inplace_elemwise(py_self, py_other, IDIV))
- 2129 {
- 2130 return NULL;
- 2131 }
- 2132 Py_INCREF(py_self);
- 2133 return py_self;
- 2134 }
- 2135
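// Editor's sketch (assumption, not in the original file): once the
// PyNumberMethods table below is installed, the two wrappers above are what
// the interpreter reaches through PyNumber_InPlaceAdd (and, depending on the
// Python version, PyNumber_InPlaceTrueDivide or PyNumber_InPlaceDivide).
// Equivalent call from C, disabled and for illustration only:
#if 0
static PyObject* example_iadd(PyObject* a, PyObject* b)  // a, b: CudaNdarray objects
{
    // Dispatches through nb_inplace_add, i.e. CudaNdarray_inplace_add above.
    PyObject* r = PyNumber_InPlaceAdd(a, b);
    if (!r)
        return NULL;   // Python exception already set by the wrapper
    return r;          // r is `a` with one extra reference
}
#endif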
- 2136 // The PyNumberMethods struct layout changed in a non-trivial way from Python 2 to Python 3.
- 2137 #if PY_MAJOR_VERSION == 3
- 2138 static PyNumberMethods CudaNdarrayNumberMethods =
- 2139 {
- 2140 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
- 2141 0, //binaryfunc nb_subtract;
- 2142 0, //binaryfunc nb_multiply;
- 2143 0, //binaryfunc nb_remainder;
- 2144 0, //binaryfunc nb_divmod;
- 2145 0, //ternaryfunc nb_power;
- 2146 0, //unaryfunc nb_negative;
- 2147 0, //unaryfunc nb_positive;
- 2148 0, //unaryfunc nb_absolute;
- 2149 0, //inquiry nb_bool;
- 2150 0, //unaryfunc nb_invert;
- 2151 0, //binaryfunc nb_lshift;
- 2152 0, //binaryfunc nb_rshift;
- 2153 0, //binaryfunc nb_and;
- 2154 0, //binaryfunc nb_xor;
- 2155 0, //binaryfunc nb_or;
- 2156 0, //unaryfunc nb_int;
- 2157 0, //void *nb_reserved;
- 2158 0, //unaryfunc nb_float;
- 2159
- 2160 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
- 2161 0, //binaryfunc nb_inplace_subtract;
- 2162 0, //binaryfunc nb_inplace_multiply;
- 2163 0, //binaryfunc nb_inplace_remainder;
- 2164 0, //ternaryfunc nb_inplace_power;
- 2165 0, //binaryfunc nb_inplace_lshift;
- 2166 0, //binaryfunc nb_inplace_rshift;
- 2167 0, //binaryfunc nb_inplace_and;
- 2168 0, //binaryfunc nb_inplace_xor;
- 2169 0, //binaryfunc nb_inplace_or;
- 2170
- 2171 0, //binaryfunc nb_floor_divide;
- 2172 0, //binaryfunc nb_true_divide;
- 2173 0, //binaryfunc nb_inplace_floor_divide;
- 2174 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_true_divide; __idiv__
- 2175
- 2176 0, //unaryfunc nb_index
- 2177 };
- 2178 #else
- 2179 static PyNumberMethods CudaNdarrayNumberMethods =
- 2180 {
- 2181 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
- 2182 0, //binaryfunc nb_subtract; __sub__
- 2183 0, //binaryfunc nb_multiply; __mul__
- 2184 0, //binaryfunc nb_divide; __div__
- 2185 0, //binaryfunc nb_remainder; __mod__
- 2186 0, //binaryfunc nb_divmod; __divmod__
- 2187 0, //ternaryfunc nb_power; __pow__
- 2188 0, //unaryfunc nb_negative; __neg__
- 2189 0, //unaryfunc nb_positive; __pos__
- 2190 0, //unaryfunc nb_absolute; __abs__
- 2191 0, //inquiry nb_nonzero; __nonzero__ /* Used by PyObject_IsTrue */
- 2192 0, //unaryfunc nb_invert; __invert__
- 2193 0, //binaryfunc nb_lshift; __lshift__
- 2194 0, //binaryfunc nb_rshift; __rshift__
- 2195 0, //binaryfunc nb_and; __and__
- 2196 0, //binaryfunc nb_xor; __xor__
- 2197 0, //binaryfunc nb_or; __or__
- 2198 0, //coercion nb_coerce; __coerce__ /* Used by the coerce() function */
- 2199 0, //unaryfunc nb_int; __int__
- 2200 0, //unaryfunc nb_long; __long__
- 2201 0, //unaryfunc nb_float; __float__
- 2202 0, //unaryfunc nb_oct; __oct__
- 2203 0, //unaryfunc nb_hex; __hex__
- 2204
- 2205 /* Added in release 2.0 */
- 2206 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
- 2207 0, //binaryfunc nb_inplace_subtract; __isub__
- 2208 0, //binaryfunc nb_inplace_multiply; __imul__
- 2209 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_divide; __idiv__
- 2210 0, //binaryfunc nb_inplace_remainder; __imod__
- 2211 0, //ternaryfunc nb_inplace_power; __ipow__
- 2212 0, //binaryfunc nb_inplace_lshift; __ilshift__
- 2213 0, //binaryfunc nb_inplace_rshift; __irshift__
- 2214 0, //binaryfunc nb_inplace_and; __iand__
- 2215 0, //binaryfunc nb_inplace_xor; __ixor__
- 2216 0, //binaryfunc nb_inplace_or; __ior__
- 2217
- 2218 /* Added in release 2.2 */
- 2219 0, //binaryfunc nb_floor_divide; __floordiv__
- 2220 0, //binaryfunc nb_true_divide; __truediv__
- 2221 0, //binaryfunc nb_inplace_floor_divide; __ifloordiv__
- 2222 0, //binaryfunc nb_inplace_true_divide; __itruediv__
- 2223
- 2224 #if PY_MINOR_VERSION > 4
- 2225 /* Added in release 2.5 */
- 2226 0 //unaryfunc nb_index; __index__
- 2227 #endif
- 2228 };
- 2229 #endif
- 2230
- 2231
- 2232 /////////////////////
- 2233 // Mapping protocol
- 2234 /////////////////////
- 2235
- 2236 // Will be called by __len__ in Python
- 2237 static Py_ssize_t
- 2238 CudaNdarray_len(PyObject * py_self)
- 2239 {
- 2240 CudaNdarray * self = (CudaNdarray*) py_self;
- 2241 if (self->nd <= 0)
- 2242 {
- 2243 return (Py_ssize_t) 0;
- 2244 }
- 2245 else
- 2246 {
- 2247 return (Py_ssize_t) CudaNdarray_HOST_DIMS(self)[0];
- 2248 }
- 2249 }
- 2250
- 2251 // Will be called by __getitem__ in Python
- 2252 PyObject *
- 2253 CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
- 2254 {
- 2255 int verbose = 0;
- 2256 if (verbose) fprintf(stderr, "Subscript .... \n");
- 2257 CudaNdarray * self = (CudaNdarray*) py_self;
- 2258 PyObject * py_rval = NULL;
- 2259 CudaNdarray * rval = NULL;
- 2260 PyObject * intobj = NULL;
- 2261
- 2262 //PyObject_Print(key, stderr, 0);
- 2263
- 2264 if (key == Py_Ellipsis)
- 2265 {
- 2266 Py_INCREF(py_self);
- 2267 return py_self;
- 2268 }
- 2269 if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
- 2270 //else if (PyInt_Check(key)) //INDEXING BY INTEGER
- 2271 {
- 2272 int d_idx = PyInt_AsLong(intobj);
- 2273 Py_DECREF(intobj); intobj=NULL;
- 2274 //int d_idx = PyInt_AsLong(key);
- 2275 if (self->nd == 0)
- 2276 {
- 2277 PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed");
- 2278 return NULL;
- 2279 }
- 2280 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
- 2281 int offset = 0;
- 2282
- 2283 if ((d_idx >= 0) && (d_idx < d_dim))
- 2284 {
- 2285 //normal indexing
- 2286 offset += d_idx * CudaNdarray_HOST_STRIDES(self)[0];
- 2287 }
- 2288 else if ((d_idx < 0) && (d_idx >= -d_dim))
- 2289 {
- 2290 //end-based indexing
- 2291 // d_idx is negative
- 2292 offset += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[0];
- 2293 }
- 2294 else
- 2295 {
- 2296 PyErr_Format(PyExc_IndexError,
- 2297                 "index out of bounds. Asked for %d, but the size is %d",
- 2298 d_idx, d_dim);
- 2299 return NULL;
- 2300 }
- 2301
- 2302 //allocate our subtensor view
- 2303 py_rval = CudaNdarray_new_nd(self->nd - 1);
- 2304 rval = (CudaNdarray*) py_rval;
- 2305 if (!rval) return NULL;
- 2306 assert (0 == rval->data_allocated);
- 2307
- 2308 //initialize the view's data pointer to our own.
- 2309 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self) + offset, self))
- 2310 {
- 2311 Py_DECREF(rval);
- 2312 return NULL;
- 2313 }
- 2314 for (int d = 1; d < self->nd; ++d)
- 2315 {
- 2316 CudaNdarray_set_stride(rval, d-1, CudaNdarray_HOST_STRIDES(self)[d]);
- 2317 CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
- 2318 }
- 2319 }
- 2320 else
- 2321 {
- 2322 PyErr_Clear();
- 2323 }
- 2324 if (PySlice_Check(key)) //INDEXING BY SLICE
- 2325 {
- 2326 if (verbose) fprintf(stderr, "by slice\n");
- 2327 if (self->nd == 0)
- 2328 {
- 2329 PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array");
- 2330 return NULL;
- 2331 }
- 2332
- 2333 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
- 2334 Py_ssize_t start, stop, step, slen;
- 2335 if (PySlice_GetIndicesEx(SLICE_CAST(key), d_dim, &start, &stop, &step, &slen))
- 2336 {
- 2337 if (verbose)
- 2338 fprintf(stderr, "PySlice_GetIndicesEx failed\n");
- 2339 return NULL;
- 2340 }
- 2341 if (verbose)
- 2342 {
- 2343 std::cerr << "start " << start << "\n";
- 2344 std::cerr << "stop " << stop << "\n";
- 2345 std::cerr << "step " << step << "\n";
- 2346 std::cerr << "slen " << slen << "\n";
- 2347 }
- 2348
- 2349 //allocate our subtensor view
- 2350 py_rval = CudaNdarray_new_nd(self->nd);
- 2351 rval = (CudaNdarray*) py_rval;
- 2352 if (!rval) return NULL;
- 2353 assert (0 == rval->data_allocated);
- 2354
- 2355
- 2356 //initialize the view's data pointer to our own.
- 2357 if (CudaNdarray_set_device_data(rval,
- 2358 CudaNdarray_DEV_DATA(self) + start * CudaNdarray_HOST_STRIDES(self)[0],
- 2359 self))
- 2360 {
- 2361 Py_DECREF(rval);
- 2362 return NULL;
- 2363 }
- 2364 //initialize dimension 0 of rval
- 2365 CudaNdarray_set_stride(rval, 0,
- 2366 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[0]);
- 2367 CudaNdarray_set_dim(rval, 0, slen);
- 2368 if (verbose) std::cerr << "rval stride " << CudaNdarray_HOST_STRIDES(rval)[0] << "\n";
- 2369 // initialize dimensions > 0 of rval
- 2370 for (int d = 1; d < self->nd; ++d)
- 2371 {
- 2372 CudaNdarray_set_stride(rval, d, CudaNdarray_HOST_STRIDES(self)[d]);
- 2373 CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
- 2374 }
- 2375 }
- 2376 if (PyTuple_Check(key)) //INDEXING BY TUPLE
- 2377 {
- 2378 if (verbose) fprintf(stderr, "by tuple\n");
- 2379 //elements of the tuple can be either integers or slices
- 2380         //the dimensionality of the view we will return is diminished for each integer in the tuple
- 2381
- 2382 if (PyTuple_Size(key) > self->nd)
- 2383 {
- 2384             PyErr_SetString(PyExc_IndexError, "too many indices for this CudaNdarray");
- 2385 return NULL;
- 2386 }
- 2387
- 2388 //calculate the number of dimensions in the return value
- 2389 int rval_nd = self->nd;
- 2390 for (int d = 0; d < PyTuple_Size(key); ++d)
- 2391 {
- 2392             //On some platforms PyInt_Check(<type 'numpy.int64'>) returns true, on others it returns false.
- 2393             //So we use PyArray_IsAnyScalar, which should cover everything.
- 2394 rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d));
- 2395 }
- 2396
- 2397 //allocate our subtensor view
- 2398 py_rval = CudaNdarray_new_nd(rval_nd);
- 2399 rval = (CudaNdarray*) py_rval;
- 2400 if (!rval) return NULL;
- 2401 assert (0 == rval->data_allocated);
- 2402
- 2403 //initialize the view's data pointer to our own.
- 2404 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
- 2405 {
- 2406 Py_DECREF(rval);
- 2407 return NULL;
- 2408 }
- 2409
- 2410 // rval_d will refer to the current dimension in the rval.
- 2411 // It will not be incremented for integer keys, but will be incremented for slice
- 2412 // keys
- 2413 int rval_d = 0;
- 2414
- 2415 for (int d = 0; d < self->nd; ++d)
- 2416 {
- 2417 // keys can be shorter than self->nd.
- 2418 // when that happens, it means that the remaining dimensions are "full slices"
- 2419 if (d >=PyTuple_Size(key))
- 2420 {
- 2421 CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]);
- 2422 CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]);
- 2423 ++rval_d;
- 2424 }
- 2425 else
- 2426 {
- 2427 PyObject * key_d = PyTuple_GetItem(key, d);
- 2428
- 2429 if (PySlice_Check(key_d))
- 2430 {
- 2431 Py_ssize_t start, stop, step, slen;
- 2432 if (PySlice_GetIndicesEx(SLICE_CAST(key_d), CudaNdarray_HOST_DIMS(self)[d], &start, &stop, &step, &slen))
- 2433 {
- 2434 Py_DECREF(rval);
- 2435 return NULL;
- 2436 }
- 2437 rval->devdata += start * CudaNdarray_HOST_STRIDES(self)[d];
- 2438 CudaNdarray_set_stride(rval, rval_d,
- 2439 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[d]);
- 2440 CudaNdarray_set_dim(rval, rval_d, slen);
- 2441 if (0)
- 2442 {
- 2443 std::cerr << "start " << start << "\n";
- 2444 std::cerr << "stop " << stop << "\n";
- 2445 std::cerr << "step " << step << "\n";
- 2446 std::cerr << "slen " << slen << "\n";
- 2447 }
- 2448 ++rval_d;
- 2449 }
- 2450 else if ((intobj=PyNumber_Int(key_d)))
- 2451 {
- 2452 assert(PyArray_IsAnyScalar(key_d));
- 2453 int d_idx = PyInt_AsLong(intobj);
- 2454 Py_DECREF(intobj);
- 2455 intobj = NULL;
- 2456 int d_dim = CudaNdarray_HOST_DIMS(self)[d];
- 2457
- 2458 if ((d_idx >= 0) && (d_idx < d_dim))
- 2459 {
- 2460 //normal indexing
- 2461 rval->devdata += d_idx * CudaNdarray_HOST_STRIDES(self)[d];
- 2462 }
- 2463 else if ((d_idx < 0) && (d_idx >= -d_dim))
- 2464 {
- 2465 //end-based indexing
- 2466 rval->devdata += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[d];
- 2467 }
- 2468 else
- 2469 {
- 2470 PyErr_Format(PyExc_IndexError,
- 2471                             "index out of bounds. Asked for %d on dimension %d, but its size is %d",
- 2472 d_idx, d, d_dim);
- 2473 Py_DECREF(rval);
- 2474 return NULL;
- 2475 }
- 2476 }
- 2477 else
- 2478 {
- 2479 PyErr_Clear(); // clear the error set by PyNumber_Int
- 2480 PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
- 2481 Py_DECREF(rval);
- 2482 return NULL;
- 2483 }
- 2484 }
- 2485 }
- 2486 }
- 2487 if (py_rval)
- 2488 {
- 2489 if (verbose) fprint_CudaNdarray(stderr, self);
- 2490 if (verbose) fprint_CudaNdarray(stderr, rval);
- 2491 }
- 2492 else
- 2493 {
- 2494 PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
- 2495 return NULL;
- 2496 }
- 2497 return py_rval;
- 2498 }
- 2499
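// Editor's sketch (illustrative only, not in the original file): the pointer
// arithmetic used by the integer-indexing branches of __getitem__ above. A
// negative index is wrapped once by adding the dimension size; anything still
// out of range is an error. Function name is hypothetical; block is disabled.
#if 0
static int index_offset(int d_idx, int d_dim, int stride, int* offset_out)
{
    if (d_idx >= 0 && d_idx < d_dim)
        *offset_out = d_idx * stride;               // normal indexing
    else if (d_idx < 0 && d_idx >= -d_dim)
        *offset_out = (d_dim + d_idx) * stride;     // end-based indexing
    else
        return -1;                                  // out of bounds
    return 0;
}
#endif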
- 2500 // Will be called by __setitem__ in Python
- 2501 // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
- 2502 // Doesn't handle broadcasting, e.g. a[:] = 5
- 2503 // The assigned value can only be a CudaNdarray,
- 2504 // an ndarray,
- 2505 // or a python scalar with value 0 when the destination is c contiguous.
- 2506 static int
- 2507 CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
- 2508 {
- 2509 int verbose = 0;
- 2510 if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
- 2511 // We try to copy directly into this CudaNdarray from the ndarray
- 2512 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
- 2513 CudaNdarray* new_value = NULL;
- 2514
- 2515 if(!rval){
- 2516 // CudaNdarray_Subscript failed and set the error msg.
- 2517 Py_XDECREF(rval);
- 2518 return -1;
- 2519 }
- 2520
- 2521 if(rval != (CudaNdarray*)o &&
- 2522 (rval->data_allocated ||
- 2523 // The new array should have a base
- 2524 !(((CudaNdarray*)rval)->base) ||
- 2525 // If the original array has no base, the base of the new
- 2526 // array should be the original one
- 2527 (!((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != o) ||
- 2528 // Else, the two arrays should have the same base
- 2529 (((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != ((CudaNdarray*)o)->base)))
- 2530 {
- 2531 // This case shouldn't happen, based on what I see in Subscript
- 2532 // but just in case it happens sometime in the future
- 2533
- 2534 PyErr_Format(PyExc_RuntimeError,
- 2535 "__getitem__ must return a CudaNdarray that refers to"
- 2536 " the original CudaNdarray, not a copy. rval.base=%p"
- 2537 " o.base=%p o=%p",
- 2538 (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
- 2539 Py_DECREF(rval);
- 2540 return -1;
- 2541 }
- 2542
- 2543 PyObject * intobj = NULL;
- 2544 if (CudaNdarray_Check(o) && PyArray_Check(value)){
- 2545 if (verbose)
- 2546 fprintf(stderr,
- 2547 "CudaNdarray_setitem dest is a CudaNdarray and"
- 2548 " value is a ndarray\n");
- 2549 new_value = (CudaNdarray*) CudaNdarray_New();
- 2550 if(!new_value)
- 2551 {
- 2552 return -1;
- 2553 }
- 2554 if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
- 2555 {
- 2556 Py_XDECREF(new_value);
- 2557 Py_XDECREF(rval);
- 2558 return -1;
- 2559 }
- 2560 value = (PyObject *) new_value;
- 2561 }
- 2562 else if ((intobj=PyNumber_Int(value)))
- 2563 {
- 2564 if (verbose)
- 2565 fprintf(stderr,
- 2566 "CudaNdarray_setitem dest and value is a python number\n");
- 2567 if(! CudaNdarray_is_c_contiguous(rval)){
- 2568 PyErr_SetString(PyExc_NotImplementedError,
- 2569 "CudaNdarray.__setitem__: When the new value is a scalar"
- 2570                             " of value 0, the destination must be c contiguous.");
- 2571 Py_XDECREF(rval);
- 2572 return -1;
- 2573 }
- 2574
- 2575 long val = PyInt_AsLong(intobj);
- 2576 Py_DECREF(intobj); intobj=NULL;
- 2577 if (val == 0)
- 2578 {
- 2579 cudaError_t err = cudaMemset(rval->devdata, 0,
- 2580 CudaNdarray_SIZE(rval) * sizeof(real));
- 2581 Py_XDECREF(rval);
- 2582 if (err)
- 2583 {
- 2584 // Clear the error flag, cudaMemset doesn't do it.
- 2585 // Currently this returns the same thing as err, but if in future
- 2586 // it returns something else I still don't see why we should ignore
- 2587 // it. All we want to do here is reset the flag.
- 2588 cudaGetLastError();
- 2589 PyErr_SetString(PyExc_RuntimeError,
- 2590 "CudaNdarray.__setitem__: cudaMemset failed");
- 2591 return -1;
- 2592 }
- 2593 return 0;
- 2594 } else {
- 2595 Py_XDECREF(rval);
- 2596 PyErr_SetString(PyExc_NotImplementedError,
- 2597                             "CudaNdarray.__setitem__: we only support setting from a python"
- 2598                             " scalar of value 0, a numpy ndarray or a CudaNdarray.");
- 2599 return -1;
- 2600 }
- 2601 }
- 2602
- 2603 PyErr_Clear(); // clear PyNumber_Int error.
- 2604
- 2605 if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
- 2606 {
- 2607 PyErr_SetString(PyExc_TypeError,
- 2608                         "CudaNdarray.__setitem__: left must be a CudaNdarray and right"
- 2609                         " must be a CudaNdarray, an ndarray or a python scalar of value 0.");
- 2610 Py_XDECREF(new_value);
- 2611 return -1;
- 2612 }
- 2613
- 2614 if (verbose)
- 2615 fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
- 2616
- 2617 if (cnda_copy_structure_to_device(rval))
- 2618 {
- 2619 PyErr_SetString(PyExc_RuntimeError,
- 2620 "CudaNdarray.__setitem__: syncing structure to device failed");
- 2621 Py_DECREF(rval);
- 2622 Py_XDECREF(new_value);
- 2623
- 2624 if (verbose)
- 2625 fprintf(stderr, "CudaNdarray_setitem error end\n");
- 2626 return -1;
- 2627 }
- 2628
- 2629 PyObject *baseSavedForComparison = rval->base;
- 2630
- 2631 if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
- 2632 {
- 2633 Py_DECREF((PyObject*)rval);
- 2634 Py_XDECREF(new_value);
- 2635
- 2636 if (verbose)
- 2637 fprintf(stderr, "CudaNdarray_setitem error end\n");
- 2638 return -1;
- 2639 }
- 2640
- 2641 assert (rval->base == baseSavedForComparison);
- 2642 assert (rval->dev_structure_fresh);
- 2643
- 2644 // Clean up locally-created references
- 2645 Py_DECREF(rval);
- 2646 Py_XDECREF(new_value);
- 2647
- 2648 return 0;
- 2649 }
- 2650
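// Editor's sketch (assumption, not in the original file): the only scalar
// accepted by __setitem__ above is 0 on a c-contiguous destination, because
// that case can be implemented as a single cudaMemset over the whole buffer
// (4 bytes per float32 element). Disabled illustration, names hypothetical:
#if 0
static int zero_fill(float* dev_ptr, size_t n_elements)
{
    cudaError_t err = cudaMemset(dev_ptr, 0, n_elements * sizeof(float));
    if (err != cudaSuccess)
    {
        cudaGetLastError();   // reset the sticky error flag
        return -1;
    }
    return 0;
}
#endif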
- 2651
- 2652 PyMappingMethods CudaNdarrayMappingMethods = {
- 2653 CudaNdarray_len, //lenfunc mp_length; __len__
- 2654 CudaNdarray_Subscript, //binaryfunc mp_subscript; __getitem__
- 2655 CudaNdarray_setitem //objobjargproc mp_ass_subscript; __setitem__
- 2656 };
- 2657
- 2658 ////////////////////
- 2659 //
- 2660 ////////////////////
- 2661
- 2662 static PyObject *
- 2663 CudaNdarray_get_shape(CudaNdarray *self, void *closure)
- 2664 {
- 2665 if (self->nd < 0)
- 2666 {
- 2667 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
- 2668 return NULL;
- 2669 }
- 2670 PyObject * rval = PyTuple_New(self->nd);
- 2671 for (int i = 0; i < self->nd; ++i)
- 2672 {
- 2673 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i])))
- 2674 {
- 2675 Py_XDECREF(rval);
- 2676 return NULL;
- 2677 }
- 2678
- 2679 }
- 2680 return rval;
- 2681 }
- 2682
- 2683 static int
- 2684 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure)
- 2685 {
- 2686 PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape");
- 2687 return -1;
- 2688 }
- 2689
- 2690 static PyObject *
- 2691 CudaNdarray_get_strides(CudaNdarray *self, void *closure)
- 2692 {
- 2693 if (self->nd < 0)
- 2694 {
- 2695 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
- 2696 return NULL;
- 2697 }
- 2698 PyObject * rval = PyTuple_New(self->nd);
- 2699 for (int i = 0; i < self->nd; ++i)
- 2700 {
- 2701 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i])))
- 2702 {
- 2703 Py_XDECREF(rval);
- 2704 return NULL;
- 2705 }
- 2706
- 2707 }
- 2708 return rval;
- 2709 }
- 2710
- 2711 static int
- 2712 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
- 2713 {
- 2714 //npy_intp newstrides_bytes[PyTuple_Size(value)];
- 2715 if (PyTuple_Check(value)){
- 2716 if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
- 2717 PyErr_SetString(PyExc_ValueError,
- 2718 "The new strides tuple must have the same length"
- 2719 " as the number of dimensions");
- 2720 return -1;
- 2721 }
- 2722 }else if (PyList_Check(value)){
- 2723 if (PyList_Size(value) != CudaNdarray_NDIM(self)){
- 2724 PyErr_SetString(PyExc_ValueError,
- 2725 "The new strides list must have the same length"
- 2726 " as the number of dimensions");
- 2727 return -1;
- 2728 }
- 2729 }else{
- 2730 PyErr_SetString(PyExc_ValueError,
- 2731 "The new strides need to be encoded in a tuple or list");
- 2732 return -1;
- 2733 }
- 2734 npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp));
- 2735 if (PyTuple_Check(value)){
- 2736 for(int i=0; i < CudaNdarray_NDIM(self); i++){
- 2737 newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
- 2738 //newstrides_bytes[i] = newstrides[i] * 4;
- 2739 }
- 2740 }else if (PyList_Check(value)){
- 2741 for(int i=0; i < CudaNdarray_NDIM(self); i++){
- 2742 newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
- 2743 //newstrides_bytes[i] = newstrides[i] * 4;
- 2744 }
- 2745 }
- 2746 /*
- 2747 // Do not do this check, as ExtractDiag needs that, and NumPy does not seem
- 2748 // to do it.
- 2749 npy_intp dims[PyTuple_Size(value)];
- 2750 for(int i=0; i < CudaNdarray_NDIM(self); i++){
- 2751 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
- 2752 }
- 2753 if (!PyArray_CheckStrides(4,
- 2754 CudaNdarray_NDIM(self),
- 2755 0, 0,
- 2756 dims,
- 2757 newstrides_bytes)){
- 2758 PyErr_SetString(PyExc_ValueError, "bad new strides");
- 2759 return -1;
- 2760 }
- 2761 */
- 2762 for(int i=0; i < CudaNdarray_NDIM(self); i++){
- 2763 CudaNdarray_set_stride(self, i, newstrides[i]);
- 2764 }
- 2765 return 0;
- 2766 }
- 2767
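// Editor's note (not in the original file): the _strides/strides attributes
// handled above are expressed in elements, not bytes (unlike numpy). A minimal
// sketch of the conversion, assuming the fixed float32 element size used
// throughout this file; disabled, names illustrative:
#if 0
static void strides_to_bytes(int nd, const npy_intp* strides_in_elements,
                             npy_intp* strides_in_bytes)
{
    for (int i = 0; i < nd; ++i)
        strides_in_bytes[i] = strides_in_elements[i] * (npy_intp) sizeof(float);
}
#endif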
- 2768 static PyObject *
- 2769 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure)
- 2770 {
- 2771 float * p = CudaNdarray_DEV_DATA(self);
- 2772 //printf("get_dev_data %p %li \n", p, (long int)p );
- 2773 return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self));
- 2774 }
- 2775
- 2776 static int
- 2777 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure)
- 2778 {
- 2779 Py_ssize_t newdevdata = PyInt_AsSsize_t(value);
- 2780 //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata);
- 2781 if (PyErr_Occurred())
- 2782 {
- 2783 return -1;
- 2784 }
- 2785 return CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base);
- 2786 }
- 2787
- 2788 static PyObject *
- 2789 CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
- 2790 {
- 2791 return PyString_FromString("float32");
- 2792 }
- 2793
- 2794 static PyObject *
- 2795 CudaNdarray_get_ndim(CudaNdarray *self, void *closure)
- 2796 {
- 2797 return PyInt_FromLong(self->nd);
- 2798 }
- 2799
- 2800 static PyObject *
- 2801 CudaNdarray_get_base(CudaNdarray *self, void *closure)
- 2802 {
- 2803 PyObject * base = self->base;
- 2804 if (!base)
- 2805 {
- 2806 // We cannot return a NULL pointer, use None instead
- 2807 base = Py_None;
- 2808 }
- 2809 Py_INCREF(base);
- 2810 return base;
- 2811 }
- 2812
- 2813 void put_in_dict(PyObject * dict, const char * key, int val)
- 2814 {
- 2815 PyObject * k = PyString_FromString(key);
- 2816 PyObject * v = PyInt_FromLong(val);
- 2817 PyDict_SetItem(dict, k, v);
- 2818 Py_DECREF(k);
- 2819 Py_DECREF(v);
- 2820 }
- 2821
- 2822 PyObject *
- 2823 GetDeviceProperties(PyObject* _unused, PyObject* args)
- 2824 {
- 2825 int dev_id = -1;
- 2826 if (! PyArg_ParseTuple(args, "i", &dev_id))
- 2827 return NULL;
- 2828 cudaDeviceProp deviceProp;
- 2829 cudaGetDeviceProperties(&deviceProp, dev_id);
- 2830
- 2831 PyObject * dict = PyDict_New();
- 2832 PyObject * str= PyString_FromString("name");
- 2833 PyObject * i = PyString_FromString(deviceProp.name);
- 2834 PyDict_SetItem(dict, str, i);
- 2835 Py_DECREF(str);
- 2836 Py_DECREF(i);
- 2837
- 2838 put_in_dict(dict, "major", deviceProp.major);
- 2839 put_in_dict(dict, "minor", deviceProp.minor);
- 2840 #if CUDART_VERSION >= 2020
- 2841 int driverVersion = 0, runtimeVersion = 0;
- 2842 cudaDriverGetVersion(&driverVersion);
- 2843 cudaRuntimeGetVersion(&runtimeVersion);
- 2844 put_in_dict(dict, "driverVersion", driverVersion);
- 2845 put_in_dict(dict, "runtimeVersion", runtimeVersion);
- 2846 #endif
- 2847 #if CUDART_VERSION >= 2000
- 2848
- 2849 put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
- 2850 //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the run time is too old.
- 2851 int sm_cores = -1;
- 2852 if(deviceProp.major==1)
- 2853 sm_cores = 32;
- 2854 else if(deviceProp.major==2 && deviceProp.minor==0)
- 2855 sm_cores = 32;
- 2856 else if(deviceProp.major==2 && deviceProp.minor==1)
- 2857 sm_cores = 48;
- 2858 put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
- 2859 #endif
- 2860 put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
- 2861 put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
- 2862 put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
- 2863 put_in_dict(dict, "warpSize", deviceProp.warpSize);
- 2864 put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
- 2865 put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
- 2866 put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
- 2867 put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
- 2868 put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
- 2869 put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
- 2870 put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
- 2871 put_in_dict(dict, "memPitch", deviceProp.memPitch);
- 2872 put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
- 2873 put_in_dict(dict, "clockRate", deviceProp.clockRate);
- 2874 #if CUDART_VERSION >= 2000
- 2875 put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
- 2876 #endif
- 2877 #if CUDART_VERSION >= 2020
- 2878 put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
- 2879 put_in_dict(dict, "integrated", deviceProp.integrated);
- 2880 put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
- 2881 put_in_dict(dict, "computeMode", deviceProp.computeMode);
- 2882     //the doc of this fct says that 0 = Normal mode, 1 = only 1 context, 2 = no context
- 2883 #endif
- 2884 #if CUDART_VERSION >= 3000
- 2885 put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
- 2886 #endif
- 2887 #if CUDART_VERSION >= 3010
- 2888 put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
- 2889 #endif
- 2890 #if CUDART_VERSION >= 3020
- 2891 put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
- 2892 #endif
- 2893
- 2894 return dict;
- 2895 }
- 2896
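// Editor's sketch (values copied from the mapping used above, nothing new):
// "coresCount" is multiProcessorCount times a per-SM core count that
// GetDeviceProperties only knows for a few compute capabilities; for anything
// else the reported count is negative. Disabled illustration:
#if 0
static int cores_per_sm(int major, int minor)
{
    if (major == 1) return 32;                 // as hard-coded above
    if (major == 2 && minor == 0) return 32;
    if (major == 2 && minor == 1) return 48;
    return -1;                                 // unknown: coresCount goes negative
}
#endif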
- 2897 /*
- 2898 * Returns in *free and *total respectively, the free and total amount of memory available for allocation by the device in bytes.
- 2899 */
- 2900 PyObject *
- 2901 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
- 2902 {
- 2903 size_t free = 0, total = 0;
- 2904 if(g_gpu_context_active == 0){
- 2905 PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano before.");
- 2906 return NULL;
- 2907 }
- 2908
- 2909 cudaError_t err = cudaMemGetInfo(&free, &total);
- 2910 if (err != cudaSuccess){
- 2911 // Clear the error flag, cudaMemGetInfo doesn't do it.
- 2912 // Currently this returns the same thing as err, but if in future
- 2913 // it returns something else I still don't see why we should ignore
- 2914 // it. All we want to do here is reset the flag.
- 2915 cudaGetLastError();
- 2916 PyErr_Format(PyExc_RuntimeError,
- 2917 "Error while getting memory info about the gpu: %s",
- 2918 cudaGetErrorString(err));
- 2919 return NULL;
- 2920 }
- 2921 return PyTuple_Pack(2, PyLong_FromLong(free), PyLong_FromLong(total));
- 2922 }
- 2923
- 2924 /*
- 2925  * Synchronize with all the gpu device streams.
- 2926 */
- 2927 PyObject *
- 2928 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
- 2929 {
- 2930 CNDA_BEGIN_ALLOW_THREADS
- 2931 cudaThreadSynchronize();
- 2932 CNDA_END_ALLOW_THREADS
- 2933 Py_INCREF(Py_None);
- 2934 return Py_None;
- 2935 }
- 2936
- 2937 /*
- 2938  * Exists and returns true if we link with cublas v2.
- 2939 */
- 2940 PyObject *
- 2941 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
- 2942 {
- 2943 Py_INCREF(Py_True);
- 2944 return Py_True;
- 2945 }
- 2946
- 2947 PyObject *
- 2948 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
- 2949 {
- 2950 void * rval = NULL;
- 2951 cudaError_t err;
- 2952 int num_gpus = 0;
- 2953
- 2954 err = cudaGetDeviceCount(&num_gpus);
- 2955 if (cudaSuccess != err){
- 2956         printf("ERR!\n");
- 2957 PyErr_Format(PyExc_RuntimeError,
- 2958 "Not able to get number of GPUs (%s).",
- 2959 cudaGetErrorString(err));
- 2960 return NULL;
- 2961 }
- 2962
- 2963 for (int device = 0; device < num_gpus; device++) {
- 2964 cudaSetDevice(device);
- 2965 err = cudaDeviceSynchronize(); // << CUDA context gets created here.
- 2966 cudaGetLastError(); // reset the error state
- 2967 if (cudaSuccess == err)
- 2968 break;
- 2969 }
- 2970
- 2971 if (cudaSuccess != err){
- 2972         printf("ERR!\n");
- 2973 PyErr_Format(PyExc_RuntimeError,
- 2974 "Not able to select available GPU from %d cards (%s).",
- 2975 num_gpus, cudaGetErrorString(err));
- 2976 return NULL;
- 2977 }
- 2978
- 2979 Py_INCREF(Py_None);
- 2980 return Py_None;
- 2981 }
- 2982
- 2983 #if COMPUTE_GPU_MEM_USED
- 2984 /*
- 2985  * Return the size in bytes that Theano has currently allocated on the gpu.
- 2986 */
- 2987 PyObject *
- 2988 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
- 2989 {
- 2990 PyObject* a = PyLong_FromLong(_allocated_size);
- 2991 PyObject* b = PyLong_FromLong(_max_allocated_size);
- 2992
- 2993 PyObject* tuple = PyTuple_New(2);
- 2994 PyTuple_SetItem(tuple, 0, a);
- 2995 PyTuple_SetItem(tuple, 1, b);
- 2996 return tuple;
- 2997 }
- 2998 #endif
- 2999
- 3000 static PyGetSetDef CudaNdarray_getset[] = {
- 3001 {"shape",
- 3002 (getter)CudaNdarray_get_shape,
- 3003 (setter)CudaNdarray_set_shape,
- 3004 "shape of this ndarray (tuple)",
- 3005 NULL},
- 3006 {"_strides",
- 3007 (getter)CudaNdarray_get_strides,
- 3008 (setter)CudaNdarray_set_strides,
- 3009 "data pointer strides (in elements)",
- 3010 NULL},
- 3011 {"strides",
- 3012 (getter)CudaNdarray_get_strides,
- 3013 (setter)CudaNdarray_set_strides,
- 3014 "data pointer strides (in elements)",
- 3015 NULL},
- 3016     //gpudata is needed to allow calling pycuda functions with CudaNdarray input.
- 3017 {"gpudata",
- 3018 (getter)CudaNdarray_get_dev_data,
- 3019 NULL,
- 3020 "device data pointer",
- 3021 NULL},
- 3022 {"_dev_data",
- 3023 (getter)CudaNdarray_get_dev_data,
- 3024 (setter)CudaNdarray_set_dev_data,
- 3025 "device data pointer",
- 3026 NULL},
- 3027 {"dtype",
- 3028 (getter)CudaNdarray_get_dtype,
- 3029 NULL,
- 3030 "The dtype of the element. Now always float32",
- 3031 NULL},
- 3032 {"size",
- 3033 (getter)CudaNdarray_SIZE_Object,
- 3034 NULL,
- 3035 "The number of elements in this object.",
- 3036 NULL},
- 3037     //mem_size is needed for pycuda.elementwise.ElementwiseKernel. Why do they use size and mem_size with the same value?
- 3038 {"mem_size",
- 3039 (getter)CudaNdarray_SIZE_Object,
- 3040 NULL,
- 3041 "The number of elements in this object.",
- 3042 NULL},
- 3043 {"ndim",
- 3044 (getter)CudaNdarray_get_ndim,
- 3045 NULL,
- 3046 "The number of dimensions in this object.",
- 3047 NULL},
- 3048 {"base",
- 3049 (getter)CudaNdarray_get_base,
- 3050 NULL,
- 3051 "If this ndarray is a view, base is the original ndarray.",
- 3052 NULL},
- 3053
- 3054 {NULL, NULL, NULL, NULL} /* Sentinel */
- 3055 };
- 3056
- 3057 PyObject *CudaNdarray_repr(PyObject *self)
- 3058 {
- 3059 CudaNdarray *object = (CudaNdarray *)self;
- 3060 PyObject * np_object = CudaNdarray_CreateArrayObj(object);
- 3061 PyObject * str = PyObject_Str((PyObject *) np_object);
- 3062 char * cstr = PyString_AsString(str);
- 3063 PyObject * out = PyString_FromFormat("%s%s%s",
- 3064 "CudaNdarray(",
- 3065 cstr,
- 3066 ")");
- 3067 Py_DECREF(str);
- 3068 Py_DECREF(np_object);
- 3069 #if PY_MAJOR_VERSION >= 3
- 3070     // In Python 3 PyString_FromFormat returns a Bytes object
- 3071 PyObject* out2 = PyObject_Str(out);
- 3072 Py_DECREF(out);
- 3073 return out2;
- 3074 #endif
- 3075 return out;
- 3076 }
- 3077
- 3078 static PyTypeObject CudaNdarrayType =
- 3079 {
- 3080 #if PY_MAJOR_VERSION >= 3
- 3081 PyVarObject_HEAD_INIT(NULL, 0)
- 3082 #else
- 3083 PyObject_HEAD_INIT(NULL)
- 3084 0, /*ob_size*/
- 3085 #endif
- 3086 "CudaNdarray", /*tp_name*/
- 3087 sizeof(CudaNdarray), /*tp_basicsize*/
- 3088 0, /*tp_itemsize*/
- 3089 (destructor)CudaNdarray_dealloc, /*tp_dealloc*/
- 3090 0, /*tp_print*/
- 3091 0, /*tp_getattr*/
- 3092 0, /*tp_setattr*/
- 3093 0, /*tp_compare*/
- 3094 CudaNdarray_repr, /*tp_repr*/
- 3095 &CudaNdarrayNumberMethods, /*tp_as_number*/
- 3096 0, /*tp_as_sequence*/
- 3097 &CudaNdarrayMappingMethods,/*tp_as_mapping*/
- 3098 0, /*tp_hash */
- 3099 0, /*tp_call*/
- 3100 0, /*tp_str*/
- 3101 0, /*tp_getattro*/
- 3102 0, /*tp_setattro*/
- 3103 0, /*tp_as_buffer*/
- 3104 #if PY_MAJOR_VERSION >= 3
- 3105 // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3.
- 3106 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
- 3107 #else
- 3108 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/
- 3109 #endif
- 3110 "CudaNdarray objects", /* tp_doc */
- 3111 0, /* tp_traverse */
- 3112 0, /* tp_clear */
- 3113 0, /* tp_richcompare */
- 3114 0, /* tp_weaklistoffset */
- 3115 0, /* tp_iter */
- 3116 0, /* tp_iternext */
- 3117 CudaNdarray_methods, /* tp_methods */
- 3118 CudaNdarray_members, /* tp_members */
- 3119 CudaNdarray_getset, /* tp_getset */
- 3120 0, /* tp_base */
- 3121 0, /* tp_dict */
- 3122 0, /* tp_descr_get */
- 3123 0, /* tp_descr_set */
- 3124 0, /* tp_dictoffset */
- 3125 (initproc)CudaNdarray_init,/* tp_init */
- 3126 0, /* tp_alloc */
- 3127 CudaNdarray_new, /* tp_new */
- 3128 };
- 3129
- 3130 static __global__ void get_gpu_ptr_size(int* dst)
- 3131 {
- 3132 dst[0] = sizeof(float*);
- 3133 dst[1] = sizeof(int);
- 3134 }
- 3135
- 3136 PyObject *
- 3137 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
- 3138 {
- 3139 int *gpu_data = (int*)device_malloc(sizeof(int)*2);
- 3140 if(gpu_data == NULL){
- 3141 return NULL;
- 3142 }
- 3143 get_gpu_ptr_size<<<1,1>>>(gpu_data);
- 3144
- 3145 cudaError_t cudaErr = cudaGetLastError();
- 3146 if (cudaSuccess != cudaErr){
- 3147
- 3148 device_free(gpu_data);
- 3149 return PyErr_Format(PyExc_RuntimeError,
- 3150 "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
- 3151 cudaGetErrorString(cudaErr));
- 3152 }
- 3153
- 3154 // Transfer the result to cpu
- 3155 int gpu_sizes[] = {-1,-1};
- 3156 cublasStatus_t err;
- 3157 err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
- 3158 device_free(gpu_data);
- 3159
- 3160 if (CUBLAS_STATUS_SUCCESS != err){
- 3161         PyErr_SetString(PyExc_RuntimeError, "error copying data from device to host memory");
- 3162 return NULL;
- 3163 }
- 3164 return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*),
- 3165 (int)sizeof(int), (int) gpu_sizes[1]);
- 3166 }
- 3167
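// Editor's sketch (assumption, not in the original file): the result transfer
// above goes through cublasGetVector; for two plain ints a direct cudaMemcpy
// is the equivalent, more common idiom. Disabled illustration:
#if 0
static int copy_two_ints_to_host(const int* gpu_data, int* host_out)
{
    cudaError_t err = cudaMemcpy(host_out, gpu_data,
                                 2 * sizeof(int), cudaMemcpyDeviceToHost);
    return (err == cudaSuccess) ? 0 : -1;   // caller reports the error as above
}
#endif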
- 3168 static int cublas_init();
- 3169 static void cublas_shutdown();
- 3170 // Initialize the gpu.
- 3171 // Takes two optional parameters, the device number and whether we should use cnmem.
- 3172 // If the device number is provided, it sets that device to be the active device.
- 3173 // If not provided (usually just to test whether the gpu is available at all),
- 3174 // it does not set an active device.
- 3175 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
- 3176 // cnmem is treated like a bool. If converted to 0, don't use cnmem. Otherwise, use it.
- 3177 PyObject *
- 3178 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
- 3179 {
- 3180 int card_nb = 0;
- 3181 int card_number_provided = 1;
- 3182 float cnmem = 0; // Theano flag lib.cnmem
- 3183 // if we're given something wildly invalid, this will throw a TypeError
- 3184 if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
- 3185 return NULL;
- 3186 if(cnmem)
- 3187 g_use_cnmem = true;
- 3188
- 3189 if(PyTuple_Size(args) == 0) {
- 3190 card_number_provided = 0;
- 3191 card_nb = 0;
- 3192 }
- 3193
- 3194 int deviceCount;
- 3195 cudaError err = cudaGetDeviceCount(&deviceCount);
- 3196 if(cudaSuccess != err) {
- 3197 return PyErr_Format(PyExc_EnvironmentError,
- 3198 "Unable to get the number of gpus available: %s",
- 3199 cudaGetErrorString(cudaGetLastError()));
- 3200 }
- 3201
- 3202 // as soon as the first successful call to a cuda* function is made, a
- 3203 // gpu context has been created
- 3204 g_gpu_context_active = 1;
- 3205
- 3206 if(deviceCount <= 0) {
- 3207 return PyErr_Format(PyExc_EnvironmentError,
- 3208 "Can't use the GPU, no devices support CUDA");
- 3209 }
- 3210 if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
- 3211 return PyErr_Format(PyExc_ValueError,
- 3212 "Bad device number %d. Only %d devices available.",
- 3213 card_nb,
- 3214 deviceCount);
- 3215 }
- 3216
- 3217 cudaDeviceProp deviceProp;
- 3218 err = cudaGetDeviceProperties(&deviceProp, card_nb);
- 3219 if(cudaSuccess != err) {
- 3220 return PyErr_Format(PyExc_EnvironmentError,
- 3221 "Unable to get properties of gpu %i: %s",
- 3222 card_nb,
- 3223 cudaGetErrorString(cudaGetLastError()));
- 3224 }
- 3225
- 3226 if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
- 3227 return PyErr_Format(PyExc_EnvironmentError,
- 3228 "There is no device that supports CUDA");
- 3229 }
- 3230
- 3231 if(card_number_provided) {
- 3232 err = cudaSetDevice(card_nb);
- 3233 if(cudaSuccess != err) {
- 3234 return PyErr_Format(PyExc_EnvironmentError,
- 3235 "Unable to set device %i: %s",
- 3236 card_nb,
- 3237 cudaGetErrorString(cudaGetLastError()));
- 3238 }
- 3239 if (cublas_init() == -1)
- 3240 return NULL;
- 3241 }
- 3242 if(card_number_provided && g_use_cnmem) {
- 3243 size_t mem = 0;
- 3244 if (cnmem > 1)
- 3245 mem = cnmem * 1024 * 1024;
- 3246 else{
- 3247             // Clip to 95% to leave memory for the driver.
- 3248             // 98% didn't work in some cases.
- 3249 if (cnmem > .95){
- 3250 cnmem = .95;
- 3251 }
- 3252 size_t free = 0, total = 0;
- 3253 cudaError_t err = cudaMemGetInfo(&free, &total);
- 3254 if (err != cudaSuccess){
- 3255 // Clear the error flag, cudaMemGetInfo doesn't do it.
- 3256 // Currently this returns the same thing as err, but if in future
- 3257 // it returns something else I still don't see why we should ignore
- 3258 // it. All we want to do here is reset the flag.
- 3259 cudaGetLastError();
- 3260 PyErr_Format(PyExc_RuntimeError,
- 3261 "Error while getting memory info about the gpu: %s",
- 3262 cudaGetErrorString(err));
- 3263 return NULL;
- 3264 }
- 3265 mem = total * cnmem;
- 3266 }
- 3267 if(initCnmem(card_number_provided, card_nb, mem) == -1){
- 3268 return NULL;
- 3269 }
- 3270 }
- 3271
- 3272 Py_INCREF(Py_None);
- 3273 return Py_None;
- 3274 }
- 3275
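// Editor's sketch (illustrative only, not in the original file): how the
// `cnmem` flag above is turned into a pool size in bytes. A value > 1 is read
// as megabytes; a value in (0, 1] is read as a fraction of total device
// memory, clipped to 95%. Disabled; function name is hypothetical.
#if 0
static size_t cnmem_pool_bytes(float cnmem, size_t total_device_mem)
{
    if (cnmem > 1)
        return (size_t)(cnmem * 1024 * 1024);   // interpreted as megabytes
    if (cnmem > .95f)
        cnmem = .95f;                           // leave room for the driver
    return (size_t)(total_device_mem * cnmem);  // fraction of total memory
}
#endif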
- 3276 PyObject *
- 3277 CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
- 3278 // NB: No cuda error checking here; keeps things simple, and it's not
- 3279 // really necessary.
- 3280 int currentDevice;
- 3281     cudaGetDevice(&currentDevice);
- 3282 return PyInt_FromLong(currentDevice);
- 3283 }
- 3284
- 3285 PyObject *
- 3286 CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
- 3287 // NB: No cuda error checking here; keeps things simple, and it's not
- 3288 // really necessary.
- 3289 int currentDevice;
- 3290     cudaGetDevice(&currentDevice);
- 3291
- 3292 cudaDeviceProp deviceProp;
- 3293 cudaGetDeviceProperties(&deviceProp, currentDevice);
- 3294 return PyString_FromString(deviceProp.name);
- 3295 }
- 3296
- 3297 PyObject *
- 3298 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
- 3299 // Don't handle errors here
- 3300 cublas_shutdown();
- 3301 g_gpu_context_active = 0; // context has now been closed down
- 3302 if(g_use_cnmem) {
- 3303 cnmemStatus_t status = cnmemFinalize();
- 3304 if(status != CNMEM_STATUS_SUCCESS) {
- 3305 fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
- 3306 cnmemGetErrorString(status));
- 3307 if(status == CNMEM_STATUS_CUDA_ERROR) {
- 3308 fprintf(stderr, " Cuda-Reason=%s\n",
- 3309 cudaGetErrorString(cudaGetLastError()));
- 3310 }
- 3311 }
- 3312 }
- 3313 cudaThreadExit();
- 3314
- 3315 Py_INCREF(Py_None);
- 3316 return Py_None;
- 3317 }
- 3318
- 3319 /*
- 3320 * This function is tested in theano/misc/test_pycuda_theano_simple.py
- 3321 */
- 3322 PyObject *
- 3323 CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
- 3324 {
- 3325 int verbose = 0;
- 3326 PyObject *gpu_ptr = NULL;
- 3327 PyObject *shapes = NULL;
- 3328 PyObject *strides = NULL;
- 3329 PyObject *base = NULL;
- 3330 PyObject *rval = NULL;
- 3331
- 3332     //args should consist of 4 python objects:
- 3333     //The first is the gpu ptr
- 3334     //The second is the shape, the third is the strides
- 3335     //and the fourth is the base object
- 3336 if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
- 3337 return NULL;
- 3338
- 3339 if (verbose) printf("In CudaNdarray_from_gpu_pointer\n");
- 3340 if (!PyLong_Check(gpu_ptr))
- 3341 {
- 3342         PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
- 3343 return NULL;
- 3344 }
- 3345
- 3346 Py_ssize_t nd = PyObject_Length(shapes);
- 3347 if (nd < 0)
- 3348 {
- 3349 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
- 3350 return NULL;
- 3351 }
- 3352 Py_ssize_t nd_stride = PyObject_Length(strides);
- 3353 if (nd_stride < 0)
- 3354 {
- 3355 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
- 3356 return NULL;
- 3357 }
- 3358
- 3359 if (nd != nd_stride)
- 3360 {
- 3361 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
- 3362 return NULL;
- 3363 }
- 3364
- 3365 rval = CudaNdarray_New();
- 3366
- 3367 if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
- 3368 {
- 3369         //CudaNdarray_set_nd sets the error msg
- 3370 return NULL;
- 3371 }
- 3372     // set gpu pointer
- 3373 assert(((CudaNdarray *)rval)->data_allocated == 0);
- 3374 if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
- 3375 {
- 3376         PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
- 3377 return NULL;
- 3378
- 3379 }
- 3380
- 3381 // Set dims and strides
- 3382 for (int i = nd-1; i >= 0; --i)
- 3383 {
- 3384 PyObject * idx = PyLong_FromLong(i);
- 3385 if (idx == NULL)
- 3386 {
- 3387 PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
- 3388 return NULL;
- 3389 }
- 3390 PyObject* dim_ = PyObject_GetItem(shapes, idx);
- 3391 PyObject* strd_ = PyObject_GetItem(strides, idx);
- 3392 if (!PyInt_Check(dim_))
- 3393 {
- 3394 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
- 3395 return NULL;
- 3396 }
- 3397 if (!PyInt_Check(strd_))
- 3398 {
- 3399 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
- 3400 return NULL;
- 3401 }
- 3402 int dim = PyInt_AsLong(dim_);
- 3403 int strd = PyInt_AsLong(strd_);
- 3404 CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
- 3405 CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
- 3406 Py_DECREF(idx);
- 3407 Py_DECREF(dim_);
- 3408 Py_DECREF(strd_);
- 3409 }
- 3410 if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n");
- 3411 return rval;
- 3412 }
- 3413
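// Editor's sketch (assumption, not in the original file): the strides passed
// to CudaNdarray_from_gpu_pointer are in elements, matching the rest of this
// file. For a c-contiguous float32 buffer they can be derived from the shape
// as below. Disabled illustration, names hypothetical:
#if 0
static void c_contiguous_strides(int nd, const int* dims, int* strides)
{
    int stride = 1;
    for (int i = nd - 1; i >= 0; --i)
    {
        strides[i] = stride;      // innermost dimension has stride 1 (element)
        stride *= dims[i];
    }
}
#endif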
- 3414 PyObject *
- 3415 CudaNdarray_Dot(PyObject* _unused, PyObject* args)
- 3416 {
- 3417 PyObject *l=NULL;
- 3418 PyObject *r=NULL;
- 3419 PyObject * rval = NULL;
- 3420
- 3421 //args should consist of two python objects ("OO")
- 3422 if (! PyArg_ParseTuple(args, "OO", &l, &r))
- 3423 return NULL;
- 3424
- 3425 if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
- 3426 {
- 3427 PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required ");
- 3428 goto CudaNdarray_dot_fail;
- 3429 }
- 3430 if (((CudaNdarray*)l)->nd != 2)
- 3431 {
- 3432 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
- 3433 goto CudaNdarray_dot_fail;
- 3434 }
- 3435 if (((CudaNdarray*)r)->nd != 2)
- 3436 {
- 3437 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
- 3438 goto CudaNdarray_dot_fail;
- 3439 }
- 3440 rval = CudaNdarray_New();
- 3441 if (!rval)
- 3442 {
- 3443 goto CudaNdarray_dot_fail;
- 3444 }
- 3445 int dims[2];
- 3446 dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0];
- 3447 dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1];
- 3448 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims))
- 3449 {
- 3450 goto CudaNdarray_dot_fail;
- 3451 }
- 3452 if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval))
- 3453 {
- 3454 goto CudaNdarray_dot_fail;
- 3455 }
- 3456
- 3457 return rval;
- 3458
- 3459 CudaNdarray_dot_fail:
- 3460 Py_XDECREF(rval);
- 3461 return NULL;
- 3462 }
- 3463
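// Editor's sketch (illustrative only, not in the original file): the contract
// of CudaNdarray_Dot above for two 2-d inputs l (M x K) and r (K x N): the
// result is allocated as M x N and filled by a single gemm with alpha=1 and
// beta=0, i.e. rval = 1 * (l dot r) + 0 * rval. Disabled illustration:
#if 0
static void dot_output_dims(CudaNdarray* l, CudaNdarray* r, int* out_dims)
{
    out_dims[0] = CudaNdarray_HOST_DIMS(l)[0];   // M rows of the result
    out_dims[1] = CudaNdarray_HOST_DIMS(r)[1];   // N columns of the result
    // The result is then filled by CudaNdarray_gemm(1.0f, l, r, 0.0f, rval).
}
#endif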
- 3464 static PyObject *
- 3465 filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
- 3466 {
- 3467 /*
- 3468 * TODO: DOC what this function should do in the various cases of
- 3469      * TODO: DOC what this function should do in the various input cases.
- 3470 * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases?
- 3471 *
- 3472 */
- 3473 PyObject *py_data=NULL;
- 3474 PyArrayObject * data = NULL;
- 3475 int strict = 0;
- 3476 PyObject * broadcastable=NULL;
- 3477 PyObject * storage=NULL;
- 3478 CudaNdarray * rval=NULL;
- 3479
- 3480 //References returned by PyArg_ParseTuple below are borrowed references
- 3481 if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL;
- 3482
- 3483 if (!PyTuple_Check(broadcastable)){
- 3484 PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int.");
- 3485 return NULL;
- 3486 }
- 3487 Py_INCREF(py_data);
- 3488 Py_INCREF(broadcastable);
- 3489
- 3490 CudaNdarray * cnda = (CudaNdarray*)py_data;
- 3491
- 3492 if (strict || CudaNdarray_Check(py_data))
- 3493 {
- 3494 //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
- 3495 if (!CudaNdarray_Check(py_data))
- 3496 {
- 3497 Py_DECREF(py_data);
- 3498 Py_DECREF(broadcastable);
- 3499 PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray");
- 3500 return NULL;
- 3501 }
- 3502 if (cnda->nd != PyTuple_Size(broadcastable))
- 3503 {
- 3504 Py_DECREF(py_data);
- 3505 Py_DECREF(broadcastable);
- 3506 PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable));
- 3507 return NULL;
- 3508 }
- 3509 for (int i = 0; i < cnda->nd; ++i)
- 3510 {
- 3511 if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
- 3512 {
- 3513 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
- 3514 Py_DECREF(py_data);
- 3515 Py_DECREF(broadcastable);
- 3516 return NULL;
- 3517 }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){
- 3518 PyErr_Format(PyExc_TypeError, "Non-zero stride (%d) on dimension %d of size 1",
- 3519 CudaNdarray_HOST_STRIDES(cnda)[i], i);
- 3520 Py_DECREF(py_data);
- 3521 Py_DECREF(broadcastable);
- 3522 return NULL;
- 3523 }
- 3524 }
- 3525 Py_DECREF(broadcastable);
- 3526 return py_data;
- 3527 }
- 3528 else
- 3529 {
- 3530 data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable));
- 3531 if (!data)
- 3532 {
- 3533 //err message already defined
- 3534 Py_DECREF(py_data);
- 3535 Py_DECREF(broadcastable);
- 3536 return NULL;
- 3537 }
- 3538 for (int i = 0; i < PyArray_NDIM(data); ++i)
- 3539 {
- 3540 if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
- 3541 {
- 3542 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i);
- 3543 Py_DECREF(data);
- 3544 Py_DECREF(py_data);
- 3545 Py_DECREF(broadcastable);
- 3546 return NULL;
- 3547 }
- 3548 }
- 3549 if (storage && CudaNdarray_Check(storage))
- 3550 {
- 3551 rval = (CudaNdarray*) storage;
- 3552 Py_INCREF(rval);
- 3553 }
- 3554 else
- 3555 {
- 3556 rval = (CudaNdarray*) CudaNdarray_New();
- 3557 }
- 3558 if (rval)
- 3559 {
- 3560 if (CudaNdarray_CopyFromArray(rval, data))
- 3561 {
- 3562 Py_DECREF(rval);
- 3563 rval = NULL;
- 3564 }
- 3565 }
- 3566 Py_DECREF(data);
- 3567 Py_DECREF(py_data);
- 3568 Py_DECREF(broadcastable);
- 3569 return (PyObject*)rval;
- 3570 }
- 3571 }
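- /*
-  * Note (added commentary, not in the original source): filter() mirrors the
-  * Python-side Type.filter() contract. When strict is set, or the input is
-  * already a CudaNdarray, the array is returned as-is after the rank and
-  * broadcastable checks above; otherwise the input is converted through a
-  * float32 ndarray and copied to the device, reusing `storage` as the
-  * destination when it is itself a CudaNdarray.
-  */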
- 3572
- 3573 //TODO-- CudaNdarray_Dot and CudaNdarray_active_device_name are following different capitalization conventions.
- 3574 // Pick one and standardize it, this file is already annoying enough to grep through
- 3575 static PyMethodDef module_methods[] = {
- 3576 {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
- 3577 {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
- 3578 {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
- 3579 {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before gpu_init call and let the driver choose the GPU."},
- 3580 {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
- 3581 {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
- 3582 {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
- 3583 {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
- 3584 {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
- 3585 #if COMPUTE_GPU_MEM_USED
- 3586 {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of memory Theano currently have allocated on the gpu."},
- 3587 #endif
- 3588 {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the size of gpu pointer, cpu pointer and int in bytes."},
- 3589 {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
- 3590 {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than free's"},
- 3591 {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from already allocated memory on the gpu.(example by pycuda)"},
- 3592 {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"},
- 3593 {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS,
- 3594 "Used to know if this version of cuda_ndarray is linked with cublas v2."},
- 3595 {NULL, NULL, 0, NULL} /* Sentinel */
- 3596 };
- 3597
- 3598 #define CNDA_MOD_NAME "cuda_ndarray"
- 3599 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object."
- 3600
- 3601 #if PY_MAJOR_VERSION == 3
- 3602 static struct PyModuleDef cuda_ndarray_moduledef =
- 3603 {
- 3604 PyModuleDef_HEAD_INIT,
- 3605 CNDA_MOD_NAME,
- 3606 CNDA_DOCSTRING,
- 3607 -1, /* size of per-interpreter state of the module,
- 3608 or -1 if the module keeps state in global variables. */
- 3609 module_methods
- 3610 };
- 3611
- 3612 PyMODINIT_FUNC
- 3613 PyInit_cuda_ndarray(void)
- 3614 #else
- 3615 PyMODINIT_FUNC
- 3616 initcuda_ndarray(void)
- 3617 #endif
- 3618 {
- 3619 import_array();
- 3620
- 3621 PyObject* m;
- 3622
- 3623 if (PyType_Ready(&CudaNdarrayType) < 0) {
- 3624 #if PY_MAJOR_VERSION == 3
- 3625 return NULL;
- 3626 #else
- 3627 return;
- 3628 #endif
- 3629 }
- 3630
- 3631 #if PY_MAJOR_VERSION == 3
- 3632 m = PyModule_Create(&cuda_ndarray_moduledef);
- 3633 #else
- 3634 m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING);
- 3635 #endif
- 3636
- 3637 if (m == NULL) {
- 3638 #if PY_MAJOR_VERSION == 3
- 3639 return NULL;
- 3640 #else
- 3641 return;
- 3642 #endif
- 3643 }
- 3644
- 3645 Py_INCREF(&CudaNdarrayType);
- 3646 PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
- 3647 #if COMPUTE_GPU_MEM_USED
- 3648 for(int i=0;i<TABLE_SIZE;i++){
- 3649 _alloc_size_table[i].ptr=NULL;
- 3650 _alloc_size_table[i].size=0;
- 3651 }
- 3652 #endif
- 3653 // cublasInit();
- 3654 //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
- 3655 //{
- 3656 //std::cerr << "WARNING: initcuda_ndarray: error initializing device\n";
- 3657 //}
- 3658 if (0) //TODO: is this necessary?
- 3659 {
- 3660 int deviceId = 0; // TODO: what number goes here?
- 3661 cudaSetDevice(deviceId);
- 3662 cudaError_t err = cudaGetLastError();
- 3663 if( cudaSuccess != err)
- 3664 {
- 3665 std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
- 3666 }
- 3667 }
- 3668
- 3669 #if PY_MAJOR_VERSION == 3
- 3670 return m;
- 3671 #endif
- 3672 }
- 3673
- 3674
- 3675 //////////////////////////////////////
- 3676 //
- 3677 // C API FOR CudaNdarray
- 3678 //
- 3679 //////////////////////////////////////
- 3680
- 3681 int
- 3682 CudaNdarray_Check(const PyObject * ob)
- 3683 {
- 3684 //TODO: doesn't work with inheritance
- 3685 return CudaNdarray_CheckExact(ob);
- 3686 }
- 3687 int
- 3688 CudaNdarray_CheckExact(const PyObject * ob)
- 3689 {
- 3690 return ((Py_TYPE(ob) == &CudaNdarrayType) ? 1 : 0);
- 3691 }
- 3692
- 3693 PyObject *
- 3694 CudaNdarray_New(int nd)
- 3695 {
- 3696 CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
- 3697 if (self == NULL)
- 3698 {
- 3699 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
- 3700 return NULL;
- 3701 }
- 3702 CudaNdarray_null_init(self);
- 3703
- 3704 if (nd == 0)
- 3705 {
- 3706 self->nd = 0;
- 3707 }
- 3708 else if (nd > 0)
- 3709 {
- 3710 if (CudaNdarray_set_nd(self, nd))
- 3711 {
- 3712 Py_DECREF(self);
- 3713 return NULL;
- 3714 }
- 3715 }
- 3716 ++_outstanding_mallocs[1];
- 3717 return (PyObject *)self;
- 3718 }
- 3719
- 3720
- 3721
- 3722 //////////////////////////////
- 3723 //
- 3724 // Published helper functions
- 3725 //
- 3726 //////////////////////////////
- 3727
- 3728 static int
- 3729 cublas_init()
- 3730 {
- 3731 cublasStatus_t err;
- 3732 err = cublasCreate(&handle);
- 3733 if (CUBLAS_STATUS_SUCCESS != err)
- 3734 {
- 3735 if(CUBLAS_STATUS_NOT_INITIALIZED == err)
- 3736 PyErr_SetString(PyExc_RuntimeError,
- 3737 "cublasCreate() returned this error "
- 3738 "'the CUDA Runtime initialization failed'");
- 3739 else if(CUBLAS_STATUS_ALLOC_FAILED == err)
- 3740 PyErr_SetString(PyExc_RuntimeError,
- 3741 "cublasCreate() returned this error "
- 3742 "'the resources could not be allocated'");
- 3743 else
- 3744 PyErr_SetString(PyExc_RuntimeError,
- 3745 "unknow error during returned by cublasCreate()");
- 3746 return -1;
- 3747 }
- 3748 // Set the default stream as the one to execute on (default)
- 3749 cublasSetStream(handle, NULL);
- 3750 // Pointers to scalars are on the host (also the default)
- 3751 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
- 3752 #if CUDA_VERSION >= 5000
- 3753 // atomics can be used in kernels to speed up operations (not default)
- 3754 // This may lead to a slight variance from run to run in some operations
- 3755 cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
- 3756 #endif
- 3757 return 0;
- 3758 }
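- /*
-  * Illustrative sketch (assumption, not in the original source): the blas
-  * wrappers below (gemm, gemv, ger) all go through the global `handle`, so a
-  * caller would pair the two helpers above roughly like this:
-  *
-  *   if (cublas_init() != 0) return -1;   // Python error already set
-  *   // ... CudaNdarray_gemm(...), CudaNdarray_sgemv(...) ...
-  *   cublas_shutdown();
-  */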
- 3759
- 3760 static void
- 3761 cublas_shutdown()
- 3762 {
- 3763 if (handle != NULL)
- 3764 cublasDestroy(handle);
- 3765 // No point in handling any errors here
- 3766 handle = NULL;
- 3767 }
- 3768
- 3769 int
- 3770 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
- 3771 {
- 3772 int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj),
- 3773 PyArray_DIMS(obj));
- 3774 if (err) {
- 3775 return err;
- 3776 }
- 3777
- 3778 int typenum = PyArray_TYPE(obj);
- 3779 if (typenum != REAL_TYPENUM)
- 3780 {
- 3781 PyErr_SetString(PyExc_TypeError, "can only copy from float arrays");
- 3782 return -1;
- 3783 }
- 3784 assert( 4 == PyArray_ITEMSIZE(obj));
- 3785 PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny(
- 3786 (PyObject*)obj, typenum, self->nd, self->nd);
- 3787 if (!py_src) {
- 3788 return -1;
- 3789 }
- 3790 npy_intp py_src_size = PyArray_SIZE(py_src);
- 3791 void *py_src_data = PyArray_DATA(py_src);
- 3792 cudaError_t cerr;
- 3793 CNDA_BEGIN_ALLOW_THREADS;
- 3794 cerr = cudaMemcpy(self->devdata, py_src_data,
- 3795 py_src_size * sizeof(real),
- 3796 cudaMemcpyHostToDevice);
- 3797 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
- 3798 CNDA_END_ALLOW_THREADS;
- 3799 if (cudaSuccess != cerr)
- 3800 {
- 3801 PyErr_Format(PyExc_RuntimeError,
- 3802 "Cuda error '%s' while copying %lli data element"
- 3803 " to device memory",
- 3804 cudaGetErrorString(cerr),
- 3805 (long long)py_src_size);
- 3806 Py_DECREF(py_src);
- 3807 return -1;
- 3808 }
- 3809 Py_DECREF(py_src);
- 3810 return 0;
- 3811 }
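- /*
-  * Illustrative sketch (assumption, not in the original source): a typical
-  * host-to-device upload with the helper above, assuming `np_arr` is a
-  * float32 PyArrayObject obtained elsewhere:
-  *
-  *   CudaNdarray *dst = (CudaNdarray*) CudaNdarray_New();
-  *   if (dst && CudaNdarray_CopyFromArray(dst, np_arr) != 0)
-  *   {
-  *       Py_DECREF(dst);   // copy failed, error already set
-  *       dst = NULL;
-  *   }
-  */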
- 3812
- 3813 PyObject *
- 3814 CudaNdarray_new_nd(int nd)
- 3815 {
- 3816 CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New();
- 3817 if (!rval || CudaNdarray_set_nd(rval, nd))
- 3818 {
- 3819 Py_XDECREF(rval);
- 3820 rval = NULL;
- 3821 }
- 3822 return (PyObject *) rval;
- 3823 }
- 3824
- 3825
- 3826 /**
- 3827 * Initialize 'self' as a view of 'base', with memory storage 'data'
- 3828 */
- 3829
- 3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
- 3831 {
- 3832 if (self->data_allocated)
- 3833 {
- 3834 assert(self->devdata);
- 3835 if (device_free(self->devdata))
- 3836 {
- 3837 self->devdata = NULL;
- 3838 self->data_allocated = 0;
- 3839 return -1;
- 3840 }
- 3841 }
- 3842 // Get the original base object (base.base.base...)
- 3843 PyObject * orig_base = base;
- 3844 // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
- 3845 while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
- 3846 {
- 3847 // orig_base is itself a view; keep following the chain to the true base
- 3848 orig_base = ((CudaNdarray*) orig_base)->base;
- 3849 }
- 3850 //N.B. XDECREF and XINCREF are no-ops for NULL pointers
- 3851 if (self->base != orig_base)
- 3852 {
- 3853 Py_XDECREF(self->base);
- 3854 self->base = orig_base;
- 3855 Py_XINCREF(self->base);
- 3856 }
- 3857 self->data_allocated = 0;
- 3858 self->devdata = data;
- 3859 return 0;
- 3860 }
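- /*
-  * Note (added commentary, not in the original source): after
-  * CudaNdarray_set_device_data the array is a view: data_allocated stays 0,
-  * so the view does not own the device pointer, and the INCREF'd `base`
-  * (unwound to the original owner above) is what keeps the underlying
-  * allocation alive.
-  */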
- 3861
- 3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy)
- 3863 {
- 3864 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x)
- 3865 {
- 3866 y[i*sy] = x[i*sx];
- 3867 }
- 3868 }
- 3869
- 3870 // N1 through N4 are the size of y
- 3871 static __global__ void k_copy_4d(const int N1,
- 3872 const int N2, const int N3, const int N4,
- 3873 const float * x, const int sx1, const int sx2, const int sx3,
- 3874 const int sx4, float * y, const int sy1, const int sy2,
- 3875 const int sy3, const int sy4)
- 3876 {
- 3877 // These must be made int instead of unsigned int due to a bug in nvcc
- 3878 int bx = blockIdx.x;
- 3879 int by = blockIdx.y;
- 3880
- 3881 for (int i = bx; i < N1; i += gridDim.x)
- 3882 {
- 3883 for (int j = by; j < N2; j += gridDim.y)
- 3884 {
- 3885 for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
- 3886 {
- 3887 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
- 3888 {
- 3889 y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
- 3890 x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
- 3891 }
- 3892 }
- 3893 }
- 3894 }
- 3895 }
- 3896
- 3897 //copy from other into self
- 3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
- 3899 const CudaNdarray * other,
- 3900 bool unbroadcast)
- 3901 {
- 3902 int verbose = 0;
- 3903 if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
- 3904
- 3905 //standard elemwise size checks
- 3906 if (self->nd == -1)
- 3907 {
- 3908 PyErr_SetString(PyExc_TypeError,
- 3909 "can't copy into un-initialized CudaNdarray");
- 3910 return -1;
- 3911 }
- 3912 CudaNdarray * new_other = NULL;
- 3913
- 3914 if (self->nd < other->nd)
- 3915 {
- 3916 PyErr_Format(PyExc_NotImplementedError,
- 3917 "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the "
- 3918 "destination needs to be >= the number of dimensions of the "
- 3919 "source. Got %d and %d.", self->nd, other->nd);
- 3920 return -1;
- 3921 }
- 3922 else if (self->nd != other->nd)
- 3923 {
- 3924 new_other = (CudaNdarray *) CudaNdarray_View(other);
- 3925 int added_dims = self->nd - other->nd;
- 3926 int* pattern = (int*) alloca(self->nd * sizeof(int));
- 3927 for(int i = 0; i < added_dims; i++)
- 3928 pattern[i] = -1;
- 3929 for(int i = 0; i < other->nd; i++)
- 3930 pattern[i + added_dims] = i;
- 3931 CudaNdarray_dimshuffle(new_other, self->nd, pattern);
- 3932 other = new_other;
- 3933 }
- 3934 assert(self->nd == other->nd);
- 3935 //standard elemwise dim checks (also compute total size)
- 3936 unsigned int size = 1;
- 3937 unsigned int size_source = 1;
- 3938 for (int i = 0; i< self->nd; ++i)
- 3939 {
- 3940 if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
- 3941 && (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
- 3942 {
- 3943 PyErr_Format(PyExc_ValueError,
- 3944 "CudaNdarray_CopyFromCudaNdarray:"
- 3945 " need same dimensions for dim %d,"
- 3946 " destination=%d, source=%d",
- 3947 i, CudaNdarray_HOST_DIMS(self)[i],
- 3948 CudaNdarray_HOST_DIMS(other)[i]);
- 3949 Py_XDECREF(new_other);
- 3950 return -1;
- 3951 }
- 3952 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
- 3953 size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
- 3954 }
- 3955 if (0 == size)
- 3956 {
- 3957 Py_XDECREF(new_other);
- 3958 return 0; //nothing to copy, we're done.
- 3959 }
- 3960 if (CudaNdarray_is_c_contiguous(self) &&
- 3961 CudaNdarray_is_c_contiguous(other) &&
- 3962 size == size_source)
- 3963 {
- 3964 if (verbose)
- 3965 fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
- 3966
- 3967 cublasStatus_t err;
- 3968 err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
- 3969 CudaNdarray_DEV_DATA(self), 1);
- 3970 CNDA_THREAD_SYNC;
- 3971 Py_XDECREF(new_other);
- 3972 if (CUBLAS_STATUS_SUCCESS != err)
- 3973 {
- 3974 PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
- 3975 return -1;
- 3976 }
- 3977 return 0;
- 3978 }
- 3979 //TODO: rewrite these copy operations to be more efficient
- 3980 // See, for example the transpose example in the cuda_sdk.
- 3981 switch (self->nd)
- 3982 {
- 3983 case 0: // scalar
- 3984 {
- 3985 // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
- 3986 assert(0);
- 3987 }; break;
- 3988 case 1: // vector
- 3989 {
- 3990 if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
- 3991 if (verbose) fprint_CudaNdarray(stderr, other);
- 3992 unsigned int n_blocks = std::min(size,
- 3993 (unsigned int)NUM_VECTOR_OP_BLOCKS);
- 3994 unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
- 3995 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
- 3996 k_copy_1d<<<n_blocks, n_threads>>>(size,
- 3997 CudaNdarray_DEV_DATA(other),
- 3998 CudaNdarray_HOST_STRIDES(other)[0],
- 3999 CudaNdarray_DEV_DATA(self),
- 4000 CudaNdarray_HOST_STRIDES(self)[0]);
- 4001 CNDA_THREAD_SYNC;
- 4002 cudaError_t err = cudaGetLastError();
- 4003 if( cudaSuccess != err)
- 4004 {
- 4005 PyErr_Format(PyExc_RuntimeError,
- 4006 "Cuda error: %s: %s. (n_blocks=%i,"
- 4007 " n_threads_per_block=%i)\n", "k_copy_1d",
- 4008 cudaGetErrorString(err), n_blocks, n_threads);
- 4009 Py_XDECREF(new_other);
- 4010 return -1;
- 4011 }
- 4012 }; break;
- 4013 case 4: // 4-tensor
- 4014 {
- 4015 if (verbose)
- 4016 {
- 4017 if (0 != fprint_CudaNdarray(stderr, other))
- 4018 {
- 4019 Py_XDECREF(new_other);
- 4020 return -1;
- 4021 }
- 4022 }
- 4023
- 4024 // The blocks implement the looping over the first two axes so
- 4025 // this needs to be (N1, N2)
- 4026 dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
- 4027 NUM_VECTOR_OP_BLOCKS),
- 4028 std::min(CudaNdarray_HOST_DIMS(self)[1],
- 4029 NUM_VECTOR_OP_BLOCKS));
- 4030 // For the threads, just make as many as possible
- 4031 dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
- 4032 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
- 4033 std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
- 4034 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
- 4035
- 4036 n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
- 4037 n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
- 4038
- 4039 k_copy_4d<<<n_blocks, n_threads>>>(
- 4040 // size of y
- 4041 (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
- 4042 (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
- 4043 (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
- 4044 (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
- 4045 CudaNdarray_DEV_DATA(other), // x
- 4046 // x strides
- 4047 CudaNdarray_HOST_STRIDES(other)[0],
- 4048 CudaNdarray_HOST_STRIDES(other)[1],
- 4049 CudaNdarray_HOST_STRIDES(other)[2],
- 4050 CudaNdarray_HOST_STRIDES(other)[3],
- 4051 CudaNdarray_DEV_DATA(self), // y
- 4052 // y strides
- 4053 CudaNdarray_HOST_STRIDES(self)[0],
- 4054 CudaNdarray_HOST_STRIDES(self)[1],
- 4055 CudaNdarray_HOST_STRIDES(self)[2],
- 4056 CudaNdarray_HOST_STRIDES(self)[3]
- 4057 );
- 4058 CNDA_THREAD_SYNC;
- 4059 cudaError_t err = cudaGetLastError();
- 4060 if( cudaSuccess != err)
- 4061 {
- 4062 PyErr_Format(PyExc_RuntimeError,
- 4063 "Cuda error: %s: %s.",
- 4064 "k_copy_4d",
- 4065 cudaGetErrorString(err));
- 4066 Py_XDECREF(new_other);
- 4067 return -1;
- 4068 }
- 4069 }; break;
- 4070 default:
- 4071 {
- 4072 cudaError_t err = cudaGetLastError();
- 4073 if(cudaSuccess != err){
- 4074 PyErr_Format(PyExc_RuntimeError,
- 4075 "Unexpected Cuda error: %s: %s\n",
- 4076 "CudaNdarray_CopyFromCudaNdarray",
- 4077 cudaGetErrorString(err));
- 4078 Py_XDECREF(new_other);
- 4079 return -1;
- 4080 }
- 4081
- 4082 if (verbose)
- 4083 fprintf(stderr,
- 4084 "Copying with default version unbroadcast=%d\n",
- 4085 unbroadcast);
- 4086 // call worker routine
- 4087 unsigned int threads_per_block = std::min(size,
- 4088 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
- 4089 unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
- 4090 (unsigned int)NUM_VECTOR_OP_BLOCKS);
- 4091 const CudaNdarray * cuda_dims = other;
- 4092 if(unbroadcast)
- 4093 cuda_dims = self;
- 4094 //copy from other into self
- 4095 k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
- 4096 size,
- 4097 (unsigned int)other->nd,
- 4098 (const int *)CudaNdarray_DEV_DIMS(cuda_dims),
- 4099 (const float*)CudaNdarray_DEV_DATA(other),
- 4100 (const int *)CudaNdarray_DEV_STRIDES(other),
- 4101 CudaNdarray_DEV_DATA(self),
- 4102 (const int *)CudaNdarray_DEV_STRIDES(self));
- 4103 CNDA_THREAD_SYNC;
- 4104 err = cudaGetLastError();
- 4105 if(verbose>1)
- 4106 fprintf(stderr,
- 4107 "INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
- 4108 " n_threads_per_block=%i)\n",
- 4109 n_blocks, threads_per_block);
- 4110 if( cudaSuccess != err)
- 4111 {
- 4112 //fprint_CudaNdarray(stderr, self);
- 4113 //fprint_CudaNdarray(stderr, other);
- 4114 PyErr_Format(PyExc_RuntimeError,
- 4115 "Cuda error: %s: %s. (n_blocks=%i,"
- 4116 " n_threads_per_block=%i)\n",
- 4117 "k_elemwise_unary_rowmajor_copy",
- 4118 cudaGetErrorString(err), n_blocks,
- 4119 threads_per_block);
- 4120 Py_XDECREF(new_other);
- 4121 return -1;
- 4122 }
- 4123 }
- 4124 };
- 4125 Py_XDECREF(new_other);
- 4126 return 0;
- 4127 }
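- /*
-  * Illustrative sketch (assumption, not in the original source): with
-  * unbroadcast=true a size-1 source dimension is broadcast over the
-  * destination, e.g. copying a (1, 5) row into every row of a (4, 5) array:
-  *
-  *   // dst->nd == 2 with dims {4, 5}, src->nd == 2 with dims {1, 5}
-  *   if (CudaNdarray_CopyFromCudaNdarray(dst, src, true) != 0)
-  *       return -1;   // Python error already set
-  */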
- 4128
- 4129 int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
- 4130 {
- 4131 if (A->nd != 2)
- 4132 {
- 4133 PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
- 4134 return -1;
- 4135 }
- 4136 if (B->nd != 2)
- 4137 {
- 4138 PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
- 4139 return -1;
- 4140 }
- 4141 if (C->nd != 2)
- 4142 {
- 4143 PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
- 4144 return -1;
- 4145 }
- 4146
- 4147 // We must allow dimensions to be zeros.
- 4148 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
- 4149 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
- 4150 || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
- 4151 {
- 4152 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
- 4153 CudaNdarray_HOST_DIMS(A)[0],
- 4154 CudaNdarray_HOST_DIMS(A)[1],
- 4155 CudaNdarray_HOST_DIMS(B)[0],
- 4156 CudaNdarray_HOST_DIMS(B)[1],
- 4157 CudaNdarray_HOST_DIMS(C)[0],
- 4158 CudaNdarray_HOST_DIMS(C)[1]);
- 4159 return -1;
- 4160 }
- 4161
- 4162 // If matrix A or B has non-unit size and non-unit stride in both
- 4163 // dimensions, or a negative stride, we cannot use it directly, but we can make a copy.
- 4164 CudaNdarray * A_new = NULL;
- 4165 CudaNdarray * B_new = NULL;
- 4166 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
- 4167 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
- 4168 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
- 4169 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
- 4170 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
- 4171 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
- 4172 {
- 4173 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
- 4174 if (!A_new)
- 4175 return -1;
- 4176 A = A_new;
- 4177 }
- 4178
- 4179 if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
- 4180 && (CudaNdarray_HOST_STRIDES(B)[0] != 1)
- 4181 && (CudaNdarray_HOST_DIMS(B)[1] > 1)
- 4182 && (CudaNdarray_HOST_STRIDES(B)[1] != 1))
- 4183 || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
- 4184 || (CudaNdarray_HOST_STRIDES(B)[1] < 0))
- 4185 {
- 4186 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
- 4187 if (!B_new)
- 4188 {
- 4189 // If A_new is NULL (meaning A was not copied), Py_XDECREF does nothing
- 4190 Py_XDECREF(A_new);
- 4191 return -1;
- 4192 }
- 4193 B = B_new;
- 4194 }
- 4195
- 4196 // If matrix C has non-unit size and non-unit stride in both
- 4197 // dimensions, or negative strides, we can't operate. We cannot copy
- 4198 // C either, because the calling code will expect the result to be
- 4199 // in the original C container.
- 4200 if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
- 4201 && (CudaNdarray_HOST_STRIDES(C)[0] != 1)
- 4202 && (CudaNdarray_HOST_DIMS(C)[1] > 1)
- 4203 && (CudaNdarray_HOST_STRIDES(C)[1] != 1))
- 4204 || (CudaNdarray_HOST_STRIDES(C)[0] < 0)
- 4205 || (CudaNdarray_HOST_STRIDES(C)[1] < 0))
- 4206 {
- 4207 PyErr_Format(PyExc_AssertionError,
- 4208 "non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
- 4209 CudaNdarray_HOST_STRIDES(C)[0],
- 4210 CudaNdarray_HOST_STRIDES(C)[1],
- 4211 CudaNdarray_HOST_DIMS(C)[0],
- 4212 CudaNdarray_HOST_DIMS(C)[1]);
- 4213 Py_XDECREF(A_new);
- 4214 Py_XDECREF(B_new);
- 4215 return -1;
- 4216 }
- 4217
- 4218 // the unit integer is divided logically into three fields of 4 bits
- 4219 // the lowermost 4 bits encode the stride pattern of the output
- 4220 // the next higher 4 bits encode the B variable (or y)
- 4221 // the next higher 4 bits encode the A variable (or x)
- 4222 //
- 4223 // the stride pattern for each input is encoded as 0 for unit stride from col to col (Row major)
- 4224 // 1 for unit stride from row to row (Col major)
- 4225
- 4226 // a stride of 0 implies a dimension of 1 - so we can actually define
- 4227 // a stride of 0 as a 'unit' stride because gemm will never use it.
- 4228 // If a dimension is 0, its stride will not be used either, so we can
- 4229 // consider it a 'unit' stride too.
- 4230 int unit = 0;
- 4231 if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
- 4232 unit |= (0x0 << 8);
- 4233 } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
- 4234 unit |= (0x1 << 8);
- 4235 } else {
- 4236 unit |= (0x2 << 8);
- 4237 }
- 4238 if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
- 4239 unit |= (0x0 << 4);
- 4240 } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
- 4241 unit |= (0x1 << 4);
- 4242 } else {
- 4243 unit |= (0x2 << 4);
- 4244 }
- 4245 if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
- 4246 unit |= (0x0 << 0);
- 4247 } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
- 4248 unit |= (0x1 << 0);
- 4249 } else {
- 4250 unit |= (0x2 << 0);
- 4251 }
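- /*
-  * Worked example (added commentary, not in the original source): a
-  * C-contiguous A (stride over its last dim == 1) contributes 0x0 in bits
-  * 8-11, a column-major B (stride over its first dim == 1) contributes
-  * 0x1 << 4, and a C-contiguous output contributes 0x0, giving
-  * unit == 0x010, which selects the (T, N) cublasSgemm case in the switch
-  * below.
-  */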
- 4252
- 4253 /* create appropriate strides for malformed matrices that are row or column
- 4254 * vectors
- 4255 */
- 4256 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
- 4257 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
- 4258 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
- 4259 int sb_1 = (CudaNdarray_HOST_DIMS(B)[1] > 1) ? CudaNdarray_HOST_STRIDES(B)[1] : CudaNdarray_HOST_DIMS(B)[0];
- 4260 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
- 4261 int sc_1 = (CudaNdarray_HOST_DIMS(C)[1] > 1) ? CudaNdarray_HOST_STRIDES(C)[1] : CudaNdarray_HOST_DIMS(C)[0];
- 4262
- 4263 float* a = CudaNdarray_DEV_DATA(A);
- 4264 float* b = CudaNdarray_DEV_DATA(B);
- 4265 float* c = CudaNdarray_DEV_DATA(C);
- 4266 cublasOperation_t N = CUBLAS_OP_N;
- 4267 cublasOperation_t T = CUBLAS_OP_T;
- 4268 //std::cerr << (unit/256) % 16 << (unit/16) % 16 << (unit % 16) << '\n';
- 4269 // There should be no negative stride at that point
- 4270 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
- 4271 if (sx == 0){sx = 1;}\
- 4272 if (sy == 0){sy = 1;}\
- 4273 if (sz == 0){sz = 1;}\
- 4274 if ((sx > 0) && (sy > 0) && (sz > 0)) { \
- 4275 err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
- 4276 } else { \
- 4277 PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
- 4278 Py_XDECREF(A_new);\
- 4279 Py_XDECREF(B_new);\
- 4280 return -1; \
- 4281 }
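- /*
-  * Note (added commentary, not in the original source): cublasSgemm works in
-  * column-major order, so a row-major C = A.B is computed as C^T = B^T.A^T by
-  * passing the operands in swapped order (b before a) in the cases below whose
-  * lowest nibble is 0; the cases whose lowest nibble is 1 handle a
-  * column-major output directly. The transpose flag per operand comes from
-  * the `unit` field computed above.
-  */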
- 4282
- 4283 cublasStatus_t err;
- 4284 switch(unit)
- 4285 {
- 4286 case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
- 4287 case 0x100: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_1, beta, c, sc_0); break;
- 4288 case 0x010: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_0, beta, c, sc_0); break;
- 4289 case 0x110: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_1, beta, c, sc_0); break;
- 4290 case 0x001: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_0, beta, c, sc_1); break;
- 4291 case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
- 4292 case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
- 4293 case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
- 4294 default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
- 4295 return -1;
- 4296 };
- 4297 CNDA_THREAD_SYNC;
- 4298 Py_XDECREF(A_new);
- 4299 Py_XDECREF(B_new);
- 4300
- 4301 if (CUBLAS_STATUS_SUCCESS != err)
- 4302 {
- 4303 PyErr_Format(PyExc_RuntimeError,
- 4304 "cublasSgemm failed (%i) %s\n"
- 4305 " unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
- 4306 " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
- 4307 err, cublasGetErrorString(err),
- 4308 unit, N,
- 4309 CudaNdarray_HOST_DIMS(C)[0],
- 4310 CudaNdarray_HOST_DIMS(C)[1],
- 4311 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
- 4312 alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1);
- 4313
- 4314 return -1;
- 4315 }
- 4316 return 0;
- 4317 }
- 4318
- 4319 int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
- 4320 {
- 4321 /**
- 4322 * C <- alpha A B + beta C
- 4323 * A : matrix
- 4324 * B, C: vector
- 4325 * alpha, beta: scalars
- 4326 */
- 4327 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
- 4328 if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
- 4329 if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
- 4330
- 4331 // We must allow dimensions to be zeros.
- 4332 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
- 4333 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
- 4334 {
- 4335 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
- 4336 CudaNdarray_HOST_DIMS(A)[0],
- 4337 CudaNdarray_HOST_DIMS(A)[1],
- 4338 CudaNdarray_HOST_DIMS(B)[0],
- 4339 CudaNdarray_HOST_DIMS(C)[0]);
- 4340 return -1;
- 4341 }
- 4342
- 4343 // If matrix A has non-unit size and non-unit stride in both
- 4344 // dimensions, or negative strides, we cannot operate, but we can
- 4345 // make a copy.
- 4346 CudaNdarray * A_new = NULL;
- 4347 CudaNdarray * B_new = NULL;
- 4348 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
- 4349 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
- 4350 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
- 4351 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
- 4352 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
- 4353 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
- 4354 {
- 4355 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
- 4356 if (!A_new)
- 4357 return -1;
- 4358 A = A_new;
- 4359 }
- 4360
- 4361 // If vector B has a negative stride, we also have to make a copy.
- 4362 if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
- 4363 {
- 4364 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
- 4365 if (!B_new)
- 4366 {
- 4367 // If A was not copied, A_new is NULL, and Py_XDECREF does not
- 4368 // do anything
- 4369 Py_XDECREF(A_new);
- 4370 return -1;
- 4371 }
- 4372 B = B_new;
- 4373 }
- 4374
- 4375 // cublas does not handle negative strides as expected
- 4376 if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
- 4377 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
- 4378 {
- 4379 PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
- 4380 CudaNdarray_HOST_STRIDES(A)[0],
- 4381 CudaNdarray_HOST_STRIDES(A)[1]);
- 4382 Py_XDECREF(A_new);
- 4383 Py_XDECREF(B_new);
- 4384 return -1;
- 4385 }
- 4386
- 4387 /* create appropriate strides for malformed matrices that are row or column
- 4388 * vectors
- 4389 */
- 4390 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
- 4391 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
- 4392 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
- 4393 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
- 4394
- 4395 if (sa_0 == 0)
- 4396 sa_0 = 1;
- 4397 if (sa_1 == 0)
- 4398 sa_1 = 1;
- 4399
- 4400 // This is important because we can end up not calling Sgemv at all
- 4401 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
- 4402 if (CudaNdarray_SIZE(C)) {
- 4403 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
- 4404 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
- 4405 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
- 4406 {
- 4407 err = cublasSgemv(handle, CUBLAS_OP_N,
- 4408 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
- 4409 &alpha,
- 4410 CudaNdarray_DEV_DATA(A), sa_1,
- 4411 CudaNdarray_DEV_DATA(B), sb_0,
- 4412 &beta,
- 4413 CudaNdarray_DEV_DATA(C), sc_0);
- 4414 }
- 4415 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
- 4416 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
- 4417 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
- 4418 {
- 4419 err = cublasSgemv(handle, CUBLAS_OP_T,
- 4420 CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
- 4421 &alpha,
- 4422 CudaNdarray_DEV_DATA(A), sa_0,
- 4423 CudaNdarray_DEV_DATA(B), sb_0,
- 4424 &beta,
- 4425 CudaNdarray_DEV_DATA(C), sc_0);
- 4426 }
- 4427 else
- 4428 {
- 4429 PyErr_Format(PyExc_AssertionError,
- 4430 "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
- 4431 "Shapes are: (%i, %i) x %i -> %i\n",
- 4432 CudaNdarray_HOST_STRIDES(A)[0],
- 4433 CudaNdarray_HOST_STRIDES(A)[1],
- 4434 CudaNdarray_HOST_STRIDES(B)[0],
- 4435 CudaNdarray_HOST_STRIDES(C)[0],
- 4436 CudaNdarray_HOST_DIMS(A)[0],
- 4437 CudaNdarray_HOST_DIMS(A)[1],
- 4438 CudaNdarray_HOST_DIMS(B)[0],
- 4439 CudaNdarray_HOST_DIMS(C)[0]);
- 4440 Py_XDECREF(A_new);
- 4441 Py_XDECREF(B_new);
- 4442 return -1;
- 4443 }
- 4444 }
- 4445
- 4446 CNDA_THREAD_SYNC;
- 4447 Py_XDECREF(A_new);
- 4448 Py_XDECREF(B_new);
- 4449
- 4450 if (CUBLAS_STATUS_SUCCESS != err)
- 4451 {
- 4452 PyErr_Format(PyExc_RuntimeError,
- 4453 "cublasSgemv failed (%i)",
- 4454 err);
- 4455 return -1;
- 4456 }
- 4457 return 0;
- 4458 }
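- /*
-  * Illustrative sketch (assumption, not in the original source): a
-  * matrix-vector product y <- A.x using the wrapper above, with A of shape
-  * (m, n), x of length n and y of length m, all CudaNdarrays:
-  *
-  *   if (CudaNdarray_sgemv(1.0f, A, x, 0.0f, y) != 0)
-  *       return -1;   // Python error already set
-  */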
- 4459
- 4460 int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
- 4461 if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
- 4462 if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
- 4463 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
- 4464
- 4465 if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
- 4466 || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
- 4467 PyErr_Format(PyExc_ValueError,
- 4468 "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
- 4469 CudaNdarray_HOST_DIMS(x)[0],
- 4470 CudaNdarray_HOST_DIMS(y)[0],
- 4471 CudaNdarray_HOST_DIMS(A)[0],
- 4472 CudaNdarray_HOST_DIMS(A)[1]);
- 4473 return -1;
- 4474 }
- 4475
- 4476 int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
- 4477 CudaNdarray * x_new = NULL;
- 4478 if(x_strides == 0){
- 4479 if(CudaNdarray_HOST_DIMS(x)[0] != 1){
- 4480 PyErr_Format(PyExc_RuntimeError,
- 4481 "CudaNdarray_sger: Invalid input x (should not happen)."
- 4482 " We received a CudaNdarray vector with a stride of 0"
- 4483 " that has more than 1 element!");
- 4484 return -1;
- 4485 }
- 4486 x_strides = 1;
- 4487 } else if(x_strides < 0){
- 4488 x_new = (CudaNdarray*) CudaNdarray_Copy(x);
- 4489 x = x_new;
- 4490 x_strides = CudaNdarray_HOST_STRIDES(x)[0];
- 4491 }
- 4492
- 4493 int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
- 4494 CudaNdarray * y_new = NULL;
- 4495 if(y_strides == 0){
- 4496 if(CudaNdarray_HOST_DIMS(y)[0] != 1){
- 4497 PyErr_Format(PyExc_RuntimeError,
- 4498 "CudaNdarray_sger: Invalid input y (should not happen)."
- 4499 " We received a CudaNdarray vector with a stride of 0"
- 4500 " that has more than 1 elements!");
- 4501 Py_XDECREF(x_new);
- 4502 return -1;
- 4503 }
- 4504 y_strides = 1;
- 4505 } else if(y_strides < 0){
- 4506 y_new = (CudaNdarray*) CudaNdarray_Copy(y);
- 4507 y = y_new;
- 4508 y_strides = CudaNdarray_HOST_STRIDES(y)[0];
- 4509 }
- 4510
- 4511 // Create appropriate strides if A is a row or column vector
- 4512 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
- 4513 : CudaNdarray_HOST_DIMS(A)[1];
- 4514 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
- 4515 : CudaNdarray_HOST_DIMS(A)[0];
- 4516
- 4517 // This is important because we can end up not calling Sger at all
- 4518 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
- 4519 if(CudaNdarray_SIZE(A)){
- 4520 // If A is in col-major
- 4521 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
- 4522 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
- 4523 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
- 4524 {
- 4525 err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
- 4526 CudaNdarray_DEV_DATA(x), x_strides,
- 4527 CudaNdarray_DEV_DATA(y), y_strides,
- 4528 CudaNdarray_DEV_DATA(A), sa_1);
- 4529 }
- 4530 // Since Sger expects A in col-major, we swap x and y to emulate this.
- 4531 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
- 4532 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
- 4533 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
- 4534 {
- 4535 err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
- 4536 CudaNdarray_DEV_DATA(y), y_strides,
- 4537 CudaNdarray_DEV_DATA(x), x_strides,
- 4538 CudaNdarray_DEV_DATA(A), sa_0);
- 4539 }
- 4540 // A has to be either c- or f-contiguous, with no negative strides
- 4541 else
- 4542 {
- 4543 PyErr_SetString(PyExc_NotImplementedError,
- 4544 "non-contiguous A, or negative strides, in sger");
- 4545 Py_XDECREF(x_new);
- 4546 Py_XDECREF(y_new);
- 4547 return -1;
- 4548 }
- 4549 }
- 4550 CNDA_THREAD_SYNC;
- 4551 Py_XDECREF(x_new);
- 4552 Py_XDECREF(y_new);
- 4553
- 4554 if (CUBLAS_STATUS_SUCCESS != err)
- 4555 {
- 4556 PyErr_Format(PyExc_RuntimeError,
- 4557 "cublasSger failed (%i)",
- 4558 err);
- 4559 return -1;
- 4560 }
- 4561
- 4562 return 0;
- 4563 }
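- /*
-  * Illustrative sketch (assumption, not in the original source): a rank-1
-  * update A <- A + alpha * x.y^T with the wrapper above, where x has length
-  * A.dims[0] and y has length A.dims[1]:
-  *
-  *   if (CudaNdarray_sger(0.5f, x, y, A) != 0)
-  *       return -1;   // Python error already set
-  */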
- 4564
- 4565 /**
- 4566 *
- 4567 * Precondition:
- 4568 * a->dim[d] == (dims_a[d]==0) ? (1 << log2_dims_a[d]) : dims_a[d]
- 4569 * z->dim[d] == (z_str[d]==0) ? 1 : dims_a[d];
- 4570 *
- 4571 * TODO: templatize this function to support other reductions.
- 4572 * All that needs to change is the initial value for sum, and the reduction operator.
- 4573 */
- 4574
- 4575 static __global__ void kernel_reduce_sum(const unsigned int size_z,
- 4576 const unsigned int nd,
- 4577 const int * dims_a,
- 4578 const int * log2_dims_a,
- 4579 const int * a_str,
- 4580 const float * a_data,
- 4581 const int * z_str,
- 4582 float * z_data)
- 4583 {
- 4584 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
- 4585 const unsigned int numThreads = blockDim.x * gridDim.x;
- 4586
- 4587 //structure data contains the strides and dimensions of both a and z
- 4588 // a_dim[0], a_dim[1], ... a_dim[nd-1],
- 4589 // a_log2dim[0], a_log2dim[1], ... a_log2dim[nd-1],
- 4590 // a_str[0], ... a_str[nd-1],
- 4591 // z_str[0], ... z_str[nd-1]
- 4592 extern __shared__ int structure_data[];
- 4593 for (unsigned int i = threadIdx.x; i < nd; i += blockDim.x)
- 4594 {
- 4595 structure_data[i+0*nd] = dims_a[i];
- 4596 structure_data[i+1*nd] = log2_dims_a[i];
- 4597 structure_data[i+2*nd] = a_str[i];
- 4598 structure_data[i+3*nd] = z_str[i];
- 4599 }
- 4600 dims_a = structure_data;
- 4601 log2_dims_a = structure_data + nd;
- 4602 a_str = structure_data + 2*nd;
- 4603 z_str = structure_data + 3*nd;
- 4604
- 4605 __syncthreads(); //wait for all the shared structure to be loaded
- 4606
- 4607 for (unsigned int i = idx; i < size_z; i += numThreads)
- 4608 {
- 4609 unsigned int ii = i;
- 4610 const float * a_data_i = a_data;
- 4611 float * z_data_i = z_data;
- 4612 unsigned int n_reduce_elements = 1;
- 4613 unsigned int n_reduce_dims = 0;
- 4614 unsigned int reduce_dim0 = nd-1;
- 4615
- 4616
- 4617 //In this loop, we locate the initial element of the slice that we'd like to reduce with this thread
- 4618 // At the same time, we [re]calculate the size of that slice (n_reduce_elements)
- 4619 for (unsigned int d = 0; d < nd; ++d)
- 4620 {
- 4621 if (a_str[d] && (!z_str[d])) // this means 'd' is a dimension we are reducing over
- 4622 {
- 4623 n_reduce_elements *= dims_a[d];
- 4624 n_reduce_dims += 1;
- 4625 reduce_dim0 = (d < reduce_dim0) ? d : reduce_dim0;
- 4626 }
- 4627 else //'d' is not a dimension that we are reducing over
- 4628 {
- 4629 unsigned int pos_d;
- 4630 if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch
- 4631 {
- 4632 // this branch is not preferred,
- 4633 // because the manual said that integer mod and div operations are slow on gpu
- 4634 pos_d = (ii % dims_a[d]);
- 4635 ii = (ii / dims_a[d]);
- 4636 }
- 4637 else
- 4638 {
- 4639 pos_d = (ii & ((1 << log2_dims_a[d])-1)); //take the lower log2_dims bits
- 4640 ii = (ii >> log2_dims_a[d]); //shift those lower log2_dims bits off of ii
- 4641 }
- 4642 a_data_i += pos_d * a_str[d];
- 4643 z_data_i += pos_d * z_str[d];
- 4644 }
- 4645 }
- 4646 // now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing
- 4647 // do a similar loop
- 4648
- 4649 float sum = 0.0f;
- 4650 switch(n_reduce_dims)
- 4651 {
- 4652 case 0:
- 4653 {
- 4654 sum = a_data_i[0];
- 4655 }
- 4656 break;
- 4657 case 1:
- 4658 {
- 4659 const int stride = a_str[reduce_dim0];
- 4660 const float * a_data_i_max = a_data_i + dims_a[reduce_dim0] * stride;
- 4661 while (a_data_i != a_data_i_max)
- 4662 {
- 4663 sum += a_data_i[0];
- 4664 a_data_i += stride;
- 4665 }
- 4666 }
- 4667 break;
- 4668 case 2:
- 4669 {
- 4670 int rd = reduce_dim0+1;
- 4671 for (; rd < nd; ++rd)
- 4672 {
- 4673 if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over
- 4674 break;
- 4675 }
- 4676 const int stride0 = a_str[reduce_dim0];
- 4677 const int stride1 = a_str[rd];
- 4678 for (int ii = 0; ii < dims_a[rd]; ++ii)
- 4679 {
- 4680 const float * a_data_ri = a_data_i + ii * stride1;
- 4681 const float * a_data_ri_max = a_data_ri + dims_a[reduce_dim0] * stride0;
- 4682 while (a_data_ri != a_data_ri_max)
- 4683 {
- 4684 sum += a_data_ri[0];
- 4685 a_data_ri += stride0;
- 4686 }
- 4687 }
- 4688 };
- 4689 break;
- 4690 default:
- 4691 {
- 4692 for (unsigned int reduce_i = 0; reduce_i < n_reduce_elements; ++reduce_i)
- 4693 {
- 4694 //TODO: optimize this loop to work more like theano's Elemwise. It's serial code.
- 4695 unsigned int reduce_ii = reduce_i;
- 4696 const float * a_data_ri = a_data_i;
- 4697
- 4698 //This loop finds the element in the a slice to add.
- 4699 for (unsigned int rd = reduce_dim0; rd < nd; ++rd)
- 4700 {
- 4701 unsigned int pos_d;
- 4702 if (a_str[rd] && (!z_str[rd])) // this means 'd' is a dimension we are reducing over
- 4703 {
- 4704 if (log2_dims_a[rd]==-1)
- 4705 {
- 4706 // this branch is not preferred,
- 4707 // because the manual said that integer mod and div operations are slow on gpu
- 4708 pos_d = (reduce_ii % dims_a[rd]);
- 4709 reduce_ii = (reduce_ii / dims_a[rd]);
- 4710 }
- 4711 else
- 4712 {
- 4713 pos_d = (reduce_ii & ((1 << log2_dims_a[rd])-1)); //take the lower log2_dims bits
- 4714 reduce_ii = (reduce_ii >> log2_dims_a[rd]); //shift those lower log2_dims bits off of ii
- 4715 }
- 4716 a_data_ri += pos_d * a_str[rd];
- 4717 }
- 4718 }
- 4719 sum += a_data_ri[0];
- 4720 }
- 4721 }
- 4722 }
- 4723 z_data_i[0] = sum;
- 4724 }
- 4725 }
- 4726
- 4727 static __global__ void kernel_reduce_sum_1011(
- 4728 const unsigned int d0,
- 4729 const unsigned int d1,
- 4730 const unsigned int d2,
- 4731 const unsigned int d3,
- 4732 const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
- 4733 float * Z, const int sZ0)
- 4734 {
- 4735 const int threadCount = blockDim.x * blockDim.y * blockDim.z;
- 4736 const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
- 4737 extern __shared__ float buf[];
- 4738 float mysum = 0.0f;
- 4739
- 4740 if (warpSize != 32)
- 4741 {
- 4742 return; //TODO: set error code
- 4743 }
- 4744
- 4745 for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
- 4746 {
- 4747 float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3];
- 4748 mysum += Ai;
- 4749 }
- 4750 buf[threadNum] = mysum;
- 4751 __syncthreads();
- 4752
- 4753 // rest of function is handled by one warp
- 4754 if (threadNum < warpSize)
- 4755 {
- 4756 for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
- 4757 {
- 4758 mysum += buf[i];
- 4759 }
- 4760 buf[threadNum] = mysum;
- 4761 if (threadNum < 16)
- 4762 {
- 4763 //reduce so that threadNum 0 has the sum of everything
- 4764 if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
- 4765 if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
- 4766 if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
- 4767 if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
- 4768 if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
- 4769 if (threadNum == 0)
- 4770 {
- 4771 Z[blockIdx.x*sZ0] = buf[0];
- 4772 }
- 4773 }
- 4774 }
- 4775 }
- 4776 /**
- 4777 * Dimensions in which self has size 1 and A has size > 1 are considered summing dimensions
- 4778 * Dimensions in which self has size > 1 and A has size > 1 are considered non-summing dimensions, and in this case their sizes must be equal.
- 4779 */
- 4780 int
- 4781 CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
- 4782 {
- 4783 int verbose = 0;
- 4784 //check input rank
- 4785 if (self->nd != A->nd)
- 4786 {
- 4787 PyErr_Format(PyExc_TypeError, "Rank mismatch in CudaNdarray_sum: %i vs %i", self->nd, A->nd);
- 4788 return -1;
- 4789 }
- 4790 for (int i = 0; i < self->nd; ++i)
- 4791 {
- 4792 if ((CudaNdarray_HOST_DIMS(self)[i] > 1) && (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(A)[i]))
- 4793 {
- 4794 PyErr_Format(PyExc_TypeError, "Dimension mismatch in CudaNdarray_sum: self->dim[%i] == %i , A->dim[%i] = %i",
- 4795 i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(A)[i]);
- 4796 return -1;
- 4797 }
- 4798 }
- 4799
- 4800 int n_summations = (unsigned int)CudaNdarray_SIZE(self);
- 4801 if (verbose)
- 4802 {
- 4803 std::cerr << "reduce_sum n_summations " << n_summations << '\n';
- 4804 std::cerr << "reduce_sum nd " << self->nd << '\n';
- 4805 fprint_CudaNdarray(stderr, A);
- 4806 fprint_CudaNdarray(stderr, self);
- 4807 }
- 4808 if (0 && (A->nd == 4) //check to see if kernel_reduce_sum_1011 applies
- 4809 && (CudaNdarray_HOST_DIMS(self)[0] == 1)
- 4810 && (CudaNdarray_HOST_DIMS(self)[2] == 1)
- 4811 && (CudaNdarray_HOST_DIMS(self)[3] == 1)
- 4812 )
- 4813 {
- 4814 dim3 n_threads(CudaNdarray_HOST_DIMS(A)[3], CudaNdarray_HOST_DIMS(A)[2]);
- 4815 dim3 n_blocks(CudaNdarray_HOST_DIMS(A)[1]);
- 4816 while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
- 4817 n_threads.z -= 1;
- 4818 if (n_threads.z > 64) n_threads.z = 64;
- 4819 if (n_threads.z)
- 4820 {
- 4821 if (verbose) printf("trying kernel_reduce_sum_1011\n");
- 4822 int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
- 4823 kernel_reduce_sum_1011<<<n_blocks, n_threads, n_shared>>>(
- 4824 CudaNdarray_HOST_DIMS(A)[0],
- 4825 CudaNdarray_HOST_DIMS(A)[1],
- 4826 CudaNdarray_HOST_DIMS(A)[2],
- 4827 CudaNdarray_HOST_DIMS(A)[3],
- 4828 CudaNdarray_DEV_DATA(A),
- 4829 CudaNdarray_HOST_STRIDES(A)[0],
- 4830 CudaNdarray_HOST_STRIDES(A)[1],
- 4831 CudaNdarray_HOST_STRIDES(A)[2],
- 4832 CudaNdarray_HOST_STRIDES(A)[3],
- 4833 CudaNdarray_DEV_DATA(self),
- 4834 CudaNdarray_HOST_STRIDES(self)[1]);
- 4835 CNDA_THREAD_SYNC;
- 4836 if (cudaSuccess == cudaGetLastError()) return 0;
- 4837 if (verbose) printf("failed, falling back to kernel_reduce_sum\n");
- 4838 }
- 4839 }
- 4840
- 4841 int n_threads_per_block = std::min(n_summations,
- 4842 NUM_VECTOR_OP_THREADS_PER_BLOCK);
- 4843 int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block),
- 4844 NUM_VECTOR_OP_BLOCKS);
- 4845 int n_structure_cache = self->nd * 4 * sizeof(int);
- 4846
- 4847 if (verbose)
- 4848 {
- 4849 std::cerr << "n_blocks, n_threads_per_block " << n_blocks << ' ' << n_threads_per_block << '\n';
- 4850 }
- 4851 assert (self->nd > 0);
- 4852 assert (self->nd == A->nd);
- 4853 kernel_reduce_sum<<<n_blocks, n_threads_per_block, n_structure_cache>>>(
- 4854 n_summations,
- 4855 self->nd,
- 4856 CudaNdarray_DEV_DIMS(A),
- 4857 CudaNdarray_DEV_LOG2DIMS(A),
- 4858 CudaNdarray_DEV_STRIDES(A),
- 4859 CudaNdarray_DEV_DATA(A),
- 4860 CudaNdarray_DEV_STRIDES(self),
- 4861 CudaNdarray_DEV_DATA(self));
- 4862 CNDA_THREAD_SYNC;
- 4863 cudaError_t err = cudaGetLastError();
- 4864 if (cudaSuccess != err)
- 4865 {
- 4866 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err));
- 4867 return -1;
- 4868 }
- 4869 return 0;
- 4870 }
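- /*
-  * Illustrative sketch (assumption, not in the original source): summing a
-  * (4, 5) array over its second axis. Per the contract above, the output
-  * marks reduced axes with size 1, so `self` is allocated with dims {4, 1}:
-  *
-  *   int out_dims[2] = {4, 1};
-  *   CudaNdarray *out = (CudaNdarray*) CudaNdarray_New();
-  *   if (out && !CudaNdarray_alloc_contiguous(out, 2, out_dims))
-  *       CudaNdarray_reduce_sum(out, A);   // A has dims {4, 5}
-  */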
- 4871 int
- 4872 CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A)
- 4873 {
- 4874 PyErr_SetString(PyExc_NotImplementedError, "");
- 4875 return -1;
- 4876 }
- 4877 int
- 4878 CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A)
- 4879 {
- 4880 PyErr_SetString(PyExc_NotImplementedError, "");
- 4881 return -1;
- 4882 }
- 4883 int
- 4884 CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
- 4885 {
- 4886 PyErr_SetString(PyExc_NotImplementedError, "");
- 4887 return -1;
- 4888 }
- 4889
- 4890
- 4891 /**
- 4892 *
- 4893 * pattern is a permutation of [0, 1, ... self->nd-1] with the following twists:
- 4894 * - an element 'd' of the permutation can be dropped if CudaNdarray_HOST_DIMS(self)[d] == 1
- 4895 * - any number of '-1' elements can be in the pattern, and they will cause new ranks (with dim==1) to be inserted.
- 4896 *
- 4897 * For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become:
- 4898 * [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s.)
- 4899 */
- 4900 int
- 4901 CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern)
- 4902 {
- 4903 //TODO: pass a workspace pointer to avoid the internal malloc
- 4904 int * newdims = (int *)malloc(sizeof(int) * (len + len + self->nd)); //we tack on the taken buffer here for speed of not having to malloc twice.
- 4905 int * newstrides = newdims + len;
- 4906 int * dims_taken = newstrides + len;
- 4907 if (!newdims)
- 4908 {
- 4909 PyErr_SetString(PyExc_MemoryError, "CudaNdarray_dimshuffle: Failed to allocate temporary space");
- 4910 return -1;
- 4911 }
- 4912 for (int i = 0; i < self->nd; ++i)
- 4913 {
- 4914 dims_taken[i] = 0;
- 4915 }
- 4916 for (int i = 0; i < len; ++i)
- 4917 {
- 4918 if (pattern[i] < 0)
- 4919 {
- 4920 newdims[i] = 1;
- 4921 newstrides[i] = 0;
- 4922 }
- 4923 else if(dims_taken[pattern[i]])
- 4924 {
- 4925 PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern for CudaNdarray_dimshuffle. You used dimension %d multiple times",
- 4926 pattern[i]);
- 4927 free(newdims);
- 4928 return -1;
- 4929 }
- 4930 else if (pattern[i]>= self->nd)
- 4931 {
- 4932 PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern for CudaNdarray_dimshuffle. You asked for dimension %d, which does not exist in a %d-dimensional CudaNdarray",
- 4933 pattern[i], self->nd);
- 4934 free(newdims);
- 4935 return -1;
- 4936 }
- 4937 else
- 4938 {
- 4939 newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
- 4940 newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
- 4941 dims_taken[pattern[i]] = 1;
- 4942 }
- 4943 }
- 4944 //Check that we did not drop any non-broadcastable dims
- 4945 for (int i = 0; i < self->nd; ++i)
- 4946 {
- 4947 if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1)
- 4948 {
- 4949 PyErr_SetString(PyExc_ValueError, "Cudandarray_dimshuffle: You cannot drop a non-broadcastable dimension.");
- 4950 free(newdims);
- 4951 return -1;
- 4952 }
- 4953 }
- 4954 //swap this structure in for the one in self, and sync to the card
- 4955 if (CudaNdarray_set_nd(self, len))
- 4956 {
- 4957 free(newdims);
- 4958 return -1;
- 4959 }
- 4960 for (int i = 0; i < len; ++i)
- 4961 {
- 4962 CudaNdarray_set_dim(self, i, newdims[i]);
- 4963 CudaNdarray_set_stride(self, i, newstrides[i]);
- 4964 }
- 4965 if (cnda_copy_structure_to_device(self))
- 4966 {
- 4967 free(newdims);
- 4968 return -1;
- 4969 }
- 4970 free(newdims);
- 4971 return 0;
- 4972 }
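- /*
-  * Illustrative sketch (assumption, not in the original source): transposing
-  * a 2d CudaNdarray in place with the C-level dimshuffle above, then adding a
-  * broadcastable leading axis, in the spirit of the [0,3,-1,-1,1] example in
-  * the docstring:
-  *
-  *   int transpose[2] = {1, 0};        // swap the two axes
-  *   CudaNdarray_dimshuffle(x, 2, transpose);
-  *
-  *   int addaxis[3] = {-1, 0, 1};      // insert a new size-1 axis in front
-  *   CudaNdarray_dimshuffle(x, 3, addaxis);
-  */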
- 4973
- 4974
- 4975
- 4976 /**
- 4977 *
- 4978 * This is the function that binds to Python.
- 4979 * See CudaNdarray_dimshuffle to call from C.
- 4980 * We use -1 to mean 'x' as in Tensor Dimshuffle.
- 4981 */
- 4982 PyObject *
- 4983 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
- 4984 {
- 4985 PyObject * self = NULL;
- 4986 PyObject * pattern_object = NULL;
- 4987 int * pattern = NULL;
- 4988 PyObject * rval = NULL;
- 4989 int success = -1;
- 4990 //const int * dims = NULL;
- 4991
- 4992 //args should consist of two python objects ("OO")
- 4993 if (! PyArg_ParseTuple(args, "OO", &self, &pattern_object))
- 4994 return NULL;
- 4995
- 4996 if (!CudaNdarray_Check(self) )
- 4997 {
- 4998 PyErr_SetString(PyExc_TypeError, "First argument to cuda_ndarray.dimshuffle must be a CudaNdarray");
- 4999 return NULL;
- 5000 }
- 5001
- 5002 //parse pattern_object into int * pattern
- 5003
- 5004 Py_ssize_t pattern_dim = PyObject_Length(pattern_object);
- 5005
- 5006 if (pattern_dim < 0)
- 5007 {
- 5008 PyErr_SetString(PyExc_TypeError, "Couldn't get length of second argument to cuda_ndarray.dimshuffle");
- 5009 return NULL;
- 5010 }
- 5011
- 5012 pattern = (int *) malloc( pattern_dim * sizeof(int));
- if (pattern == NULL) { PyErr_NoMemory(); return NULL; }
- 5013
- 5014 for (Py_ssize_t i = 0; i < pattern_dim; i++)
- 5015 {
- 5016 PyObject * idx = PyLong_FromLong(i);
- 5017
- 5018 if (idx == NULL)
- 5019 {
- 5020 PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple");
- 5021 goto CudaNdarray_dimshuffle_fail;
- 5022 }
- 5023
- 5024 long elem_value = 0;
- 5025
- 5026 PyObject * elem = PyObject_GetItem(pattern_object, idx);
- 5027
- 5028 if (elem == NULL)
- 5029 {
- 5030 Py_XDECREF( elem);
- 5031 PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be list or tuple of integers");
- 5032 goto CudaNdarray_dimshuffle_fail;
- 5033 }
- 5034
- 5035 elem_value = PyInt_AsLong(elem);
- 5036
- 5037 if (elem_value == -1 && PyErr_Occurred() )
- 5038 {
- 5039 Py_XDECREF(elem);
- 5040 PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be list or tuple of integers");
- 5041 goto CudaNdarray_dimshuffle_fail;
- 5042 }
- 5043
- 5044 pattern[i] = elem_value;
- 5045
- 5046 Py_XDECREF( elem );
- 5047 Py_XDECREF( idx );
- 5048 }
- 5049
- 5050 //allocate rval
- 5051 rval = (PyObject *) CudaNdarray_View((CudaNdarray *) self);
- 5052
- 5053 if (rval == NULL)
- 5054 {
- 5055 //CudaNdarray_View should have set the exception string
- 5056 goto CudaNdarray_dimshuffle_fail;
- 5057 }
- 5058
- 5059
- 5060 //printf("pattern_dim: %d\n",pattern_dim);
- 5061 //printf("pattern: %d %d\n",pattern[0],pattern[1]);
- 5062 //dims = CudaNdarray_HOST_DIMS( (CudaNdarray *) self);
- 5063 //printf("dims before: %d %d\n",dims[0],dims[1]);
- 5064
- 5065 success = CudaNdarray_dimshuffle((CudaNdarray *) rval, pattern_dim, pattern);
- 5066
- 5067 if (success != 0)
- 5068 {
- 5069 //Exception string should already be set by CudaNdarray_dimshuffle
- 5070 goto CudaNdarray_dimshuffle_fail;
- 5071 }
- 5072
- 5073 free(pattern);
- 5074
- 5075 return rval;
- 5076
- 5077 CudaNdarray_dimshuffle_fail:
- 5078
- 5079 if (pattern != NULL)
- 5080 free(pattern);
- 5081
- 5082 Py_XDECREF(rval);
- 5083 return NULL;
- 5084 }
- 5085
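// --------------------------------------------------------------------------
// Illustrative sketch (not from the original file): driving the C-level
// CudaNdarray_dimshuffle directly, assuming a hypothetical 1-d input `arr`.
// It mirrors the Python wrapper above: take a view, then reorder it with -1
// standing for a broadcastable 'x' axis.
static PyObject *
example_dimshuffle_x0(CudaNdarray * arr)
{
    PyObject * rval = (PyObject *) CudaNdarray_View(arr);
    if (rval == NULL)
        return NULL;                       // exception already set by the view
    int pattern[2] = {-1, 0};              // ('x', 0): prepend a broadcastable axis
    if (CudaNdarray_dimshuffle((CudaNdarray *) rval, 2, pattern) != 0)
    {
        Py_DECREF(rval);                   // exception already set on failure
        return NULL;
    }
    return rval;
}
// --------------------------------------------------------------------------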
- 5086
- 5087 int
- 5088 cnda_structure_size(int nd)
- 5089 {
- 5090 // dim0, dim1, ...
- 5091 // str0, str1, ...
- 5092 // log2(dim0), log2(dim1), ...
- 5093 return nd + nd + nd;
- 5094 }
- 5095
- 5096 const int *
- 5097 CudaNdarray_HOST_DIMS(const CudaNdarray * self)
- 5098 {
- 5099 return self->host_structure;
- 5100 }
- 5101
- 5102 const int *
- 5103 CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
- 5104 {
- 5105 return self->host_structure + self->nd;
- 5106 }
- 5107 const int *
- 5108 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
- 5109 {
- 5110 return self->host_structure + 2*self->nd;
- 5111 }
- 5112
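// Illustrative sketch (not from the original file): host_structure is one flat
// buffer of 3*nd ints -- dims in [0, nd), strides in [nd, 2*nd), and log2 dims
// in [2*nd, 3*nd) -- which is what cnda_structure_size() and the three
// accessors above encode.  This hypothetical helper reads the i-th stride
// directly from that buffer and is equivalent to
// CudaNdarray_HOST_STRIDES(self)[i].
static inline int
example_host_stride(const CudaNdarray * self, int i)
{
    return self->host_structure[self->nd + i];
}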
- 5113 int
- 5114 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
- 5115 {
- 5116 int verbose = 0;
- 5117
- 5118 if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
- 5119 {
- 5120 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
- 5121 return 0;
- 5122 }
- 5123
- 5124 if (cnda1->nd != cnda2->nd)
- 5125 {
- 5126 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
- 5127 return 0;
- 5128 }
- 5129
- 5130 for (int i=0; i < 2*cnda1->nd; i++)
- 5131 {
- 5132 if (cnda1->host_structure[i] != cnda2->host_structure[i])
- 5133 {
- 5134 if(verbose)
- 5135 fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
- 5136 return 0;
- 5137 }
- 5138 }
- 5139
- 5140 if (!ignoreBase && cnda1->base != cnda2->base)
- 5141 {
- 5142 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4\n");
- 5143 return 0;
- 5144 }
- 5145 else if (cnda1->data_allocated != cnda2->data_allocated)
- 5146 {
- 5147 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5\n");
- 5148 return 0;
- 5149 }
- 5150 else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
- 5151 {
- 5152 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6\n");
- 5153 // no need to check devdata if data is not allocated
- 5154 return 0;
- 5155 }
- 5156
- 5157 return 1;
- 5158 }
- 5159
- 5160
- 5161 int
- 5162 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
- 5163 {
- 5164 return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
- 5165 }
- 5166
- 5167 int
- 5168 cnda_copy_structure_to_device(const CudaNdarray * self)
- 5169 {
- 5170 //If the device structure does not exist, create it.
- 5171 //We allocate it here because we do not need it often.
- 5172 //In fact, we need it so infrequently that we expect
- 5173 //most objects won't need it. Not allocating it up front
- 5174 //saves a significant amount of time when creating an object.
- 5175 //This sped up a benchmark by 8% with the gc.
- 5176 if (!self->dev_structure)
- 5177 {
- 5178 int struct_size = cnda_structure_size(self->nd);
- 5179 if (struct_size)
- 5180 {
- 5181 self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
- 5182 if (NULL == self->dev_structure)
- 5183 {
- 5184 return -1;
- 5185 }
- 5186 }
- 5187 }
- 5188 if (cublasSetVector(cnda_structure_size(self->nd),
- 5189 sizeof(int),
- 5190 self->host_structure,
- 5191 1,
- 5192 self->dev_structure,
- 5193 1) != CUBLAS_STATUS_SUCCESS)
- 5194 {
- 5195 PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
- 5196 return -1;
- 5197 }
- 5198 self->dev_structure_fresh = 1;
- 5199 return 0;
- 5200 }
- 5201
- 5202 const int *
- 5203 CudaNdarray_DEV_DIMS(const CudaNdarray * self)
- 5204 {
- 5205 if (!self->dev_structure_fresh)
- 5206 {
- 5207 if (cnda_copy_structure_to_device(self))
- 5208 return NULL;
- 5209 }
- 5210 return self->dev_structure;
- 5211 }
- 5212 const int *
- 5213 CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
- 5214 {
- 5215 if (!self->dev_structure_fresh)
- 5216 {
- 5217 if (cnda_copy_structure_to_device(self))
- 5218 return NULL;
- 5219 }
- 5220 return self->dev_structure + self->nd;
- 5221 }
- 5222 const int *
- 5223 CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
- 5224 {
- 5225 if (!self->dev_structure_fresh)
- 5226 {
- 5227 if (cnda_copy_structure_to_device(self))
- 5228 return NULL;
- 5229 }
- 5230 return self->dev_structure + 2*self->nd;
- 5231 }
- 5232 float *
- 5233 CudaNdarray_DEV_DATA(const CudaNdarray * self)
- 5234 {
- 5235 return self->devdata;
- 5236 }
- 5237
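// Illustrative sketch (not from the original file): the DEV_* accessors above
// lazily push the host-side structure to the GPU and hand back device
// pointers.  A typical consumer would look like the following, where
// `some_kernel`, `arr`, `n_blocks` and `n_threads` are hypothetical names:
//
//     const int * dev_dims    = CudaNdarray_DEV_DIMS(arr);     // NULL on error
//     const int * dev_strides = CudaNdarray_DEV_STRIDES(arr);  // NULL on error
//     if (!dev_dims || !dev_strides)
//         return -1;            // exception set by cnda_copy_structure_to_device
//     some_kernel<<<n_blocks, n_threads>>>(arr->nd, dev_dims, dev_strides,
//                                          CudaNdarray_DEV_DATA(arr));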
- 5238 /**
- 5239 * Return the number of elements in the ndarray (product of the dimensions)
- 5240 */
- 5241 size_t
- 5242 CudaNdarray_SIZE(const CudaNdarray *self)
- 5243 {
- 5244 if (self->nd == -1) return 0;
- 5245 size_t size = 1;
- 5246 for (int i = 0; i < self->nd; ++i)
- 5247 {
- 5248 size *= CudaNdarray_HOST_DIMS(self)[i];
- 5249 }
- 5250 return size;
- 5251 }
- 5252
- 5253 PyObject *
- 5254 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
- 5255 {
- 5256 return PyInt_FromLong(CudaNdarray_SIZE(self));
- 5257 }
- 5258
- 5259 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
- 5260 {
- 5261 return CudaNdarray_set_device_data(self, data, (PyObject *) base);
- 5262 }
- 5263
- 5264 PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
- 5265 {
- 5266 return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
- 5267 }
- 5268
- 5269 int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
- 5270 {
- 5271 cudaError_t err = cudaGetLastError();
- 5272 if( cudaSuccess != err)
- 5273 {
- 5274 PyErr_Format(PyExc_RuntimeError,
- 5275 "Cuda error: %s: %s.",
- 5276 "fprint_CudaNdarray was called with an uncleared error",
- 5277 cudaGetErrorString(err));
- 5278 return -1;
- 5279 }
- 5280 fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
- 5281 self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
- 5282 fprintf(fd, "\tHOST_DIMS: ");
- 5283 for (int i = 0; i < self->nd; ++i)
- 5284 {
- 5285 fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
- 5286 }
- 5287 fprintf(fd, "\n\tHOST_STRIDES: ");
- 5288 for (int i = 0; i < self->nd; ++i)
- 5289 {
- 5290 fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
- 5291 }
- 5292
- 5293 if (self->dev_structure)
- 5294 {
- 5295 int data=0;
- 5296 fprintf(fd, "\n\tDEV_DIMS: ");
- 5297 for (int i = 0; i < self->nd; ++i)
- 5298 {
- 5299 cublasGetVector(1, sizeof(int),
- 5300 self->dev_structure+i, 1,
- 5301 &data, 1);
- 5302 fprintf(fd, "%i\t", data);
- 5303 }
- 5304 fprintf(fd, "\n\tDEV_STRIDES: ");
- 5305 for (int i = 0; i < self->nd; ++i)
- 5306 {
- 5307 cublasGetVector(1, sizeof(int),
- 5308 self->dev_structure + self->nd+i, 1,
- 5309 &data, 1);
- 5310 fprintf(fd, "%i \t", data);
- 5311 }
- 5312 fprintf(fd, "\n");
- 5313 }
- 5314 else
- 5315 {
- 5316 fprintf(fd, "\n\tdev_structure not allocated\n");
- 5317 }
- 5318
- 5319 err = cudaGetLastError();
- 5320 if( cudaSuccess != err)
- 5321 {
- 5322 PyErr_Format(PyExc_RuntimeError,
- 5323 "Cuda error: %s: %s.",
- 5324 "fprint_CudaNdarray",
- 5325 cudaGetErrorString(err));
- 5326 return -1;
- 5327 }
- 5328 return 0;
- 5329 }
- 5330
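// Illustrative sketch (not from the original file): fprint_CudaNdarray is a
// debugging dump.  A typical call, with a hypothetical array `arr`, is:
//
//     if (fprint_CudaNdarray(stderr, arr) != 0)
//         return NULL;   // a CUDA error was pending; Python exception already set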
- 5331
- 5332 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
- 5333 const int * dims, int fortran)
- 5334 {
- 5335 bool allocated = false;
- 5336 if (*arr == NULL)
- 5337 {
- 5338 // This allocates the metadata but not the data
- 5339 *arr = (CudaNdarray *) CudaNdarray_new_nd(nd);
- 5340 if (*arr == NULL)
- 5341 return -1;
- 5342 allocated = true;
- 5343 }
- 5344
- 5345 if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
- 5346 {
- 5347 if (allocated)
- 5348 {
- 5349 Py_DECREF(*arr);
- 5350 *arr = NULL;
- 5351 }
- 5352 return -1;
- 5353 }
- 5354 return 0;
- 5355 }
- 5356
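// Illustrative sketch (not from the original file): the usual
// CudaNdarray_prep_output pattern.  Passing *arr == NULL allocates a fresh
// C-contiguous output; the dimension names below are hypothetical.
static CudaNdarray *
example_make_output(int n_rows, int n_cols)
{
    CudaNdarray * out = NULL;
    int dims[2] = {n_rows, n_cols};
    if (CudaNdarray_prep_output(&out, 2, dims, 0))   // fortran == 0 => C order
        return NULL;                                 // exception already set
    // ... kernels would write to CudaNdarray_DEV_DATA(out) here ...
    return out;
}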
- 5357
- 5358 /*
- 5359 Local Variables:
- 5360 mode:c++
- 5361 c-basic-offset:4
- 5362 c-file-style:"stroustrup"
- 5363 indent-tabs-mode:nil
- 5364 fill-column:79
- 5365 End:
- 5366 */
- 5367 // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
- 5368
- ===============================
- mod.cu(941): warning: pointless comparison of unsigned integer with zero
- mod.cu(3001): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3004): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3006): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3009): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3011): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3014): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3017): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3020): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3022): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3025): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3027): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3030): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3032): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3035): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3038): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3041): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3043): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3046): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3048): warning: conversion from a string literal to "char *" is deprecated
- mod.cu(3051): warning: conversion from a string literal to "char *" is deprecated
- /usr/include/string.h: In function ‘void* __mempcpy_inline(void*, const void*, size_t)’:
- /usr/include/string.h:652:42: error: ‘memcpy’ was not declared in this scope
- return (char *) memcpy (__dest, __src, __n) + __n;
- ^
- mod.cu: In function ‘PyObject* CudaNdarray_Reshape(CudaNdarray*, PyObject*)’:
- mod.cu:955:122: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 3 has type ‘size_t {aka long unsigned int}’ [-Wformat=]
- PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
- ^
- mod.cu:955:122: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 4 has type ‘size_t {aka long unsigned int}’ [-Wformat=]
- ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: ('nvcc return status', 1, 'for cmd', 'nvcc -shared -O3 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/moose/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.11+-64/cuda_ndarray -I/home/moose/.local/lib/python2.7/site-packages/theano/sandbox/cuda -I/usr/lib/python2.7/dist-packages/numpy/core/include -I/usr/include/python2.7 -I/home/moose/.local/lib/python2.7/site-packages/theano/gof -o /home/moose/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.11+-64/cuda_ndarray/cuda_ndarray.so mod.cu -L/usr/lib -lcublas -lpython2.7 -lcudart')
- WARNING (theano.sandbox.cuda): CUDA is installed, but device gpu is not available (error: cuda unavailable)
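
Note on the failure above (a suggestion based on the error text only, not part of the original paste): the fatal "'memcpy' was not declared in this scope" error from /usr/include/string.h is the known glibc 2.23 (Ubuntu 16.04) vs. CUDA 7.5 incompatibility, not a bug in cuda_ndarray.cu itself. A commonly reported workaround is to pass -D_FORCE_INLINES to nvcc, for example via ~/.theanorc; the exact placement may differ per setup:

[nvcc]
flags=-D_FORCE_INLINES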