  1. 1 #define _CUDA_NDARRAY_C
  2. 2
  3. 3 #include <Python.h>
  4. 4 #include <structmember.h>
  5. 5 #include "theano_mod_helper.h"
  6. 6
  7. 7 #include <numpy/arrayobject.h>
  8. 8 #include <iostream>
  9. 9
  10. 10 #include "cuda_ndarray.cuh"
  11. 11
  12. 12 #ifndef CNMEM_DLLEXPORT
  13. 13 #define CNMEM_DLLEXPORT
  14. 14 #endif
  15. 15
  16. 16 #include "cnmem.h"
  17. 17 #include "cnmem.cpp"
  18. 18
// If true, when a GPU malloc or free error occurs, we print the amount of
// memory currently allocated on the device.
#define COMPUTE_GPU_MEM_USED 0

// If true, we fill newly allocated device memory with NaN.
#define ALLOC_MEMSET 0

// If true, we print a message whenever we free a device pointer,
// uninitialize a CudaNdarray, or allocate a device pointer.
#define PRINT_FREE_MALLOC 0
  28. 28
  29. 29 //If true, we do error checking at the start of functions, to make sure there
  30. 30 //is not a pre-existing error when the function is called.
  31. 31 //You probably need to set the environment variable
  32. 32 //CUDA_LAUNCH_BLOCKING=1, and/or modify the CNDA_THREAD_SYNC
  33. 33 //preprocessor macro in cuda_ndarray.cuh
  34. 34 //if you want this to work.
  35. 35 #define PRECHECK_ERROR 0
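
// A minimal sketch of how these debug switches are typically combined when
// hunting a memory bug. The values below are illustrative only; they are not
// the defaults used in this file.
#if 0
#define COMPUTE_GPU_MEM_USED 1   // track per-pointer allocation sizes
#define ALLOC_MEMSET 1           // poison fresh allocations with 0xFF bytes (NaN)
#define PRINT_FREE_MALLOC 1      // log every device malloc/free
#define PRECHECK_ERROR 1         // most useful with CUDA_LAUNCH_BLOCKING=1
#endif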
  36. 36
  37. 37 cublasHandle_t handle = NULL;
  38. 38 int* err_var = NULL;
  39. 39
  40. 40 /////////////////////////
  41. 41 // Alloc and Free
  42. 42 /////////////////////////
  43. 43
  44. 44 static int g_gpu_context_active = 0;
  45. 45
  46. 46
  47. 47 PyObject *
  48. 48 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args);
  49. 49 static PyObject *CudaNdarray_get_shape(CudaNdarray *self, void *closure);
  50. 50
  51. 51
  52. 52 /**
  53. 53 *
  54. 54 * In the test program I'm using, the _outstanding_mallocs decreases with every call.
  55. 55 * This suggests there are more free() calls being made than alloc(), but I can't figure out why.
  56. 56 *
  57. 57 */
  58. 58 int _outstanding_mallocs[] = {0,0};
  59. 59
  60. 60 #if COMPUTE_GPU_MEM_USED
  61. 61 size_t _allocated_size = 0;
  62. 62 size_t _max_allocated_size = 0;
  63. 63
  64. 64 const int TABLE_SIZE = 10000;
  65. 65 struct table_struct{
  66. 66 void* ptr;
  67. 67 size_t size;
  68. 68 };
  69. 69 table_struct _alloc_size_table[TABLE_SIZE];
  70. 70 #endif
  71. 71
  72. 72 void * device_malloc(size_t size)
  73. 73 {
  74. 74 return device_malloc(size, VERBOSE_DEVICE_MALLOC);
  75. 75 }
  76. 76
  77. 77 ///@TODO: thejaswi: link this option to a theano config variable?
  78. 78 static bool g_use_cnmem = false;
  79. 79 static const int g_max_devices = 8;
  80. 80 int initCnmem(int card_number_provided, int card_nb, size_t mem) {
  81. 81 static bool cnmemInitialized = false;
  82. 82 if(cnmemInitialized) {
  83. 83 return 0;
  84. 84 }
  85. 85 // On stderr to be at the same place as "Using gpu device..."
  86. 86 int numDevices = 0;
  87. 87 cnmemDevice_t devices[g_max_devices];
  88. 88 if(cudaGetDeviceCount(&numDevices) != cudaSuccess) {
  89. 89 PyErr_Format(PyExc_RuntimeError,
  90. 90 "initCnmem: 'cudaGetDeviceCount' failed! Reason=%s\n",
  91. 91 cudaGetErrorString(cudaGetLastError()));
  92. 92 return -1;
  93. 93 }
  94. 94 if(card_number_provided){
  95. 95 numDevices = 1;
  96. 96 int i = 0;
  97. 97 devices[i].device = card_nb;
  98. 98 devices[i].size = mem;
  99. 99 ///@TODO: thejaswi: add support for multiple streams
  100. 100 devices[i].numStreams = 0;
  101. 101 devices[i].streams = NULL;
  102. 102 devices[i].streamSizes = NULL;
  103. 103 }else{
  104. 104 for(int i=0;i<numDevices;++i) {
  105. 105 devices[i].device = i;
  106. 106 devices[i].size = mem;
  107. 107 ///@TODO: thejaswi: add support for multiple streams
  108. 108 devices[i].numStreams = 0;
  109. 109 devices[i].streams = NULL;
  110. 110 }
  111. 111 }
  112. 112
  113. 113 ///@TODO: thejaswi: passing custom cnmem flags?
  114. 114 cnmemStatus_t status = cnmemInit(numDevices, devices, CNMEM_FLAGS_DEFAULT);
  115. 115 if(status != CNMEM_STATUS_SUCCESS) {
  116. 116 PyErr_Format(PyExc_RuntimeError,
  117. 117 "initCnmem: cnmemInit call failed! Reason=%s. numdev=%d\n",
  118. 118 cnmemGetErrorString(status), numDevices);
  119. 119 return -1;
  120. 120 }
  121. 121 cnmemInitialized = true;
  122. 122 return 0;
  123. 123 }
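
// A minimal sketch of how initCnmem() might be driven, assuming the caller has
// already chosen a device. example_enable_cnmem and the 512 MB pool size are
// hypothetical illustrations, not something this file prescribes.
#if 0
static int example_enable_cnmem(int device)
{
    g_use_cnmem = true;
    // card_number_provided != 0 means "use exactly this device".
    size_t pool_bytes = (size_t)512 * 1024 * 1024;  // arbitrary example pool size
    if (initCnmem(1, device, pool_bytes) != 0)
        return -1;  // initCnmem has already set a Python RuntimeError
    return 0;
}
#endif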
  124. 124
  125. 125 void * device_malloc(size_t size, int verbose)
  126. 126 {
  127. 127 #if PRECHECK_ERROR
  128. 128 cudaThreadSynchronize();
  129. 129 cudaError_t prevError = cudaGetLastError();
  130. 130 if (cudaSuccess != prevError)
  131. 131 {
  132. 132 fprintf(stderr,
  133. 133 "Error existed before calling device_malloc. %s\n",
  134. 134 cudaGetErrorString(prevError)
  135. 135 );
  136. 136 }
  137. 137 #endif
  138. 138 void * rval=NULL;
  139. 139 ///@TODO: thejaswi: support for multiple-streams?
  140. 140 if(g_use_cnmem) {
  141. 141 cnmemStatus_t status = CNMEM_STATUS_SUCCESS;
  142. 142 status = cnmemMalloc(&rval, size, NULL);
  143. 143 if(status != CNMEM_STATUS_SUCCESS) {
  144. 144 PyErr_Format(PyExc_MemoryError,
  145. 145 "Error allocating %llu bytes of device memory (%s).",
  146. 146 (unsigned long long)size, cnmemGetErrorString(status));
  147. 147 return NULL;
  148. 148 }
  149. 149 }
  150. 150 else {
  151. 151 cudaError_t err = cudaMalloc(&rval, size);
  152. 152 if (cudaSuccess != err)
  153. 153 {
  154. 154 // Clear the error flag, cudaMalloc doesn't do it.
  155. 155 // Currently this returns the same thing as err, but if in future
  156. 156 // it returns something else I still don't see why we should ignore
  157. 157 // it. All we want to do here is reset the flag.
  158. 158 cudaGetLastError();
  159. 159 if (verbose)
  160. 160 {
  161. 161 size_t free = 0, total = 0;
  162. 162 cudaError_t err2 = cudaMemGetInfo(&free, &total);
  163. 163 if (err2 != cudaSuccess){
  164. 164 cudaGetLastError();
  165. 165 fprintf(stderr,
  166. 166 "Error when trying to find the memory information"
  167. 167 " on the GPU: %s\n", cudaGetErrorString(err2));
  168. 168 }
  169. 169 #if COMPUTE_GPU_MEM_USED
  170. 170 fprintf(stderr,
  171. 171 "Error allocating %llu bytes of device memory (%s)."
  172. 172 " new total bytes allocated: %llu."
  173. 173 " Driver report %llu bytes free and %llu bytes total \n",
  174. 174 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)_allocated_size,
  175. 175 (unsigned long long)free, (unsigned long long)total);
  176. 176 #else
  177. 177 fprintf(stderr,
  178. 178 "Error allocating %llu bytes of device memory (%s)."
  179. 179 " Driver report %llu bytes free and %llu bytes total \n",
  180. 180 (unsigned long long)size, cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
  181. 181 #endif
  182. 182 }
  183. 183 PyErr_Format(PyExc_MemoryError,
  184. 184 "Error allocating %llu bytes of device memory (%s).",
  185. 185 (unsigned long long)size, cudaGetErrorString(err));
  186. 186 return NULL;
  187. 187 }
  188. 188 }
  189. 189 if (rval != NULL){
// Can it happen that cudaMalloc returns cudaSuccess, but returns a NULL ptr?
// Could this be what happens if size is 0?
  192. 192 _outstanding_mallocs[0] += 1;
  193. 193
  194. 194 #if COMPUTE_GPU_MEM_USED
  195. 195 _allocated_size += size;
  196. 196 _max_allocated_size = std::max(_max_allocated_size, _allocated_size);
  197. 197 int i = 0;
  198. 198 for(;i<TABLE_SIZE;i++){
  199. 199 if(NULL==_alloc_size_table[i].ptr){
  200. 200 _alloc_size_table[i].ptr=rval;
  201. 201 _alloc_size_table[i].size=size;
  202. 202 break;
  203. 203 }
  204. 204 }
  205. 205 if (i == TABLE_SIZE){
  206. 206 fprintf(stderr,
  207. 207 "When tracking GPU malloc, our table size wasn't big enough."
  208. 208 " So we loose some tracking. Raise the value of TABLE_SIZE in the file cuda_ndarra.cu");
  209. 209 }
  210. 210 #endif
  211. 211 }
  212. 212 //fprintf(stderr,
  213. 213 //"allocated %li bytes of device memory (%s). new total bytes allocated: %d. ptr: %p\n",
  214. 214 //(long)size, cudaGetErrorString(err),_allocated_size,rval);
  215. 215
  216. 216 if(ALLOC_MEMSET){
// We initialize the memory to NaN (0xFF bytes) to make it easier to catch bugs during debugging.
  218. 218 cudaMemset(rval, 0xFF, size);
  219. 219 //printf("MEMSET\n");
  220. 220 }
  221. 221 #if PRINT_FREE_MALLOC
fprintf(stderr, "device malloc %p of size %llu\n", rval, (unsigned long long)size);
  223. 223 #endif
  224. 224 return rval;
  225. 225 }
  226. 226
  227. 227 int device_free(void *ptr)
  228. 228 {
  229. 229 #if PRECHECK_ERROR
  230. 230 cudaThreadSynchronize();
  231. 231 cudaError_t prevError = cudaGetLastError();
  232. 232 if (cudaSuccess != prevError)
  233. 233 {
  234. 234 fprintf(stderr,
  235. 235 "Error existed before calling device_free. %s\n",
  236. 236 cudaGetErrorString(prevError)
  237. 237 );
  238. 238 }
  239. 239 #endif
  240. 240 #if PRINT_FREE_MALLOC
  241. 241 size_t free = 0, total = 0;
  242. 242 cudaError_t err2 = cudaMemGetInfo(&free, &total);
  243. 243 if (err2 != cudaSuccess){
  244. 244 cudaGetLastError();
  245. 245 fprintf(stderr,
  246. 246 "Error when tring to find the memory information"
  247. 247 " on the GPU: %s\n", cudaGetErrorString(err2));
  248. 248 }
  249. 249 #if COMPUTE_GPU_MEM_USED
  250. 250 {
  251. 251 int i = 0;
  252. 252 for(;i<TABLE_SIZE;i++)
  253. 253 if(_alloc_size_table[i].ptr==ptr){
  254. 254 break;
  255. 255 }
  256. 256 assert(i<TABLE_SIZE);
  257. 257 fprintf(stderr, "device_free %p of size %d."
  258. 258 " Driver report %d bytes free and %d bytes total \n",
  259. 259 ptr, _alloc_size_table[i].size, free, total);
  260. 260 }
  261. 261 #else
  262. 262 fprintf(stderr, "device_free %p."
  263. 263 " Driver report %d bytes free and %d bytes total \n",
  264. 264 ptr, free, total);
  265. 265 #endif
  266. 266 #endif
  267. 267
  268. 268 // if there is no gpu context, the call to cudaFree will fail; skip it entirely
  269. 269 if(!g_gpu_context_active) {
  270. 270 return 0;
  271. 271 }
  272. 272
  273. 273 ///@TODO: thejaswi: multi-stream support
  274. 274 if(g_use_cnmem) {
  275. 275 cnmemStatus_t status = cnmemFree(ptr, NULL);
  276. 276 if(status != CNMEM_STATUS_SUCCESS) {
  277. 277 fprintf(stderr, "device_free: cnmemFree call failed! Reason=%s\n",
  278. 278 cnmemGetErrorString(status));
  279. 279 }
  280. 280 }
  281. 281 else {
// We need to synchronize because Theano's GC could free intermediate variables
// that are still needed while GPU kernels are running or queued.
  284. 284 CNDA_BEGIN_ALLOW_THREADS
  285. 285 cudaThreadSynchronize();
  286. 286 CNDA_END_ALLOW_THREADS
  287. 287
  288. 288 cudaError_t err = cudaFree(ptr);
  289. 289 if (cudaSuccess != err)
  290. 290 {
  291. 291 // Clear the error flag, cudaFree doesn't do it.
  292. 292 // Currently this returns the same thing as err, but if in future
  293. 293 // it returns something else I still don't see why we should ignore
  294. 294 // it. All we want to do here is reset the flag.
  295. 295 cudaGetLastError();
  296. 296 size_t free = 0, total = 0;
  297. 297 cudaError_t err2 = cudaMemGetInfo(&free, &total);
  298. 298 if (err2 != cudaSuccess){
  299. 299 cudaGetLastError();
  300. 300 fprintf(stderr,
  301. 301 "Error when tring to find the memory information"
  302. 302 " on the GPU: %s\n", cudaGetErrorString(err2));
  303. 303 }
  304. 304 #if COMPUTE_GPU_MEM_USED
  305. 305 {
  306. 306 int i = 0;
  307. 307 for(;i<TABLE_SIZE;i++)
  308. 308 if(_alloc_size_table[i].ptr==ptr){
  309. 309 break;
  310. 310 }
  311. 311 assert(i<TABLE_SIZE);
  312. 312 fprintf(stderr,
  313. 313 "Error freeing device pointer %p (%s) of size %llu. %llu byte already allocated."
  314. 314 " Driver report %llu bytes free and %llu bytes total \n",
  315. 315 ptr, cudaGetErrorString(err),
  316. 316 (unsigned long long)_alloc_size_table[i].size, (unsigned long long)_allocated_size, (unsigned long long)free, (unsigned long long)total);
  317. 317 }
  318. 318 #else
  319. 319 fprintf(stderr,
  320. 320 "Error freeing device pointer %p (%s)."
  321. 321 " Driver report %llu bytes free and %llu bytes total \n",
  322. 322 ptr,
  323. 323 cudaGetErrorString(err), (unsigned long long)free, (unsigned long long)total);
  324. 324 #endif
  325. 325 if (NULL != PyErr_Occurred()){
  326. 326 fprintf(stderr,
  327. 327 "device_free: cudaFree() returned an error, but there is already an"
  328. 328 " Python error set. This happen during the clean up when there is a"
  329. 329 " first error and the CUDA driver is in a so bad state that it don't"
  330. 330 " work anymore. We keep the previous error set to help debugging it.");
  331. 331 return -1;
  332. 332 }
  333. 333 PyErr_Format(PyExc_MemoryError,
  334. 334 "error freeing device pointer %p (%s)",
  335. 335 ptr,
  336. 336 cudaGetErrorString(err));
  337. 337 return -1;
  338. 338 }
  339. 339 }
  340. 340 _outstanding_mallocs[0] -= (ptr != NULL);
  341. 341 #if COMPUTE_GPU_MEM_USED
  342. 342 int i=0;
  343. 343 size_t total_freed = 0;
  344. 344 for(;i<TABLE_SIZE;i++)
  345. 345 if(_alloc_size_table[i].ptr==ptr){
  346. 346 _allocated_size -= _alloc_size_table[i].size;
  347. 347 total_freed += _alloc_size_table[i].size;
  348. 348 _alloc_size_table[i].ptr=0;
  349. 349 _alloc_size_table[i].size=0;
  350. 350
  351. 351 break;
  352. 352 }
  353. 353 //if(i==TABLE_SIZE)
  354. 354 // printf("Unallocated unknow size!\n");
  355. 355 //fprintf(stderr, "freed %li bytes of device memory (%s). %d already allocated, ptr=%p\n", (long)total_freed, cudaGetErrorString(err),_allocated_size,ptr);
  356. 356 #endif
  357. 357 return 0;
  358. 358 }
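
// A minimal sketch of the device_malloc()/device_free() contract used in the
// rest of this file. example_scratch_buffer and its buffer size are
// hypothetical; the point is that every successful malloc is balanced by
// exactly one free.
#if 0
static int example_scratch_buffer()
{
    size_t nbytes = 1024 * sizeof(float);   // arbitrary example size
    void *buf = device_malloc(nbytes);      // sets a Python MemoryError on failure
    if (buf == NULL)
        return -1;
    // ... launch kernels that read/write buf ...
    if (device_free(buf) != 0)              // non-zero on failure, Python error may be set
        return -1;
    return 0;
}
#endif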
  359. 359
  360. 360 static PyObject *
  361. 361 outstanding_mallocs(PyObject* self, PyObject * args)
  362. 362 {
  363. 363 return PyInt_FromLong(_outstanding_mallocs[0]);
  364. 364 }
  365. 365
  366. 366
  367. 367 static void *work_mem = NULL;
  368. 368 static size_t work_size = 0;
  369. 369
  370. 370 /*
  371. 371 * Returns a chunk of memory for temporary work inside of an op. You can only
  372. 372 * request a single chunk of memory at a time since it is reused.
  373. 373 */
  374. 374 void *get_work_mem(size_t sz) {
  375. 375 if (sz <= work_size)
  376. 376 return work_mem;
  377. 377 device_free(work_mem);
  378. 378 work_mem = device_malloc(sz);
  379. 379 work_size = sz;
  380. 380 if (work_mem == NULL)
  381. 381 work_size = 0;
  382. 382 return work_mem;
  383. 383 }
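
// A minimal sketch of how an op might use get_work_mem(). The buffer is shared
// and reused, so the caller must not device_free() it and must not keep it
// across another get_work_mem() call. example_use_work_mem is hypothetical.
#if 0
static int example_use_work_mem(int n)
{
    float *scratch = (float *)get_work_mem(n * sizeof(float));
    if (scratch == NULL)
        return -1;  // device_malloc already set a Python MemoryError
    // ... launch a kernel that uses scratch as temporary storage ...
    return 0;       // do NOT free: the buffer is owned by get_work_mem
}
#endif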
  384. 384
  385. 385 /////////////////////////
  386. 386 // Static helper methods
  387. 387 /////////////////////////
  388. 388
  389. 389 static void
  390. 390 CudaNdarray_null_init(CudaNdarray*self)
  391. 391 {
  392. 392 self->base = NULL;
  393. 393 self->nd = -1;
  394. 394 self->host_structure = NULL;
  395. 395 self->data_allocated = 0;
  396. 396 self->dev_structure_fresh = 1;
  397. 397 self->dev_structure = NULL;
  398. 398 self->devdata = NULL;
  399. 399 }
  400. 400
  401. 401 static int
  402. 402 CudaNdarray_uninit(CudaNdarray*self)
  403. 403 {
  404. 404 #if PRINT_FREE_MALLOC
  405. 405 fprintf(stderr, "CudaNdarray_uninit %p\n", self);
  406. 406 #endif
  407. 407 int rval = 0;
  408. 408 if (self->data_allocated) {
  409. 409 assert(self->devdata);
  410. 410 if (device_free(self->devdata))
  411. 411 {
  412. 412 fprintf(stderr,
  413. 413 "CudaNdarray_uninit: error freeing self->devdata. (self=%p, self->devata=%p)\n",
  414. 414 self, self->devdata);
  415. 415 rval = -1;
  416. 416 }
  417. 417 self->devdata = NULL;
  418. 418 self->data_allocated = 0;
  419. 419 }
  420. 420 if (self->dev_structure)
  421. 421 {
  422. 422 if (device_free(self->dev_structure))
  423. 423 {
  424. 424 fprintf(stderr,
  425. 425 "CudaNdarray_uninit: error freeing dev_structure memory %p (self=%p)\n",
  426. 426 self->dev_structure, self);
  427. 427 rval = -1;
  428. 428 }
  429. 429 self->dev_structure = NULL;
  430. 430 }
  431. 431 if (self->host_structure)
  432. 432 {
  433. 433 free(self->host_structure);
  434. 434 self->host_structure = NULL;
  435. 435 }
  436. 436 self->nd = -1;
  437. 437 Py_XDECREF(self->base);
  438. 438 self->base = NULL;
  439. 439 return rval;
  440. 440 }
  441. 441
  442. 442
  443. 443 //make the rightmost coords change fastest
  444. 444 //TODO: why does a downward for-loop not work????
  445. 445 //TODO: use the log2_dims and driver code to remove / and %
  446. 446 //TODO: skip the last division (when d == 0)
  447. 447 #define decl_k_elemwise_unary_rowmajor(name, F) \
  448. 448 __global__ void name (unsigned int numEls, \
  449. 449 unsigned int nd, \
  450. 450 const int * dim, \
  451. 451 const float * a_data, const int * a_str, \
  452. 452 float * z_data, const int * z_str) \
  453. 453 { \
  454. 454 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x; \
  455. 455 const unsigned int numThreads = blockDim.x * gridDim.x; \
  456. 456 \
  457. 457 for (unsigned int i = idx; i < numEls; i += numThreads) \
  458. 458 { \
  459. 459 unsigned int ii = i; \
  460. 460 const float * a_i = a_data; \
  461. 461 float * z_i = z_data; \
  462. 462 for (unsigned int _d = 0; _d < nd; ++_d) \
  463. 463 { \
  464. 464 unsigned int d = nd - _d-1; \
  465. 465 int i_d = ii % dim[d]; /* i_d is our position in the d'th dimension */ \
  466. 466 ii = ii / dim[d]; \
  467. 467 a_i += i_d * a_str[d]; /* increment our a and z pointers by i_d elements */ \
  468. 468 z_i += i_d * z_str[d]; \
  469. 469 } \
  470. 470 z_i[0] = F(a_i[0]); \
  471. 471 } \
  472. 472 }
  473. 473
  474. 474 template<typename T> __device__ T unary_copy(T a) { return a; }
  475. 475 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_copy, unary_copy<float>)
  476. 476
  477. 477 template<typename T> __device__ T unary_exp(T a) { return exp(a); }
  478. 478 decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_exp, unary_exp<float>)
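
// A minimal sketch of how a further element-wise kernel could be generated
// with the same macro. unary_neg and k_elemwise_unary_rowmajor_neg below are
// hypothetical examples, not operators defined in this file.
#if 0
template<typename T> __device__ T unary_neg(T a) { return -a; }
decl_k_elemwise_unary_rowmajor(k_elemwise_unary_rowmajor_neg, unary_neg<float>)
#endif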
  479. 479
  480. 480 /////////////////////////////
  481. 481 // Satisfying reqs to be Type
  482. 482 /////////////////////////////
  483. 483
// DON'T use directly (if other CudaNdarrays point to it, it will cause problems)! Use Py_DECREF() instead.
  485. 485 static void
  486. 486 CudaNdarray_dealloc(CudaNdarray* self)
  487. 487 {
  488. 488 if (0) std::cerr << "CudaNdarray dealloc " << self << " " << self->devdata << '\n';
  489. 489 if(Py_REFCNT(self) > 1)
  490. 490 printf("WARNING:CudaNdarray_dealloc called when there is still active reference to it.\n");
  491. 491 CudaNdarray_uninit(self);
  492. 492 Py_TYPE(self)->tp_free((PyObject*)self);
  493. 493 --_outstanding_mallocs[1];
  494. 494 if (0)
  495. 495 {
  496. 496 fprintf(stderr, "device_malloc_counts: (device) %i (obj) %i\n",
  497. 497 _outstanding_mallocs[0],
  498. 498 _outstanding_mallocs[1]);
  499. 499 }
  500. 500 }
  501. 501
  502. 502 static PyObject *
  503. 503 CudaNdarray_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
  504. 504 {
  505. 505 CudaNdarray *self;
  506. 506
  507. 507 self = (CudaNdarray *)type->tp_alloc(type, 0);
  508. 508 if (self != NULL)
  509. 509 {
  510. 510 CudaNdarray_null_init(self);
  511. 511 ++_outstanding_mallocs[1];
  512. 512 }
  513. 513 return (PyObject *)self;
  514. 514 }
  515. 515 static int
  516. 516 CudaNdarray_init(CudaNdarray *self, PyObject *args, PyObject *kwds)
  517. 517 {
  518. 518 PyObject *arr=NULL;
  519. 519
  520. 520 if (! PyArg_ParseTuple(args, "O", &arr))
  521. 521 return -1;
  522. 522 if (! PyArray_Check(arr))
  523. 523 {
  524. 524 PyErr_SetString(PyExc_TypeError, "PyArray arg required");
  525. 525 return -1;
  526. 526 }
  527. 527 int rval = CudaNdarray_CopyFromArray(self, (PyArrayObject*)arr);
  528. 528 return rval;
  529. 529 }
  530. 530 static PyMemberDef CudaNdarray_members[] =
  531. 531 {
  532. 532 /*
  533. 533 {"first", T_OBJECT_EX, offsetof(CudaNdarray, first), 0,
  534. 534 "first name"},
  535. 535 {"last", T_OBJECT_EX, offsetof(CudaNdarray, last), 0,
  536. 536 "last name"},
  537. 537 {"number", T_INT, offsetof(CudaNdarray, number), 0,
  538. 538 "noddy number"},
  539. 539 */
  540. 540 {NULL} /* Sentinel */
  541. 541 };
  542. 542
  543. 543 PyObject * CudaNdarray_CreateArrayObj(CudaNdarray * self, PyObject *args)
  544. 544 {
  545. 545 PyObject * dtype = NULL;
  546. 546 if (args && !PyArg_ParseTuple(args, "|O", &dtype))
  547. 547 return NULL;
  548. 548 if (dtype) {
  549. 549 PyArray_Descr* dtype2;
// PyArray_DescrConverter tries to convert anything to a PyArray_Descr.
  551. 551 if(!PyArray_DescrConverter(dtype, &dtype2))
  552. 552 {
  553. 553 PyObject * str = PyObject_Repr(dtype);
  554. 554 PyErr_Format(PyExc_TypeError,
  555. 555 "CudaNdarray dtype parameter not understood: %s",
  556. 556 PyString_AsString(str)
  557. 557 );
  558. 558 Py_CLEAR(str);
  559. 559 return NULL;
  560. 560 }
  561. 561 int typeNum = dtype2->type_num;
  562. 562 Py_DECREF(dtype2);
  563. 563 if (typeNum != NPY_FLOAT32)
  564. 564 {
  565. 565 PyObject * str = PyObject_Repr(dtype);
  566. 566 PyErr_Format(PyExc_TypeError,
  567. 567 "CudaNdarray support only support float32 dtype, provided: %d",
  568. 568 typeNum
  569. 569 );
  570. 570 Py_CLEAR(str);
  571. 571 return NULL;
  572. 572 }
  573. 573 }
  574. 574
  575. 575 int verbose = 0;
  576. 576 if(self->nd>=0 && CudaNdarray_SIZE(self)==0){
  577. 577 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
  578. 578 assert (npydims);
  579. 579 for (int i = 0; i < self->nd; ++i) npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
  580. 580 PyObject * rval = PyArray_SimpleNew(self->nd, npydims, REAL_TYPENUM);
  581. 581 free(npydims);
  582. 582 if (!rval){
  583. 583 return NULL;
  584. 584 }
  585. 585 assert (PyArray_ITEMSIZE((PyArrayObject *)rval) == sizeof(real));
  586. 586 return rval;
  587. 587 }
  588. 588 if ((self->nd < 0) || (self->devdata == 0))
  589. 589 {
  590. 590 PyErr_SetString(PyExc_ValueError, "can't copy from un-initialized CudaNdarray");
  591. 591 return NULL;
  592. 592 }
  593. 593 CudaNdarray * contiguous_self = NULL;
  594. 594 if (CudaNdarray_is_c_contiguous(self))
  595. 595 {
  596. 596 contiguous_self = self;
  597. 597 Py_INCREF(contiguous_self);
  598. 598 if (verbose) std::cerr << "CreateArrayObj already contiguous" << contiguous_self << '\n';
  599. 599 }
  600. 600 else
  601. 601 {
  602. 602 contiguous_self = (CudaNdarray*)CudaNdarray_Copy(self);
  603. 603 if (verbose) std::cerr << "CreateArrayObj created contiguous" << contiguous_self << '\n';
  604. 604 }
  605. 605 if (!contiguous_self)
  606. 606 {
  607. 607 return NULL;
  608. 608 }
  609. 609
  610. 610 npy_intp * npydims = (npy_intp*)malloc(self->nd * sizeof(npy_intp));
  611. 611 assert (npydims);
  612. 612 for (int i = 0; i < self->nd; ++i)
  613. 613 npydims[i] = (npy_intp)(CudaNdarray_HOST_DIMS(self)[i]);
  614. 614 PyArrayObject * rval = (PyArrayObject *) PyArray_SimpleNew(self->nd,
  615. 615 npydims,
  616. 616 REAL_TYPENUM);
  617. 617 free(npydims);
  618. 618 if (!rval)
  619. 619 {
  620. 620 Py_DECREF(contiguous_self);
  621. 621 return NULL;
  622. 622 }
  623. 623
  624. 624 assert (PyArray_ITEMSIZE(rval) == sizeof(real));
  625. 625
  626. 626 npy_intp rval_size = PyArray_SIZE(rval);
  627. 627 void *rval_data = PyArray_DATA(rval);
  628. 628 cudaError_t err;
  629. 629 CNDA_BEGIN_ALLOW_THREADS;
  630. 630
  631. 631 err = cudaMemcpy(rval_data, contiguous_self->devdata,
  632. 632 rval_size * sizeof(real),
  633. 633 cudaMemcpyDeviceToHost
  634. 634 );
  635. 635 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
  636. 636 CNDA_END_ALLOW_THREADS;
  637. 637
  638. 638 if (cudaSuccess != err)
  639. 639 {
  640. 640 PyErr_Format(PyExc_RuntimeError, "error (%s)copying data to host",
  641. 641 cudaGetErrorString(err));
  642. 642 Py_DECREF(rval);
  643. 643 rval = NULL;
  644. 644 }
  645. 645
  646. 646 Py_DECREF(contiguous_self);
  647. 647 return (PyObject *)rval;
  648. 648 }
  649. 649
// TODO -- we have two functions here, ZEROS and Zeros.
// ZEROS is meant to be called only from C code (you don't need to pass it PyObject *s),
// but this naming is very weird and makes it look like a macro.
// We should figure out the correct convention and change to it.
  654. 654 PyObject* CudaNdarray_ZEROS(int n, int * dims)
  655. 655 {
  656. 656
  657. 657 size_t total_elements = 1;
  658. 658
  659. 659 for(size_t i=0;i<n;i++){
  660. 660 // Detect overflow on unsigned integer
  661. 661 if (dims[i] != 0 && total_elements > (SIZE_MAX / dims[i])) {
  662. 662 PyErr_Format(PyExc_RuntimeError,
  663. 663 "Can't store in size_t for the bytes requested %llu * %llu",
  664. 664 (unsigned long long)total_elements,
  665. 665 (unsigned long long)dims[i]);
  666. 666 return NULL;
  667. 667 }
  668. 668 total_elements*=dims[i];
  669. 669 }
  670. 670
  671. 671 // total_elements now contains the size of the array, in reals
  672. 672 if (total_elements > (SIZE_MAX / sizeof(real))){
  673. 673 PyErr_Format(PyExc_RuntimeError,
  674. 674 "Can't store in size_t for the bytes requested %llu * 4",
  675. 675 (unsigned long long)total_elements);
  676. 676 return NULL;
  677. 677 }
  678. 678 size_t total_size = total_elements * sizeof(real);
  679. 679
  680. 680 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_New();
  681. 681 if (!rval)
  682. 682 {
  683. 683 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: call to New failed");
  684. 684 return NULL;
  685. 685 }
  686. 686
  687. 687 if (CudaNdarray_alloc_contiguous(rval, n, dims))
  688. 688 {
  689. 689 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: allocation failed.");
  690. 690 Py_DECREF(rval);
  691. 691 return NULL;
  692. 692 }
  693. 693
  694. 694 // Fill with zeros
  695. 695 //fprintf(stdout, "Sizeof: %d\n", total_size);
  696. 696 if (cudaSuccess != cudaMemset(rval->devdata, 0, total_size))
  697. 697 {
  698. 698 PyErr_Format(PyExc_MemoryError,
  699. 699 "CudaNdarray_ZEROS: Error memsetting %llu bytes of device memory.",
  700. 700 (unsigned long long)total_size);
  701. 701 Py_DECREF(rval);
  702. 702 return NULL;
  703. 703 }
  704. 704
  705. 705 if (cnda_copy_structure_to_device(rval))
  706. 706 {
  707. 707 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_ZEROS: syncing structure to device failed");
  708. 708 Py_DECREF(rval);
  709. 709 return NULL;
  710. 710 }
  711. 711 return (PyObject*) rval;
  712. 712 }
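
// A minimal sketch of calling CudaNdarray_ZEROS() from C code. The 2x3 shape
// and example_zeros_2x3 are arbitrary illustrations; the returned object is a
// new reference owned by the caller, or NULL with a Python exception set.
#if 0
static PyObject *example_zeros_2x3()
{
    int dims[2] = {2, 3};
    PyObject *z = CudaNdarray_ZEROS(2, dims);  // NULL on error
    return z;                                  // caller must Py_DECREF when done
}
#endif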
  713. 713
  714. 714 // declared as a static method (hence 1st parameter is not used)
  715. 715 // Based on _Copy and _dimshuffle
  716. 716 PyObject* CudaNdarray_Zeros(PyObject* _unused, PyObject* shape)
  717. 717 {
  718. 718 if(!shape)
  719. 719 {
  720. 720 PyErr_SetString(PyExc_TypeError, "CudaNdarray_Zeros: function takes at least 1 argument (0 given)");
  721. 721 return NULL;
  722. 722 }
  723. 723 if(!PySequence_Check(shape))
  724. 724 {
  725. 725 PyErr_SetString(PyExc_TypeError, "shape argument must be a sequence");
  726. 726 return NULL;
  727. 727 }
  728. 728
  729. 729 int shplen = PySequence_Length(shape);
  730. 730
  731. 731 if (shplen == 0)
  732. 732 {
  733. 733 return CudaNdarray_ZEROS(0, NULL);
  734. 734 }
  735. 735
  736. 736 int* newdims = (int *)malloc(sizeof(int) * shplen);
  737. 737
  738. 738 if (!newdims)
  739. 739 {
  740. 740 PyErr_SetString(PyExc_MemoryError,
  741. 741 "CudaNdarray_Zeros: Failed to allocate temporary space");
  742. 742 return NULL;
  743. 743 }
  744. 744
  745. 745 // start from the end to compute strides
  746. 746 for (int i = shplen-1; i >= 0; --i)
  747. 747 {
  748. 748 PyObject* shp_el_obj = PySequence_GetItem(shape, i);
  749. 749 if(shp_el_obj == NULL)
  750. 750 {
  751. 751 // shouldn't happen since we checked length before...
  752. 752 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_Zeros: Index out of bound in sequence");
  753. 753 free(newdims);
  754. 754 return NULL;
  755. 755 }
  756. 756
  757. 757 int shp_el = PyInt_AsLong(shp_el_obj);
  758. 758 Py_DECREF(shp_el_obj);
  759. 759
  760. 760 if (shp_el < 0)
  761. 761 {
  762. 762 PyErr_SetString(PyExc_ValueError, "CudaNdarray_Zeros: shape must contain only non-negative values for size of a dimension");
  763. 763 free(newdims);
  764. 764 return NULL;
  765. 765 }
  766. 766
  767. 767 newdims[i] = shp_el;
  768. 768 }
  769. 769
  770. 770 PyObject* rval = CudaNdarray_ZEROS(shplen,newdims);
  771. 771
  772. 772 free(newdims);
  773. 773
  774. 774 return (PyObject*)rval;
  775. 775 }
  776. 776
  777. 777
  778. 778
  779. 779
  780. 780
  781. 781 PyObject * CudaNdarray_Copy(const CudaNdarray * self)
  782. 782 {
  783. 783 PyObject * rval = CudaNdarray_New();
  784. 784 if ((!rval) || (-1 == self->nd))
  785. 785 {
  786. 786 return rval;
  787. 787 }
  788. 788 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, self->nd, CudaNdarray_HOST_DIMS(self)))
  789. 789 {
  790. 790 Py_DECREF(rval);
  791. 791 return NULL;
  792. 792 }
  793. 793 if (CudaNdarray_CopyFromCudaNdarray((CudaNdarray*)rval, self))
  794. 794 {
  795. 795 Py_DECREF(rval);
  796. 796 return NULL;
  797. 797 }
  798. 798 return rval;
  799. 799 }
  800. 800 PyObject * CudaNdarray_DeepCopy(CudaNdarray * self, PyObject * memo)
  801. 801 {
  802. 802 assert(PyDict_Check(memo));
  803. 803 PyObject * selfkey = PyInt_FromLong((long)self);
  804. 804 assert(selfkey);
  805. 805 if (PyDict_Contains(memo, selfkey))
  806. 806 {
  807. 807 PyObject * rval = PyDict_GetItem(memo, selfkey);
  808. 808 Py_DECREF(selfkey);
  809. 809 Py_XINCREF(rval);
  810. 810 return rval;
  811. 811 }
  812. 812 else
  813. 813 {
  814. 814 PyObject * rval = CudaNdarray_Copy(self);
  815. 815 if (0) std::cerr << "DeepCopy created " << rval << " devdata " << ((CudaNdarray*)rval)->devdata << "\n";
  816. 816 if (NULL == rval)
  817. 817 {
  818. 818 Py_DECREF(selfkey);
  819. 819 return NULL;
  820. 820 }
  821. 821 if (PyDict_SetItem(memo, selfkey, rval))
  822. 822 {
  823. 823 Py_DECREF(rval);
  824. 824 Py_DECREF(selfkey);
  825. 825 return NULL;
  826. 826 }
  827. 827 Py_DECREF(selfkey);
  828. 828 return rval;
  829. 829 }
  830. 830 }
  831. 831 PyObject * CudaNdarray_ReduceSum(CudaNdarray * self, PyObject * py_reduce_mask)
  832. 832 {
  833. 833 if (!PySequence_Check(py_reduce_mask))
  834. 834 {
  835. 835 PyErr_SetString(PyExc_TypeError, "reduce_mask must be sequence of ints");
  836. 836 return NULL;
  837. 837 }
  838. 838 int len = PySequence_Length(py_reduce_mask);
  839. 839 if (len != self->nd)
  840. 840 {
  841. 841 PyErr_SetString(PyExc_TypeError, "length of reduce_mask must match self->nd");
  842. 842 return NULL;
  843. 843 }
  844. 844 CudaNdarray * self_sum = (CudaNdarray*)CudaNdarray_New();
  845. 845 if (!self_sum)
  846. 846 {
  847. 847 return NULL;
  848. 848 }
  849. 849 //TODO: allocate a fixed size dimshuffle_pattern_cache on the stack,
  850. 850 // and use it if it is big enough.
  851. 851 int * dimshuffle_pattern = (int*)malloc(len * 2 * sizeof(int));
  852. 852 int * sum_dims = dimshuffle_pattern + len;
  853. 853 int n_remaining_dims = 0;
  854. 854 if (!dimshuffle_pattern)
  855. 855 {
  856. 856 Py_DECREF(self_sum);
  857. 857 PyErr_SetString(PyExc_MemoryError, "failed to alloc internal storage");
  858. 858 return NULL;
  859. 859 }
  860. 860 for (int i = 0; i < len; ++i)
  861. 861 {
  862. 862 PyObject *o_i = PySequence_GetItem(py_reduce_mask, i);
  863. 863 int o_i_int = PyInt_AsLong(o_i);
  864. 864 Py_XDECREF(o_i);
  865. 865 if (PyErr_Occurred())
  866. 866 {
  867. 867 Py_DECREF(self_sum);
  868. 868 free(dimshuffle_pattern);
  869. 869 return NULL;
  870. 870 }
  871. 871 if (o_i_int) // this is a dimension over which we are reducing
  872. 872 {
  873. 873 sum_dims[i] = 1;
  874. 874 }
  875. 875 else
  876. 876 {
  877. 877 sum_dims[i] = CudaNdarray_HOST_DIMS(self)[i];
  878. 878 dimshuffle_pattern[n_remaining_dims++] = i;
  879. 879 }
  880. 880 }
  881. 881 if (0 || CudaNdarray_alloc_contiguous(self_sum, len, sum_dims)
  882. 882 || CudaNdarray_reduce_sum(self_sum, self)
  883. 883 || CudaNdarray_dimshuffle(self_sum, n_remaining_dims, dimshuffle_pattern))
  884. 884 {
  885. 885 Py_DECREF(self_sum);
  886. 886 free(dimshuffle_pattern);
  887. 887 return NULL;
  888. 888 }
  889. 889 free(dimshuffle_pattern);
  890. 890 return (PyObject*)self_sum;
  891. 891 }
  892. 892
// Reshape self to the new shape given by the tuple shape.
//
// If self is c contiguous, it returns a view. Otherwise it always does a copy.
  896. 896 // TODO: make it return a view when the strides allow it even if it is not
  897. 897 // c contiguous
  898. 898 PyObject * CudaNdarray_Reshape(CudaNdarray * self, PyObject * shape)
  899. 899 {
  900. 900 if(!CudaNdarray_is_c_contiguous(self))
  901. 901 {
  902. 902 // allocate new space
  903. 903 //TODO: test to see if we can re-use old one and take a new param to
  904. 904 // use this
  905. 905 CudaNdarray* rval = (CudaNdarray*) CudaNdarray_Copy(self);
  906. 906 if (!rval)
  907. 907 {
  908. 908 return NULL;
  909. 909 }
  910. 910
  911. 911 CudaNdarray* ret = (CudaNdarray*) CudaNdarray_Reshape(rval, shape);
  912. 912 Py_XDECREF(rval);
  913. 913 return (PyObject*)ret;
  914. 914 }
  915. 915
  916. 916 // check shape tuple
  917. 917 unsigned int rval_nd;
  918. 918 unsigned int * rval_dims;
  919. 919 size_t rval_size = 1;
  920. 920
  921. 921 if (PyTuple_Check(shape)){
  922. 922 // copy shape to integer array
  923. 923 rval_nd = PyTuple_Size(shape);
  924. 924 }else if (PyInt_Check(shape)){
  925. 925 rval_nd = 1;
  926. 926 }else{
  927. 927 PyErr_SetString(PyExc_TypeError, "shape must be tuple of integers or an integer");
  928. 928 return NULL;
  929. 929 }
  930. 930 rval_dims = (unsigned int*)malloc(rval_nd * sizeof(int));
  931. 931
  932. 932 if(PyTuple_Check(shape)){
  933. 933 for (int i = 0; i < rval_nd; ++i)
  934. 934 {
// GetItem returns a borrowed reference. Read the value into a signed long
// first, so that negative dimensions can be detected before they are stored
// into the unsigned rval_dims array.
long shp_el = PyInt_AsLong(PyTuple_GetItem(shape, i));
if (PyErr_Occurred()) //error in AsLong
{
free(rval_dims);
return NULL;
}
if(shp_el<0){
PyErr_Format(PyExc_ValueError, "Reshape has invalid dimension %ld (must be >=0)", shp_el);
free(rval_dims);
return NULL;
}
rval_dims[i] = shp_el;
  946. 946 rval_size = rval_size * rval_dims[i];
  947. 947 }
  948. 948 }else{
  949. 949 rval_size = PyInt_AsLong(shape);
  950. 950 rval_dims[0] = rval_size;
  951. 951 }
  952. 952 // calculate new size, assert same as old size
  953. 953 if (rval_size != CudaNdarray_SIZE(self))
  954. 954 {
  955. 955 PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
  956. 956 free(rval_dims);
  957. 957 return NULL;
  958. 958 }
  959. 959 if (rval_size==0)
  960. 960 {
  961. 961 PyObject * rval = CudaNdarray_NewDims(rval_nd, rval_dims);
  962. 962 free(rval_dims);
  963. 963 return rval;
  964. 964 }
  965. 965
  966. 966 //return a view, not a copy
  967. 967 //we can do this as we checked self is c_contiguous
  968. 968 CudaNdarray * rval = (CudaNdarray * )CudaNdarray_New(rval_nd);
  969. 969
  970. 970 if (!rval || 0 != rval->data_allocated
  971. 971 ||CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
  972. 972 {
  973. 973 Py_XDECREF(rval);
  974. 974 free(rval_dims);
  975. 975 return NULL;
  976. 976 }
  977. 977 //set dim and stride
  978. 978 int size = 1;
  979. 979 for (int i = rval_nd-1; i >= 0; --i)
  980. 980 {
  981. 981 CudaNdarray_set_stride(rval, i, (rval_dims[i] == 1) ? 0 : size);
  982. 982 CudaNdarray_set_dim(rval, i, rval_dims[i]);
  983. 983 size = size * rval_dims[i];
  984. 984 }
  985. 985 free(rval_dims);
  986. 986 return (PyObject*)rval;
  987. 987 }
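
// A minimal sketch of what the view path above produces, assuming flat6 is a
// hypothetical c-contiguous CudaNdarray holding 6 elements: reshaping it to
// (2, 3) shares devdata with the original and gets element strides {3, 1}.
#if 0
static PyObject *example_reshape_view(CudaNdarray *flat6)
{
    PyObject *shape = Py_BuildValue("(ii)", 2, 3);  // the new shape (2, 3)
    PyObject *view = CudaNdarray_Reshape(flat6, shape);
    Py_DECREF(shape);
    return view;  // a view: no device copy is made when flat6 is contiguous
}
#endif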
  988. 988
  989. 989 PyObject * CudaNdarray_View(const CudaNdarray * self)
  990. 990 {
  991. 991 CudaNdarray * rval = (CudaNdarray*)CudaNdarray_New(self->nd);
  992. 992 if (!rval || CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
  993. 993 {
  994. 994 Py_XDECREF(rval);
  995. 995 rval = NULL;
  996. 996 }
  997. 997 else
  998. 998 {
  999. 999 for (int i = 0; i < self->nd; ++i)
  1000. 1000 {
  1001. 1001 CudaNdarray_set_dim(rval, i, CudaNdarray_HOST_DIMS(self)[i]);
  1002. 1002 CudaNdarray_set_stride(rval, i, CudaNdarray_HOST_STRIDES(self)[i]);
  1003. 1003 }
  1004. 1004 }
  1005. 1005 return (PyObject*)rval;
  1006. 1006 }
  1007. 1007
/*
* d0,... are the output dims
* indices is the list of indices to operate on
* They are int64 viewed as float32.
* a is the output
* b is the input
* dB0 is the size of the source's leading dimension
*/
  1016. 1016 template <int operator_num>
  1017. 1017 __global__ void k_take_3(const int d0, const int d1, const int d2,
  1018. 1018 const npy_int64* indices,
  1019. 1019 float* a,
  1020. 1020 const int sA0, const int sA1, const int sA2,
  1021. 1021 const float* b, const int dB0,
  1022. 1022 const int sB0, const int sB1, const int sB2,
  1023. 1023 int* err){
  1024. 1024 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
  1025. 1025 npy_int64 idx = indices[i0];
  1026. 1026 if (idx<0)
  1027. 1027 idx += dB0; // To allow negative indexing.
  1028. 1028 if ((idx < 0) || (idx >= dB0)){
// Any value other than 0 would probably work. But to be safer, I want
// to change all bits, to prevent problems with concurrent writes that
// could cross a cache line. This should not happen with the
// current code and driver.
  1033. 1033 *err = 0xFFFF;
  1034. 1034 continue;
  1035. 1035 }
  1036. 1036 for (int i1 = threadIdx.x; i1 < d1; i1 += blockDim.x){
  1037. 1037 for (int i2 = threadIdx.y; i2 < d2; i2 += blockDim.y){
  1038. 1038 int a_idx = i0*sA0 + i1*sA1 + i2*sA2;
  1039. 1039 int b_idx = idx*sB0 + i1*sB1 + i2*sB2;
  1040. 1040 a[a_idx] = b[b_idx];
  1041. 1041 }
  1042. 1042 }
  1043. 1043 }
  1044. 1044 }
  1045. 1045
  1046. 1046 // We try to be similar to the PyArray_TakeFrom function
  1047. 1047 //http://docs.scipy.org/doc/numpy/reference/c-api.array.html
//TODO: support clip modes other than raise (clip, wrap)
//self is the input that we copy data from.
//The indices that we receive MUST be a CudaNdarray(float32)
// that is in fact a view of int64 indices
  1052. 1052 PyObject*
  1053. 1053 CudaNdarray_TakeFrom(CudaNdarray * self, PyObject *args){
  1054. 1054 int verbose = 0;
  1055. 1055 PyObject * indices_obj = NULL;
//int axis; Default None, which means the flattened array.
  1057. 1057 PyObject * axis_obj = Py_None;
  1058. 1058 PyObject * out_obj = Py_None;
  1059. 1059 PyObject * clipmode_obj = NULL;
  1060. 1060 int max_threads = 1; // max threads per blocks
  1061. 1061
  1062. 1062 if (! PyArg_ParseTuple(args, "O|OOOi", &indices_obj, &axis_obj,
  1063. 1063 &out_obj, &clipmode_obj, &max_threads))
  1064. 1064 return NULL;
  1065. 1065
  1066. 1066 //Check argument indices
  1067. 1067 //TODO: if not a numpy.ndarray, convert to numpy.ndarray
//TODO: If a CudaNdarray, accept it and assume the data is int32? Or float32 numbers holding ints?
//TODO: Support ndarrays of dtypes other than int64
  1070. 1070 //TODO: support list of indices that are not c_contiguous
  1071. 1071 CudaNdarray * indices = NULL;
  1072. 1072 if (CudaNdarray_Check(indices_obj)) {
  1073. 1073 if (verbose) printf("cudandarray indices\n");
  1074. 1074 indices = (CudaNdarray*) indices_obj;
  1075. 1075 Py_INCREF(indices);
  1076. 1076 } else if (PyArray_Check(indices_obj)) {
  1077. 1077 if (verbose) printf("ndarray indices\n");
  1078. 1078 if (PyArray_TYPE((PyArrayObject *)indices_obj) != NPY_INT64) {
  1079. 1079 PyErr_SetString(PyExc_TypeError,
  1080. 1080 "CudaNdarray_TakeFrom: need a ndarray for indices"
  1081. 1081 " with dtype int64");
  1082. 1082 return NULL;
  1083. 1083 }
  1084. 1084 if (PyArray_NDIM(((PyArrayObject*)indices_obj)) != 1) {
  1085. 1085 PyErr_SetString(PyExc_TypeError,
  1086. 1086 "CudaNdarray_TakeFrom: need a CudaNdarray of"
  1087. 1087 " indices with only 1 dimensions");
  1088. 1088 return NULL;
  1089. 1089 }
  1090. 1090 // We need indices_obj to be contiguous, in order to take a view
  1091. 1091 // with a different dtype.
  1092. 1092 if (!PyArray_IS_C_CONTIGUOUS((PyArrayObject*) indices_obj)) {
  1093. 1093 PyObject* indices_obj_contig = PyArray_NewCopy((PyArrayObject*) indices_obj, NPY_CORDER);
  1094. 1094 if (!indices_obj_contig)
  1095. 1095 return NULL;
  1096. 1096 indices_obj = indices_obj_contig;
  1097. 1097 } else {
  1098. 1098 // Keep the refcount consistent
  1099. 1099 Py_INCREF(indices_obj);
  1100. 1100 }
  1101. 1101 PyArray_Descr* float32_descr = PyArray_DescrFromType(NPY_FLOAT32);
  1102. 1102 PyObject * indices_float32 = NULL;
  1103. 1103 indices_float32 = PyArray_View((PyArrayObject*)indices_obj,
  1104. 1104 float32_descr, NULL);
  1105. 1105 if (verbose) printf("ndarray indices\n");
  1106. 1106 if (!indices_float32) {
  1107. 1107 Py_DECREF(indices_obj);
  1108. 1108 return NULL;
  1109. 1109 }
  1110. 1110
  1111. 1111 indices = (CudaNdarray*) CudaNdarray_New();
  1112. 1112 if (verbose) printf("\nndarray after new\n");
  1113. 1113 if (! indices){
  1114. 1114 Py_DECREF(indices_obj);
  1115. 1115 Py_DECREF(indices_float32);
  1116. 1116 return NULL;
  1117. 1117 }
  1118. 1118 if (CudaNdarray_CopyFromArray(indices,
  1119. 1119 (PyArrayObject *)indices_float32)){
  1120. 1120 Py_DECREF(indices_obj);
  1121. 1121 Py_DECREF(indices_float32);
  1122. 1122 return NULL;
  1123. 1123 }
  1124. 1124 Py_DECREF(indices_obj);
  1125. 1125 Py_DECREF(indices_float32);
  1126. 1126 } else {
  1127. 1127 PyObject* py_s = PyObject_Str(indices_obj);
  1128. 1128 const char* s = PyString_AsString(py_s);
  1129. 1129 Py_DECREF(py_s);
  1130. 1130 PyErr_Format(PyExc_TypeError,
  1131. 1131 "CudaNdarray_TakeFrom: need an ndarray of int64 or a"
  1132. 1132 " CudaNdarray(float32) that is a view from int64 data"
  1133. 1133 " for indices. Got %s", s);
  1134. 1134 return NULL;
  1135. 1135 }
  1136. 1136
  1137. 1137 if (verbose) {
  1138. 1138 printf("indices used on the gpu\n");
  1139. 1139 fprint_CudaNdarray(stdout, indices);
  1140. 1140 PyObject * used_indices = CudaNdarray_CreateArrayObj(indices);
  1141. 1141 PyObject_Print(used_indices, stdout, 0);
  1142. 1142 Py_DECREF(used_indices);
  1143. 1143 }
  1144. 1144 if (verbose) printf("after print of object\n");
if(!CudaNdarray_is_c_contiguous(indices)) {
  1146. 1146 PyErr_SetString(PyExc_NotImplementedError,
  1147. 1147 "CudaNdarray_TakeFrom: The indices must be contiguous in memory.");
  1148. 1148 Py_DECREF(indices);
  1149. 1149 return NULL;
  1150. 1150 }
  1151. 1151 int nb_indices = CudaNdarray_SIZE((CudaNdarray *)indices) / 2;// int64 are 8 bytes, float32 are 4 bytes
  1152. 1152
  1153. 1153 //Check argument axis
  1154. 1154 //TODO: implement the default and other axis
  1155. 1155 long axis = PyInt_AsLong(axis_obj);
  1156. 1156
  1157. 1157 if (axis != 0) {
  1158. 1158 PyErr_Format(PyExc_NotImplementedError,
  1159. 1159 "CudaNdarray_TakeFrom: only axis=0 is currently supported."
  1160. 1160 " Got %ld.", axis);
  1161. 1161 Py_DECREF(indices);
  1162. 1162 return NULL;
  1163. 1163 }
  1164. 1164
  1165. 1165 //Check argument out_obj
  1166. 1166 CudaNdarray * out = NULL;
  1167. 1167 if (out_obj && CudaNdarray_Check(out_obj))
  1168. 1168 out = (CudaNdarray*) out_obj;
  1169. 1169 if (out && (out->nd != self->nd ||
  1170. 1170 CudaNdarray_HOST_DIMS(out)[0] != nb_indices))
  1171. 1171 out = NULL;
  1172. 1172 int * dims = (int *)malloc(sizeof(int) * self->nd);
  1173. 1173 dims[0] = nb_indices;
  1174. 1174
  1175. 1175 for (int i=1 ; i<self->nd ; i++) {
  1176. 1176 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
  1177. 1177 if (out && CudaNdarray_HOST_DIMS(out)[i] != dims[i]) {
  1178. 1178 out = NULL;
  1179. 1179 }
  1180. 1180 }
  1181. 1181 if (!out) {
  1182. 1182 out = (CudaNdarray*)CudaNdarray_New();
  1183. 1183 if (!out){
  1184. 1184 Py_DECREF(indices);
  1185. 1185 free(dims);
  1186. 1186 return NULL;
  1187. 1187 }
  1188. 1188 if (CudaNdarray_alloc_contiguous(out, self->nd, dims)) {
  1189. 1189 Py_DECREF(out);
  1190. 1190 Py_DECREF(indices);
  1191. 1191 free(dims);
  1192. 1192 return NULL;
  1193. 1193 }
  1194. 1194 }else {
  1195. 1195 Py_INCREF(out);
  1196. 1196 }
  1197. 1197
  1198. 1198 //Check argument clipmode
  1199. 1199 if (clipmode_obj) {
  1200. 1200 char * clipmode = PyString_AsString(clipmode_obj);
  1201. 1201 if (! clipmode){
  1202. 1202 Py_DECREF(indices);
  1203. 1203 Py_DECREF(out);
  1204. 1204 free(dims);
  1205. 1205 return NULL;
  1206. 1206 }
  1207. 1207 if (strcmp(clipmode, "raise") != 0) {
  1208. 1208 PyErr_Format(PyExc_NotImplementedError,
  1209. 1209 "CudaNdarray_TakeFrom: only the raise mode is currently supported. Got '%s'",
  1210. 1210 clipmode);
  1211. 1211 Py_DECREF(indices);
  1212. 1212 Py_DECREF(out);
  1213. 1213 free(dims);
  1214. 1214 return NULL;
  1215. 1215 }
  1216. 1216 }
  1217. 1217 void (*k3)(const int, const int, const int,
  1218. 1218 const npy_int64*,
  1219. 1219 float*, const int, const int, const int,
  1220. 1220 const float*, const int,
  1221. 1221 const int, const int, const int,
  1222. 1222 int*);
  1223. 1223 k3 = k_take_3<CPY>;
  1224. 1224
  1225. 1225 // Create the memory place that will store the error information.
  1226. 1226 if(init_err_var() != 0) return NULL;
  1227. 1227
  1228. 1228 dim3 n_blocks(std::min(CudaNdarray_HOST_DIMS(out)[0],65535),1,1);
  1229. 1229 if(CudaNdarray_HOST_DIMS(out)[0] == 0){
  1230. 1230 // We take 0 elements, so no need for the rest of the code.
// This speeds up that case AND fixes a crash otherwise.
  1232. 1232 free(dims);
  1233. 1233 Py_DECREF(indices);
  1234. 1234 return (PyObject *)out;
  1235. 1235 }
  1236. 1236
  1237. 1237 switch (self->nd) {
  1238. 1238 case 1:
  1239. 1239 {
  1240. 1240 dim3 n_threads(1, 1, 1);
  1241. 1241 if (verbose)
  1242. 1242 printf("cudaGetLastError=%d, nd=%d"
  1243. 1243 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
  1244. 1244 " n_threads.x=%i, n_threads.y=%i)\n",
  1245. 1245 cudaGetLastError(), self->nd,
  1246. 1246 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
  1247. 1247 k3<<<n_blocks, n_threads>>>(
  1248. 1248 dims[0],
  1249. 1249 1,
  1250. 1250 1,
  1251. 1251 (npy_int64*) CudaNdarray_DEV_DATA(indices),
  1252. 1252 CudaNdarray_DEV_DATA(out),
  1253. 1253 CudaNdarray_HOST_STRIDES(out)[0], //strides
  1254. 1254 1,
  1255. 1255 1,
  1256. 1256 CudaNdarray_DEV_DATA(self),
  1257. 1257 CudaNdarray_HOST_DIMS(self)[0], //For indices check
  1258. 1258 CudaNdarray_HOST_STRIDES(self)[0], //strides
  1259. 1259 1,
  1260. 1260 1,
  1261. 1261 err_var);
  1262. 1262 }
  1263. 1263 break;
  1264. 1264 case 2:
  1265. 1265 {
  1266. 1266 dim3 n_threads(std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads), 1, 1);
  1267. 1267
  1268. 1268 if (verbose)
  1269. 1269 printf("cudaGetLastError=%d, nd=%d"
  1270. 1270 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
  1271. 1271 " n_threads.x=%i, n_threads.y=%i)\n",
  1272. 1272 cudaGetLastError(), self->nd,
  1273. 1273 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
  1274. 1274
  1275. 1275 k3<<<n_blocks, n_threads>>>(
  1276. 1276 dims[0], //dimensions
  1277. 1277 dims[1],
  1278. 1278 1,
  1279. 1279 (npy_int64*) CudaNdarray_DEV_DATA(indices),
  1280. 1280 CudaNdarray_DEV_DATA(out),
  1281. 1281 CudaNdarray_HOST_STRIDES(out)[0], //strides
  1282. 1282 CudaNdarray_HOST_STRIDES(out)[1],
  1283. 1283 1,
  1284. 1284 CudaNdarray_DEV_DATA(self),
  1285. 1285 CudaNdarray_HOST_DIMS(self)[0], //For indices check
  1286. 1286 CudaNdarray_HOST_STRIDES(self)[0], //strides
  1287. 1287 CudaNdarray_HOST_STRIDES(self)[1],
  1288. 1288 1,
  1289. 1289 err_var);
  1290. 1290 }
  1291. 1291 break;
  1292. 1292 case 3:
  1293. 1293 {
  1294. 1294 int ty = std::min(CudaNdarray_HOST_DIMS(out)[2], max_threads);
  1295. 1295 int tx = std::min(CudaNdarray_HOST_DIMS(out)[1], max_threads / ty);
  1296. 1296 dim3 n_threads(tx, ty, 1);
  1297. 1297 if (verbose)
  1298. 1298 printf("cudaGetLastError=%d, nd=%d"
  1299. 1299 " kernel config: (n_blocks.x=%d, n_blocks.y=%d,"
  1300. 1300 " n_threads.x=%i, n_threads.y=%i)\n",
  1301. 1301 cudaGetLastError(), self->nd,
  1302. 1302 n_blocks.x, n_blocks.y, n_threads.x, n_threads.y);
  1303. 1303 k3<<<n_blocks, n_threads>>>(
  1304. 1304 dims[0], //dimensions
  1305. 1305 dims[1],
  1306. 1306 dims[2],
  1307. 1307 (npy_int64*) CudaNdarray_DEV_DATA(indices),
  1308. 1308 CudaNdarray_DEV_DATA(out),
  1309. 1309 CudaNdarray_HOST_STRIDES(out)[0], //strides
  1310. 1310 CudaNdarray_HOST_STRIDES(out)[1],
  1311. 1311 CudaNdarray_HOST_STRIDES(out)[2],
  1312. 1312 CudaNdarray_DEV_DATA(self),
  1313. 1313 CudaNdarray_HOST_DIMS(self)[0], //For indices check
  1314. 1314 CudaNdarray_HOST_STRIDES(self)[0], //strides
  1315. 1315 CudaNdarray_HOST_STRIDES(self)[1],
  1316. 1316 CudaNdarray_HOST_STRIDES(self)[2],
  1317. 1317 err_var);
  1318. 1318 }
  1319. 1319 break;
  1320. 1320 default:
  1321. 1321 PyErr_SetString(PyExc_NotImplementedError,
  1322. 1322 "CudaNdarray_TakeFrom: only input with 1, 2 or 3"
  1323. 1323 " dimensions are currently supported");
  1324. 1324
  1325. 1325 }
  1326. 1326 free(dims);
  1327. 1327 CNDA_THREAD_SYNC;
  1328. 1328 cudaError_t err = cudaGetLastError();
  1329. 1329 if (cudaSuccess != err) {
  1330. 1330 PyErr_Format(PyExc_RuntimeError,
  1331. 1331 "Cuda error: %s: %s.\n",
  1332. 1332 "CudaNdarray_TakeFrom",
  1333. 1333 cudaGetErrorString(err));
  1334. 1334 Py_DECREF(indices);
  1335. 1335 Py_DECREF(out);
  1336. 1336 return NULL;
  1337. 1337 }
  1338. 1338
  1339. 1339 int index_err = check_err_var();
  1340. 1340 Py_DECREF(indices);
  1341. 1341 if (index_err != 0) {
  1342. 1342 Py_DECREF(out);
  1343. 1343 return NULL;
  1344. 1344 }
  1345. 1345
  1346. 1346 if (verbose) printf("TAKE SUCCEDED\n");
  1347. 1347 return (PyObject *)out;
  1348. 1348 }
  1349. 1349
  1350. 1350
  1351. 1351 PyObject * CudaNdarray_SetStride(CudaNdarray * self, PyObject *args)
  1352. 1352 {
  1353. 1353 int pos, stride;
  1354. 1354 if (! PyArg_ParseTuple(args, "ii", &pos, &stride))
  1355. 1355 return NULL;
  1356. 1356 if ((pos < 0) || (pos >= self->nd))
  1357. 1357 {
  1358. 1358 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
  1359. 1359 return NULL;
  1360. 1360 }
  1361. 1361 CudaNdarray_set_stride(self, pos, stride);
  1362. 1362 if (cnda_copy_structure_to_device(self))
  1363. 1363 {
  1364. 1364 return NULL;
  1365. 1365 }
  1366. 1366 Py_INCREF(Py_None);
  1367. 1367 return Py_None;
  1368. 1368 }
  1369. 1369 PyObject * CudaNdarray_SetShapeI(CudaNdarray * self, PyObject *args)
  1370. 1370 {
  1371. 1371 int pos, dim;
  1372. 1372 if (! PyArg_ParseTuple(args, "ii", &pos, &dim))
  1373. 1373 return NULL;
  1374. 1374 if ((pos < 0) || (pos >= self->nd))
  1375. 1375 {
  1376. 1376 PyErr_Format(PyExc_ValueError, "position argument out of legal range [0, %i)", self->nd);
  1377. 1377 return NULL;
  1378. 1378 }
  1379. 1379 CudaNdarray_set_dim(self, pos, dim);
  1380. 1380 if (cnda_copy_structure_to_device(self))
  1381. 1381 {
  1382. 1382 return NULL;
  1383. 1383 }
  1384. 1384 Py_INCREF(Py_None);
  1385. 1385 return Py_None;
  1386. 1386 }
  1387. 1387
  1388. 1388 static PyObject *
  1389. 1389 CudaNdarray_exp(CudaNdarray* self)
  1390. 1390 {
  1391. 1391 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
  1392. 1392 if ((NULL == rval) || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
  1393. 1393 {
  1394. 1394 Py_XDECREF(rval);
  1395. 1395 return NULL;
  1396. 1396 }
  1397. 1397 unsigned int size = 1;
  1398. 1398 for (int i = 0; i < self->nd; i++)
  1399. 1399 {
  1400. 1400 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
  1401. 1401 }
  1402. 1402 unsigned int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
  1403. 1403 unsigned int n_blocks = std::min(ceil_intdiv(size,threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
  1404. 1404 k_elemwise_unary_rowmajor_exp<<<n_blocks,threads_per_block>>>(size, self->nd, CudaNdarray_DEV_DIMS(self),
  1405. 1405 CudaNdarray_DEV_DATA(self), CudaNdarray_DEV_STRIDES(self),
  1406. 1406 CudaNdarray_DEV_DATA(rval), CudaNdarray_DEV_STRIDES(rval));
  1407. 1407
  1408. 1408 //TODO: don't do this right away, do it when we need the result
  1409. 1409 CNDA_THREAD_SYNC;
  1410. 1410 cudaError_t err = cudaGetLastError();
  1411. 1411 if( cudaSuccess != err)
  1412. 1412 {
  1413. 1413 Py_DECREF(rval);
  1414. 1414 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kExp", cudaGetErrorString(err));
  1415. 1415 return NULL;
  1416. 1416 }
  1417. 1417
  1418. 1418 return (PyObject*)rval;
  1419. 1419 }
  1420. 1420
  1421. 1421 static PyMethodDef CudaNdarray_methods[] =
  1422. 1422 {
  1423. 1423 {"__array__",
  1424. 1424 (PyCFunction)CudaNdarray_CreateArrayObj, METH_VARARGS,
  1425. 1425 "Copy from the device to a numpy ndarray"},
  1426. 1426 {"__copy__",
  1427. 1427 (PyCFunction)CudaNdarray_View, METH_NOARGS,
  1428. 1428 "Create a shallow copy of this object. used by module copy"},
  1429. 1429 {"__deepcopy__",
  1430. 1430 (PyCFunction)CudaNdarray_DeepCopy, METH_O,
  1431. 1431 "Create a copy of this object"},
  1432. 1432 {"zeros",
  1433. 1433 (PyCFunction)CudaNdarray_Zeros, METH_STATIC | METH_O,
  1434. 1434 "Create a new CudaNdarray with specified shape, filled with zeros."},
  1435. 1435 {"copy",
  1436. 1436 (PyCFunction)CudaNdarray_Copy, METH_NOARGS,
  1437. 1437 "Create a copy of this object"},
  1438. 1438 {"is_c_contiguous",
  1439. 1439 (PyCFunction)CudaNdarray_IS_C_Contiguous, METH_NOARGS,
  1440. 1440 "Return True is the object is c contiguous. False otherwise."},
  1441. 1441 {"reduce_sum",
  1442. 1442 (PyCFunction)CudaNdarray_ReduceSum, METH_O,
  1443. 1443 "Reduce over the given dimensions by summation"},
  1444. 1444 {"exp",
  1445. 1445 (PyCFunction)CudaNdarray_exp, METH_NOARGS,
  1446. 1446 "Return the exponential of all elements"},
  1447. 1447 {"reshape",
  1448. 1448 (PyCFunction)CudaNdarray_Reshape, METH_O,
  1449. 1449 "Return a reshaped view (or copy) of this ndarray\n\
  1450. 1450 The required argument is a tuple of integers specifying the shape of the new ndarray."},
  1451. 1451 {"view",
  1452. 1452 (PyCFunction)CudaNdarray_View, METH_NOARGS,
  1453. 1453 "Return an alias of this ndarray"},
  1454. 1454 {"_set_stride",
  1455. 1455 (PyCFunction)CudaNdarray_SetStride, METH_VARARGS,
  1456. 1456 "For integer arguments (i, s), set the 'i'th stride to 's'"},
  1457. 1457 {"take",
  1458. 1458 (PyCFunction)CudaNdarray_TakeFrom, METH_VARARGS,
  1459. 1459 "Equivalent of numpy.take"},
  1460. 1460 {"_set_shape_i",
  1461. 1461 (PyCFunction)CudaNdarray_SetShapeI, METH_VARARGS,
  1462. 1462 "For integer arguments (i, s), set the 'i'th shape to 's'"},
{NULL, NULL, 0, NULL} /* Sentinel */
  1464. 1464 };
  1465. 1465
  1466. 1466
  1467. 1467 ////////////////////
  1468. 1468 // Number protocol
  1469. 1469 ////////////////////
  1470. 1470
  1471. 1471 __global__ void kAdd_contiguous(float* a, float* b, float* dest, unsigned int numEls) {
  1472. 1472 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  1473. 1473 const unsigned int numThreads = blockDim.x * gridDim.x;
  1474. 1474
  1475. 1475 for (unsigned int i = idx; i < numEls; i += numThreads) {
  1476. 1476 dest[i] = a[i] + b[i];
  1477. 1477 }
  1478. 1478 }
  1479. 1479
  1480. 1480 // Will be called by __add__ in Python
  1481. 1481 static PyObject *
  1482. 1482 CudaNdarray_add(PyObject* py_self, PyObject * py_other)
  1483. 1483 {
  1484. 1484 if (! CudaNdarray_Check(py_self)) {
  1485. 1485 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on left");
  1486. 1486 return NULL;
  1487. 1487 }
  1488. 1488 if (! CudaNdarray_Check(py_other)) {
  1489. 1489 PyErr_SetString(PyExc_TypeError, "need a CudaNdarray on right");
  1490. 1490 return NULL;
  1491. 1491 }
  1492. 1492 CudaNdarray * self = (CudaNdarray *)py_self;
  1493. 1493 CudaNdarray * other = (CudaNdarray *)py_other;
  1494. 1494 if(!CudaNdarray_is_c_contiguous(self) || !CudaNdarray_is_c_contiguous(other)){
PyErr_SetString(PyExc_TypeError, "We have implemented only the c_contiguous version for now.");
  1496. 1496 return NULL;
  1497. 1497 }
  1498. 1498
  1499. 1499 //standard elemwise size checks
  1500. 1500 if (self->nd != other->nd)
  1501. 1501 {
  1502. 1502 PyErr_SetString(PyExc_TypeError, "CudaNdarray_add: need same number of dims");
  1503. 1503 return NULL;
  1504. 1504 }
  1505. 1505 //standard elemwise dim checks
  1506. 1506 unsigned int size = 1;
  1507. 1507 for (int i = 0; i< self->nd; ++i)
  1508. 1508 {
  1509. 1509 if (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
  1510. 1510 {
  1511. 1511 PyErr_SetString(PyExc_TypeError, "need same dimensions");
  1512. 1512 return NULL;
  1513. 1513 }
  1514. 1514 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
  1515. 1515 }
  1516. 1516 CudaNdarray * rval = (CudaNdarray *)CudaNdarray_New();
  1517. 1517 if (!rval || CudaNdarray_alloc_contiguous(rval, self->nd, CudaNdarray_HOST_DIMS(self)))
  1518. 1518 {
  1519. 1519 Py_XDECREF(rval);
  1520. 1520 return NULL;
  1521. 1521 }
  1522. 1522
  1523. 1523 if(CudaNdarray_SIZE((CudaNdarray *)py_self)==0 && CudaNdarray_SIZE((CudaNdarray *)py_other)==0){
  1524. 1524 return (PyObject *) rval;
  1525. 1525 }
  1526. 1526
  1527. 1527 int threads_per_block = std::min(size, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
  1528. 1528 int n_blocks = std::min(ceil_intdiv(size,(unsigned int)threads_per_block), (unsigned int)NUM_VECTOR_OP_BLOCKS);
  1529. 1529 kAdd_contiguous<<<n_blocks,threads_per_block>>>(
  1530. 1530 self->devdata, other->devdata, rval->devdata, size);
  1531. 1531 CNDA_THREAD_SYNC;
  1532. 1532 cudaError_t err = cudaGetLastError();
  1533. 1533 if( cudaSuccess != err)
  1534. 1534 {
  1535. 1535 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kAdd", cudaGetErrorString(err));
  1536. 1536 Py_DECREF(rval);
  1537. 1537 return NULL;
  1538. 1538 }
  1539. 1539 return (PyObject *) rval;
  1540. 1540 }
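// Illustrative sketch (not used by the module): the same launch arithmetic as
// CudaNdarray_add above, applied to raw device buffers.  The helper name
// example_add_contiguous is hypothetical.  If, say, numEls == 1000000 and
// NUM_VECTOR_OP_THREADS_PER_BLOCK == 256, this picks 256 threads per block and
// ceil_intdiv(1000000, 256) == 3907 blocks (capped at NUM_VECTOR_OP_BLOCKS);
// elements beyond gridDim.x * blockDim.x are swept up by the "i += numThreads"
// grid-stride loop inside kAdd_contiguous.
static cudaError_t example_add_contiguous(float* d_a, float* d_b,
                                          float* d_dest, unsigned int numEls)
{
    int threads_per_block = std::min(numEls,
            (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
    int n_blocks = std::min(ceil_intdiv(numEls, (unsigned int)threads_per_block),
            (unsigned int)NUM_VECTOR_OP_BLOCKS);
    kAdd_contiguous<<<n_blocks, threads_per_block>>>(d_a, d_b, d_dest, numEls);
    CNDA_THREAD_SYNC;
    return cudaGetLastError();
}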
  1541. 1541
  1542. 1542 template <int operator_num>
  1543. 1543 __global__ void k_ielem_3(const int d0, const int d1, const int d2,
  1544. 1544 float* a, const int sA0, const int sA1, const int sA2,
  1545. 1545 const float* b, const int sB0, const int sB1, const int sB2){
  1546. 1546 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
  1547. 1547 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
  1548. 1548 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
  1549. 1549 switch (operator_num)
  1550. 1550 {
  1551. 1551 case IADD:
  1552. 1552 a[i0*sA0 + i1*sA1 + i2*sA2] += b[i0*sB0 + i1*sB1 + i2*sB2];
  1553. 1553 break;
  1554. 1554 case IDIV:
  1555. 1555 a[i0*sA0 + i1*sA1 + i2*sA2] /= b[i0*sB0 + i1*sB1 + i2*sB2];
  1556. 1556 break;
  1557. 1557 case CPY:
  1558. 1558 a[i0*sA0 + i1*sA1 + i2*sA2] = b[i0*sB0 + i1*sB1 + i2*sB2];
  1559. 1559 break;
  1560. 1560 }
  1561. 1561 }
  1562. 1562 }
  1563. 1563 }
  1564. 1564 }
  1565. 1565
  1566. 1566 template <int operator_num>
  1567. 1567 __global__ void k_ielem_4(const int d0, const int d1, const int d2, const int d3,
  1568. 1568 float* a, const int sA0, const int sA1,
  1569. 1569 const int sA2, const int sA3,
  1570. 1570 const float* b, const int sB0, const int sB1,
  1571. 1571 const int sB2, const int sB3){
  1572. 1572 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
  1573. 1573 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
  1574. 1574 for (int i2 = threadIdx.x; i2 < d2; i2 += blockDim.x){
  1575. 1575 for (int i3 = threadIdx.y; i3 < d3; i3 += blockDim.y){
  1576. 1576 switch (operator_num) {
  1577. 1577 case IADD:
  1578. 1578 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
  1579. 1579 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
  1580. 1580 break;
  1581. 1581 case IDIV:
  1582. 1582 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
  1583. 1583 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
  1584. 1584 break;
  1585. 1585 case CPY:
  1586. 1586 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3]
  1587. 1587 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3];
  1588. 1588 break;
  1589. 1589 }
  1590. 1590 }
  1591. 1591 }
  1592. 1592 }
  1593. 1593 }
  1594. 1594 }
  1595. 1595
  1596. 1596 template <int operator_num>
  1597. 1597 __global__ void k_ielem_6(const int d0, const int d1,
  1598. 1598 const int d2, const int d3,
  1599. 1599 const int d4, const int d5,
  1600. 1600 float* a, const int sA0, const int sA1,
  1601. 1601 const int sA2, const int sA3,
  1602. 1602 const int sA4, const int sA5,
  1603. 1603 const float* b, const int sB0, const int sB1,
  1604. 1604 const int sB2, const int sB3,
  1605. 1605 const int sB4, const int sB5
  1606. 1606 ){
  1607. 1607 for (int i0 = blockIdx.x; i0 < d0; i0 += gridDim.x){
  1608. 1608 for (int i1 = blockIdx.y; i1 < d1; i1 += gridDim.y){
  1609. 1609 for (int i2 = blockIdx.z; i2 < d2; i2 += gridDim.z){
  1610. 1610 for (int i3 = threadIdx.x; i3 < d3; i3 += blockDim.x){
  1611. 1611 for (int i4 = threadIdx.y; i4 < d4; i4 += blockDim.y){
  1612. 1612 for (int i5 = threadIdx.z; i5 < d5; i5 += blockDim.z){
  1613. 1613 switch (operator_num) {
  1614. 1614 case IADD:
  1615. 1615 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
  1616. 1616 += b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
  1617. 1617 break;
  1618. 1618 case IDIV:
  1619. 1619 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
  1620. 1620 /= b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
  1621. 1621 break;
  1622. 1622 case CPY:
  1623. 1623 a[i0*sA0 + i1*sA1 + i2*sA2 + i3*sA3 + i4*sA4 + i5*sA5]
  1624. 1624 = b[i0*sB0 + i1*sB1 + i2*sB2 + i3*sB3 + i4*sB4 + i5*sB5];
  1625. 1625 break;
  1626. 1626 }
  1627. 1627 }
  1628. 1628 }
  1629. 1629 }
  1630. 1630 }
  1631. 1631 }
  1632. 1632 }
  1633. 1633 }
  1634. 1634
  1635. 1635 /*
  1636. 1636 CudaNdarray_inplace_elemwise
1637. 1637 Compute an elementwise operation, working in place on A.
1638. 1638 Currently implemented: A /= B, A += B and A = B
  1639. 1639 (the last is not tested and not used!)
  1640. 1640
  1641. 1641 py_self - the CudaNdarray that we'll modify (A)
  1642. 1642 py_other - the other argument (B)
  1643. 1643 fct_nb - which operation to perform (operator_t)
  1644. 1644
  1645. 1645 Returns 0 on success.
  1646. 1646 Returns -1 on failure, and sets Python exception.
  1647. 1647
  1648. 1648 */
  1649. 1649 int
  1650. 1650 CudaNdarray_inplace_elemwise(PyObject* py_self, PyObject * py_other, operator_t fct_nb)
  1651. 1651 {
  1652. 1652 int verbose = 0;
  1653. 1653 void (*k3)(const int, const int, const int,
  1654. 1654 float*, const int, const int, const int,
  1655. 1655 const float*, const int, const int, const int);
  1656. 1656 void (*k4)(const int, const int, const int, const int,
  1657. 1657 float*, const int, const int,
  1658. 1658 const int, const int,
  1659. 1659 const float*, const int, const int,
  1660. 1660 const int, const int);
  1661. 1661 void (*k6)(const int, const int,
  1662. 1662 const int, const int,
  1663. 1663 const int, const int,
  1664. 1664 float*, const int, const int,
  1665. 1665 const int, const int,
  1666. 1666 const int, const int,
  1667. 1667 const float*, const int, const int,
  1668. 1668 const int, const int,
  1669. 1669 const int, const int);
  1670. 1670 switch (fct_nb)
  1671. 1671 {
  1672. 1672 case IADD:
  1673. 1673 k3 = k_ielem_3<IADD>;
  1674. 1674 k4 = k_ielem_4<IADD>;
  1675. 1675 k6 = k_ielem_6<IADD>;
  1676. 1676 break;
  1677. 1677 case IDIV:
  1678. 1678 k3 = k_ielem_3<IDIV>;
  1679. 1679 k4 = k_ielem_4<IDIV>;
  1680. 1680 k6 = k_ielem_6<IDIV>;
  1681. 1681 break;
  1682. 1682 case CPY:
  1683. 1683 k3 = k_ielem_3<CPY>;
  1684. 1684 k4 = k_ielem_4<CPY>;
  1685. 1685 k6 = k_ielem_6<CPY>;
  1686. 1686 break;
  1687. 1687 default:
  1688. 1688 assert (0);
  1689. 1689 PyErr_Format(
  1690. 1690 PyExc_TypeError,
  1691. 1691 "CudaNdarray_inplace_elemwise invalid fct_nb (%i).",
  1692. 1692 (int)fct_nb);
  1693. 1693 return -1;
  1694. 1694 }
  1695. 1695 if (!CudaNdarray_Check(py_self)) {
  1696. 1696 PyErr_SetString(
  1697. 1697 PyExc_TypeError,
  1698. 1698 "CudaNdarray_inplace_elemwise need a CudaNdarray on left");
  1699. 1699 return -1;
  1700. 1700 }
  1701. 1701 CudaNdarray * new_other = NULL;
  1702. 1702 if (!CudaNdarray_Check(py_other)) {
  1703. 1703 new_other = (CudaNdarray*) CudaNdarray_New();
  1704. 1704 if(!new_other)
  1705. 1705 {
  1706. 1706 return -1;
  1707. 1707 }
  1708. 1708 if(CudaNdarray_CopyFromArray(new_other, (PyArrayObject *) py_other))
  1709. 1709 {
  1710. 1710 Py_XDECREF(new_other);
  1711. 1711 return -1;
  1712. 1712 }
  1713. 1713 py_other = (PyObject *) new_other;
  1714. 1714 }
  1715. 1715
  1716. 1716 CudaNdarray * self = (CudaNdarray *)py_self;
  1717. 1717 CudaNdarray * other = (CudaNdarray *)py_other;
  1718. 1718
  1719. 1719 if (verbose)
  1720. 1720 {
  1721. 1721 fprintf(stderr,
  1722. 1722 "INPLACE ADD/DIV for self->nd=%d other->nd=%d\n",
  1723. 1723 self->nd, other->nd);
  1724. 1724 }
  1725. 1725
  1726. 1726 //standard elemwise nb dim checks
  1727. 1727 if (self->nd < other->nd)
  1728. 1728 {
  1729. 1729 PyErr_Format(
  1730. 1730 PyExc_TypeError,
  1731. 1731 "CudaNdarray_inplace_elemwise: The destination need more or the"
  1732. 1732 " same number of dimensions then the source. Got %d and %d.",
  1733. 1733 self->nd, other->nd);
  1734. 1734 Py_XDECREF(new_other);
  1735. 1735 return -1;
  1736. 1736 }
  1737. 1737
  1738. 1738 //broadcast to the same number of dimensions.
  1739. 1739 int* other_dims = (int*) alloca(self->nd * sizeof(int));
  1740. 1740 int* other_strides = (int*) alloca(self->nd * sizeof(int));
  1741. 1741 int added_dims = self->nd - other->nd;
  1742. 1742 // Add the added broadcasted dimensions
  1743. 1743 for (int i = 0; i< added_dims; ++i)
  1744. 1744 {
  1745. 1745 other_dims[i] = 1;
  1746. 1746 other_strides[i] = 0;
  1747. 1747 }
  1748. 1748 // Copy the existing dimensions
  1749. 1749 for (int i = 0; i< other->nd; ++i)
  1750. 1750 {
  1751. 1751 other_dims[i+added_dims] = CudaNdarray_HOST_DIMS(other)[i];
  1752. 1752 other_strides[i+added_dims] = CudaNdarray_HOST_STRIDES(other)[i];
  1753. 1753 }
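    // Worked example (illustrative only): if self->nd == 3 with dims {4, 3, 2}
    // and other->nd == 2 with c-contiguous dims {3, 2} (element strides {2, 1}),
    // then added_dims == 1 and, after the two loops above,
    //     other_dims    == {1, 3, 2}
    //     other_strides == {0, 2, 1}
    // so the padded leading axis is broadcast by advancing with stride 0.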
  1754. 1754
  1755. 1755 //standard elemwise dim checks
  1756. 1756 unsigned int size = 1;
  1757. 1757 for (int i = 0; i< self->nd; ++i)
  1758. 1758 {
  1759. 1759 if ((CudaNdarray_HOST_DIMS(self)[i] != other_dims[i])
  1760. 1760 && (other_dims[i] != 1))
  1761. 1761 {
  1762. 1762 PyErr_SetString(
  1763. 1763 PyExc_ValueError,
  1764. 1764 "CudaNdarray_inplace_elemwise need same dimensions (or broadcastable dimension)");
  1765. 1765 Py_XDECREF(new_other);
  1766. 1766 return -1;
  1767. 1767 }
  1768. 1768 // if we're broadcasting other, then make sure it has stride 0
  1769. 1769 assert ((CudaNdarray_HOST_DIMS(self)[i] == other_dims[i])
  1770. 1770 || (other_strides[i] == 0));
  1771. 1771 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
  1772. 1772 }
  1773. 1773
  1774. 1774 if (size==0)
  1775. 1775 {
  1776. 1776 int other_size = CudaNdarray_SIZE((CudaNdarray *)py_other);
  1777. 1777 if (!(other_size == 0 || other_size == 1))
  1778. 1778 {
  1779. 1779 PyErr_SetString(
  1780. 1780 PyExc_ValueError,
  1781. 1781 "CudaNdarray_inplace_elemwise cannot work inplace on"
  1782. 1782 " un-initialized array when the new value have more than"
  1783. 1783 " 0 or 1 broadcastable dimensions");
  1784. 1784 Py_XDECREF(new_other);
1785. 1785 return -1;
  1786. 1786 }
  1787. 1787 Py_XDECREF(new_other);
  1788. 1788 return 0;
  1789. 1789 }
  1790. 1790
  1791. 1791 switch(self->nd)
  1792. 1792 {
  1793. 1793 case 0:
  1794. 1794 {
  1795. 1795 dim3 n_blocks(1, 1, 1);
  1796. 1796 dim3 n_threads(1);
  1797. 1797 k3<<<n_blocks, n_threads>>>(
  1798. 1798 1, //d0
  1799. 1799 1, //d1
  1800. 1800 1, //d2
  1801. 1801 CudaNdarray_DEV_DATA(self),
  1802. 1802 1, //strides
  1803. 1803 1,
  1804. 1804 1,
  1805. 1805 CudaNdarray_DEV_DATA(other),
  1806. 1806 1, //strides
  1807. 1807 1,
  1808. 1808 1);
  1809. 1809 CNDA_THREAD_SYNC;
  1810. 1810 cudaError_t err = cudaGetLastError();
  1811. 1811 if (cudaSuccess != err)
  1812. 1812 {
  1813. 1813 PyErr_Format(
  1814. 1814 PyExc_RuntimeError,
  1815. 1815 "CudaNdarray_inplace_elemwise case0: Cuda error: %s: %s.\n",
  1816. 1816 "k3",
  1817. 1817 cudaGetErrorString(err));
  1818. 1818 Py_XDECREF(new_other);
  1819. 1819 return -1;
  1820. 1820 }
  1821. 1821 }
  1822. 1822 break;
  1823. 1823 case 1:
  1824. 1824 {
  1825. 1825 dim3 n_blocks(1, 1, 1);
  1826. 1826 dim3 n_threads(
  1827. 1827 std::min(
  1828. 1828 CudaNdarray_HOST_DIMS(self)[0],
  1829. 1829 NUM_VECTOR_OP_THREADS_PER_BLOCK));
  1830. 1830 k3<<<n_blocks, n_threads>>>(
  1831. 1831 1, //dimensions
  1832. 1832 1,
  1833. 1833 CudaNdarray_HOST_DIMS(self)[0],
  1834. 1834 CudaNdarray_DEV_DATA(self),
  1835. 1835 1, //strides
  1836. 1836 1,
  1837. 1837 CudaNdarray_HOST_STRIDES(self)[0],
  1838. 1838 CudaNdarray_DEV_DATA(other),
  1839. 1839 1, //strides
  1840. 1840 1,
  1841. 1841 other_strides[0]);
  1842. 1842 CNDA_THREAD_SYNC;
  1843. 1843 cudaError_t err = cudaGetLastError();
  1844. 1844 if (cudaSuccess != err)
  1845. 1845 {
  1846. 1846 PyErr_Format(
  1847. 1847 PyExc_RuntimeError,
  1848. 1848 "CudaNdarray_inplace_elemwise case1: Cuda error: %s: %s.\n",
  1849. 1849 "k3",
  1850. 1850 cudaGetErrorString(err));
  1851. 1851 Py_XDECREF(new_other);
  1852. 1852 return -1;
  1853. 1853 }
  1854. 1854 }
  1855. 1855 break;
  1856. 1856 case 2:
  1857. 1857 {
  1858. 1858 //TODO: if both self and other are f-contiguous
  1859. 1859 // Then flip the block and thread dimensions
  1860. 1860 // to make contiguous reads & writes
  1861. 1861 dim3 n_blocks(1,
  1862. 1862 std::min(
  1863. 1863 CudaNdarray_HOST_DIMS(self)[0],
  1864. 1864 NUM_VECTOR_OP_BLOCKS));
  1865. 1865 dim3 n_threads(
  1866. 1866 std::min(
  1867. 1867 CudaNdarray_HOST_DIMS(self)[1],
  1868. 1868 NUM_VECTOR_OP_THREADS_PER_BLOCK));
  1869. 1869 k3<<<n_blocks, n_threads>>>(1,
  1870. 1870 CudaNdarray_HOST_DIMS(self)[0],
  1871. 1871 CudaNdarray_HOST_DIMS(self)[1],
  1872. 1872 CudaNdarray_DEV_DATA(self),
  1873. 1873 1,
  1874. 1874 CudaNdarray_HOST_STRIDES(self)[0],
  1875. 1875 CudaNdarray_HOST_STRIDES(self)[1],
  1876. 1876 CudaNdarray_DEV_DATA(other),
  1877. 1877 1,
  1878. 1878 other_strides[0],
  1879. 1879 other_strides[1]);
  1880. 1880 CNDA_THREAD_SYNC;
  1881. 1881 cudaError_t err = cudaGetLastError();
  1882. 1882 if (cudaSuccess != err)
  1883. 1883 {
  1884. 1884 PyErr_Format(
  1885. 1885 PyExc_RuntimeError,
  1886. 1886 "CudaNdarray_inplace_elemwise case2: Cuda error: %s: %s.\n",
  1887. 1887 "k3",
  1888. 1888 cudaGetErrorString(err));
  1889. 1889 Py_XDECREF(new_other);
  1890. 1890 return -1;
  1891. 1891 }
  1892. 1892 }
  1893. 1893 break;
  1894. 1894 case 3:
  1895. 1895 {
  1896. 1896 //TODO: Dimshuffle so that at least one of the arrays
  1897. 1897 // has a contiguous dimension on the thread idx.
  1898. 1898 dim3 n_blocks(
  1899. 1899 std::min(
  1900. 1900 CudaNdarray_HOST_DIMS(self)[0],
  1901. 1901 NUM_VECTOR_OP_BLOCKS),
  1902. 1902 CudaNdarray_HOST_DIMS(self)[1]);
  1903. 1903 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
  1904. 1904 n_blocks.y /= 2;
  1905. 1905 dim3 n_threads(
  1906. 1906 std::min(
  1907. 1907 CudaNdarray_HOST_DIMS(self)[2],
  1908. 1908 NUM_VECTOR_OP_THREADS_PER_BLOCK));
  1909. 1909 k3<<<n_blocks, n_threads>>>(
  1910. 1910 CudaNdarray_HOST_DIMS(self)[0],
  1911. 1911 CudaNdarray_HOST_DIMS(self)[1],
  1912. 1912 CudaNdarray_HOST_DIMS(self)[2],
  1913. 1913 CudaNdarray_DEV_DATA(self),
  1914. 1914 CudaNdarray_HOST_STRIDES(self)[0],
  1915. 1915 CudaNdarray_HOST_STRIDES(self)[1],
  1916. 1916 CudaNdarray_HOST_STRIDES(self)[2],
  1917. 1917 CudaNdarray_DEV_DATA(other),
  1918. 1918 other_strides[0],
  1919. 1919 other_strides[1],
  1920. 1920 other_strides[2]);
  1921. 1921 CNDA_THREAD_SYNC;
  1922. 1922 cudaError_t err = cudaGetLastError();
  1923. 1923 if (cudaSuccess != err)
  1924. 1924 {
  1925. 1925 PyErr_Format(
  1926. 1926 PyExc_RuntimeError,
  1927. 1927 "CudaNdarray_inplace_elemwise case3: Cuda error: %s: %s.\n",
  1928. 1928 "k3",
  1929. 1929 cudaGetErrorString(err));
  1930. 1930 Py_XDECREF(new_other);
  1931. 1931 return -1;
  1932. 1932 }
  1933. 1933 }
  1934. 1934 break;
  1935. 1935 case 4:
  1936. 1936 {
  1937. 1937 dim3 n_blocks(
  1938. 1938 std::min(
  1939. 1939 CudaNdarray_HOST_DIMS(self)[0],
  1940. 1940 NUM_VECTOR_OP_BLOCKS),
  1941. 1941 CudaNdarray_HOST_DIMS(self)[1]
  1942. 1942 );
  1943. 1943 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
  1944. 1944 n_blocks.y /= 2;
  1945. 1945 dim3 n_threads(
  1946. 1946 std::min(
  1947. 1947 CudaNdarray_HOST_DIMS(self)[2],
  1948. 1948 NUM_VECTOR_OP_THREADS_PER_BLOCK)
  1949. 1949 //TODO: DON"T YOU NEED OT PUT DIMS[3] in here???
  1950. 1950 );
  1951. 1951 k4<<<n_blocks, n_threads>>>(
  1952. 1952 CudaNdarray_HOST_DIMS(self)[0],
  1953. 1953 CudaNdarray_HOST_DIMS(self)[1],
  1954. 1954 CudaNdarray_HOST_DIMS(self)[2],
  1955. 1955 CudaNdarray_HOST_DIMS(self)[3],
  1956. 1956 CudaNdarray_DEV_DATA(self),
  1957. 1957 CudaNdarray_HOST_STRIDES(self)[0],
  1958. 1958 CudaNdarray_HOST_STRIDES(self)[1],
  1959. 1959 CudaNdarray_HOST_STRIDES(self)[2],
  1960. 1960 CudaNdarray_HOST_STRIDES(self)[3],
  1961. 1961 CudaNdarray_DEV_DATA(other),
  1962. 1962 other_strides[0],
  1963. 1963 other_strides[1],
  1964. 1964 other_strides[2],
  1965. 1965 other_strides[3]);
  1966. 1966 CNDA_THREAD_SYNC;
  1967. 1967 cudaError_t err = cudaGetLastError();
  1968. 1968 if (cudaSuccess != err)
  1969. 1969 {
  1970. 1970 PyErr_Format(
  1971. 1971 PyExc_RuntimeError,
  1972. 1972 "CudaNdarray_inplace_elemwise case4: Cuda error: %s: %s.\n",
  1973. 1973 "k4",
  1974. 1974 cudaGetErrorString(err));
  1975. 1975 Py_XDECREF(new_other);
  1976. 1976 return -1;
  1977. 1977 }
  1978. 1978 }
  1979. 1979 break;
  1980. 1980 case 5:
  1981. 1981 {
  1982. 1982 dim3 n_blocks(
  1983. 1983 std::min(
  1984. 1984 CudaNdarray_HOST_DIMS(self)[1],
  1985. 1985 NUM_VECTOR_OP_BLOCKS),
  1986. 1986 CudaNdarray_HOST_DIMS(self)[2]);
  1987. 1987 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
  1988. 1988 n_blocks.y /= 2;
  1989. 1989 dim3 n_threads(
  1990. 1990 std::min(
  1991. 1991 CudaNdarray_HOST_DIMS(self)[3],
  1992. 1992 NUM_VECTOR_OP_THREADS_PER_BLOCK)
  1993. 1993 //TODO: DON"T YOU NEED OT PUT DIMS[3] in here???
  1994. 1994 );
  1995. 1995 for (int i = 0; i < CudaNdarray_HOST_DIMS(self)[0]; ++i)
  1996. 1996 {
  1997. 1997 k4<<<n_blocks, n_threads>>>(
  1998. 1998 CudaNdarray_HOST_DIMS(self)[1],
  1999. 1999 CudaNdarray_HOST_DIMS(self)[2],
  2000. 2000 CudaNdarray_HOST_DIMS(self)[3],
  2001. 2001 CudaNdarray_HOST_DIMS(self)[4],
  2002. 2002 CudaNdarray_DEV_DATA(self) + i * CudaNdarray_HOST_STRIDES(self)[0],
  2003. 2003 CudaNdarray_HOST_STRIDES(self)[1],
  2004. 2004 CudaNdarray_HOST_STRIDES(self)[2],
  2005. 2005 CudaNdarray_HOST_STRIDES(self)[3],
  2006. 2006 CudaNdarray_HOST_STRIDES(self)[4],
  2007. 2007 CudaNdarray_DEV_DATA(other) + i * other_strides[0],
  2008. 2008 other_strides[1],
  2009. 2009 other_strides[2],
  2010. 2010 other_strides[3],
  2011. 2011 other_strides[4]);
  2012. 2012 CNDA_THREAD_SYNC;
  2013. 2013 cudaError_t err = cudaGetLastError();
  2014. 2014 if( cudaSuccess != err)
  2015. 2015 {
  2016. 2016 PyErr_Format(
  2017. 2017 PyExc_RuntimeError,
  2018. 2018 "CudaNdarray_inplace_elemwise case5: Cuda error: %s: %s. n_block=(%ld,%ld) n_threads=%ld\n",
  2019. 2019 "k5 with loop over k4",
  2020. 2020 cudaGetErrorString(err),
  2021. 2021 (long) n_blocks.x, (long) n_blocks.y, (long) n_threads.x);
  2022. 2022 Py_XDECREF(new_other);
  2023. 2023 return -1;
  2024. 2024 }
  2025. 2025 }
  2026. 2026 }
  2027. 2027 break;
  2028. 2028 case 6:
  2029. 2029 {
  2030. 2030 dim3 n_blocks(
  2031. 2031 std::min(
  2032. 2032 CudaNdarray_HOST_DIMS(self)[0],
  2033. 2033 NUM_VECTOR_OP_BLOCKS),
  2034. 2034 CudaNdarray_HOST_DIMS(self)[1],
  2035. 2035 CudaNdarray_HOST_DIMS(self)[2]
  2036. 2036 );
  2037. 2037 while (n_blocks.x * n_blocks.y > NUM_VECTOR_OP_BLOCKS)
  2038. 2038 n_blocks.y /= 2;
2039. 2039 // GTX 285 (compute capability 1.3) doesn't support n_blocks.z > 1
2040. 2040 // (compute capability 2.0 supports up to 65535 for n_blocks.z)
  2041. 2041 //while (n_blocks.x * n_blocks.y * n_blocks.z > NUM_VECTOR_OP_BLOCKS)
  2042. 2042 // n_blocks.z /= 2;
  2043. 2043 n_blocks.z = 1;
  2044. 2044 dim3 n_threads(
  2045. 2045 std::min(
  2046. 2046 CudaNdarray_HOST_DIMS(self)[3],
  2047. 2047 NUM_VECTOR_OP_THREADS_PER_BLOCK)
  2048. 2048 //TODO: DON'T YOU NEED TO PUT DIMS[4] in here???
  2049. 2049 //TODO: DON'T YOU NEED TO PUT DIMS[5] in here???
  2050. 2050 );
  2051. 2051 k6<<<n_blocks, n_threads>>>(
  2052. 2052 CudaNdarray_HOST_DIMS(self)[0],
  2053. 2053 CudaNdarray_HOST_DIMS(self)[1],
  2054. 2054 CudaNdarray_HOST_DIMS(self)[2],
  2055. 2055 CudaNdarray_HOST_DIMS(self)[3],
  2056. 2056 CudaNdarray_HOST_DIMS(self)[4],
  2057. 2057 CudaNdarray_HOST_DIMS(self)[5],
  2058. 2058 CudaNdarray_DEV_DATA(self),
  2059. 2059 CudaNdarray_HOST_STRIDES(self)[0],
  2060. 2060 CudaNdarray_HOST_STRIDES(self)[1],
  2061. 2061 CudaNdarray_HOST_STRIDES(self)[2],
  2062. 2062 CudaNdarray_HOST_STRIDES(self)[3],
  2063. 2063 CudaNdarray_HOST_STRIDES(self)[4],
  2064. 2064 CudaNdarray_HOST_STRIDES(self)[5],
  2065. 2065 CudaNdarray_DEV_DATA(other),
  2066. 2066 other_strides[0],
  2067. 2067 other_strides[1],
  2068. 2068 other_strides[2],
  2069. 2069 other_strides[3],
  2070. 2070 other_strides[4],
  2071. 2071 other_strides[5]);
  2072. 2072 CNDA_THREAD_SYNC;
  2073. 2073 cudaError_t err = cudaGetLastError();
  2074. 2074 if (cudaSuccess != err)
  2075. 2075 {
  2076. 2076 PyErr_Format(
  2077. 2077 PyExc_RuntimeError,
  2078. 2078 "CudaNdarray_inplace_elemwise case6: Cuda error: %s: %s. n_blocks=(%ld, %ld, %ld) n_threads=(%ld)\n",
  2079. 2079 "k6",
  2080. 2080 cudaGetErrorString(err),
  2081. 2081 (long) n_blocks.x, (long) n_blocks.y, (long) n_blocks.z,
  2082. 2082 (long) n_threads.x);
  2083. 2083 Py_XDECREF(new_other);
  2084. 2084 return -1;
  2085. 2085 }
  2086. 2086 }
  2087. 2087 break;
  2088. 2088 default:
  2089. 2089 {
  2090. 2090 PyErr_Format(
  2091. 2091 PyExc_NotImplementedError,
  2092. 2092 "inplace_elemwise w nd=%i\n",
  2093. 2093 self->nd);
  2094. 2094 Py_XDECREF(new_other);
  2095. 2095 return -1;
  2096. 2096 }
  2097. 2097 }
  2098. 2098 if (verbose)
  2099. 2099 fprintf(stderr, "INPLACE ADD/DIV end\n");
  2100. 2100 Py_XDECREF(new_other);
  2101. 2101 return 0;
  2102. 2102 }
  2103. 2103
  2104. 2104 /*
  2105. 2105 * We need this inplace Add to support IncSubTensor
  2106. 2106 * It returns py_self on success with an additional reference. Else NULL.
  2107. 2107 */
  2108. 2108 // Will be called by __iadd__ in Python
  2109. 2109 PyObject *
  2110. 2110 CudaNdarray_inplace_add(PyObject* py_self, PyObject * py_other)
  2111. 2111 {
  2112. 2112 if (CudaNdarray_inplace_elemwise(py_self, py_other, IADD))
  2113. 2113 {
  2114. 2114 return NULL;
  2115. 2115 }
  2116. 2116 Py_INCREF(py_self);
  2117. 2117 return py_self;
  2118. 2118 }
  2119. 2119
  2120. 2120 /*
  2121. 2121 * We need this inplace div for cuda/tests/test_basic_ops.py:test_shared_options
  2122. 2122 * It returns py_self on success with an additional reference. Else NULL.
  2123. 2123 */
  2124. 2124 // Will be called by __idiv__ in Python
  2125. 2125 static PyObject *
  2126. 2126 CudaNdarray_inplace_div(PyObject* py_self, PyObject * py_other)
  2127. 2127 {
  2128. 2128 if (CudaNdarray_inplace_elemwise(py_self, py_other, IDIV))
  2129. 2129 {
  2130. 2130 return NULL;
  2131. 2131 }
  2132. 2132 Py_INCREF(py_self);
  2133. 2133 return py_self;
  2134. 2134 }
  2135. 2135
  2136. 2136 // The PyNumberMethods struct layout changed in a non-trivial way from 2 to 3.
  2137. 2137 #if PY_MAJOR_VERSION == 3
  2138. 2138 static PyNumberMethods CudaNdarrayNumberMethods =
  2139. 2139 {
  2140. 2140 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
  2141. 2141 0, //binaryfunc nb_subtract;
  2142. 2142 0, //binaryfunc nb_multiply;
  2143. 2143 0, //binaryfunc nb_remainder;
  2144. 2144 0, //binaryfunc nb_divmod;
  2145. 2145 0, //ternaryfunc nb_power;
  2146. 2146 0, //unaryfunc nb_negative;
  2147. 2147 0, //unaryfunc nb_positive;
  2148. 2148 0, //unaryfunc nb_absolute;
  2149. 2149 0, //inquiry nb_bool;
  2150. 2150 0, //unaryfunc nb_invert;
  2151. 2151 0, //binaryfunc nb_lshift;
  2152. 2152 0, //binaryfunc nb_rshift;
  2153. 2153 0, //binaryfunc nb_and;
  2154. 2154 0, //binaryfunc nb_xor;
  2155. 2155 0, //binaryfunc nb_or;
  2156. 2156 0, //unaryfunc nb_int;
  2157. 2157 0, //void *nb_reserved;
  2158. 2158 0, //unaryfunc nb_float;
  2159. 2159
  2160. 2160 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
  2161. 2161 0, //binaryfunc nb_inplace_subtract;
  2162. 2162 0, //binaryfunc nb_inplace_multiply;
  2163. 2163 0, //binaryfunc nb_inplace_remainder;
  2164. 2164 0, //ternaryfunc nb_inplace_power;
  2165. 2165 0, //binaryfunc nb_inplace_lshift;
  2166. 2166 0, //binaryfunc nb_inplace_rshift;
  2167. 2167 0, //binaryfunc nb_inplace_and;
  2168. 2168 0, //binaryfunc nb_inplace_xor;
  2169. 2169 0, //binaryfunc nb_inplace_or;
  2170. 2170
  2171. 2171 0, //binaryfunc nb_floor_divide;
  2172. 2172 0, //binaryfunc nb_true_divide;
  2173. 2173 0, //binaryfunc nb_inplace_floor_divide;
  2174. 2174 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_true_divide; __idiv__
  2175. 2175
  2176. 2176 0, //unaryfunc nb_index
  2177. 2177 };
  2178. 2178 #else
  2179. 2179 static PyNumberMethods CudaNdarrayNumberMethods =
  2180. 2180 {
  2181. 2181 (binaryfunc)CudaNdarray_add, //binaryfunc nb_add; __add__
  2182. 2182 0, //binaryfunc nb_subtract; __sub__
  2183. 2183 0, //binaryfunc nb_multiply; __mul__
  2184. 2184 0, //binaryfunc nb_divide; __div__
  2185. 2185 0, //binaryfunc nb_remainder; __mod__
  2186. 2186 0, //binaryfunc nb_divmod; __divmod__
  2187. 2187 0, //ternaryfunc nb_power; __pow__
  2188. 2188 0, //unaryfunc nb_negative; __neg__
  2189. 2189 0, //unaryfunc nb_positive; __pos__
  2190. 2190 0, //unaryfunc nb_absolute; __abs__
  2191. 2191 0, //inquiry nb_nonzero; __nonzero__ /* Used by PyObject_IsTrue */
  2192. 2192 0, //unaryfunc nb_invert; __invert__
  2193. 2193 0, //binaryfunc nb_lshift; __lshift__
  2194. 2194 0, //binaryfunc nb_rshift; __rshift__
  2195. 2195 0, //binaryfunc nb_and; __and__
  2196. 2196 0, //binaryfunc nb_xor; __xor__
  2197. 2197 0, //binaryfunc nb_or; __or__
  2198. 2198 0, //coercion nb_coerce; __coerce__ /* Used by the coerce() function */
  2199. 2199 0, //unaryfunc nb_int; __int__
  2200. 2200 0, //unaryfunc nb_long; __long__
  2201. 2201 0, //unaryfunc nb_float; __float__
  2202. 2202 0, //unaryfunc nb_oct; __oct__
  2203. 2203 0, //unaryfunc nb_hex; __hex__
  2204. 2204
  2205. 2205 /* Added in release 2.0 */
  2206. 2206 (binaryfunc)CudaNdarray_inplace_add, //binaryfunc nb_inplace_add; __iadd__
  2207. 2207 0, //binaryfunc nb_inplace_subtract; __isub__
  2208. 2208 0, //binaryfunc nb_inplace_multiply; __imul__
  2209. 2209 (binaryfunc)CudaNdarray_inplace_div, //binaryfunc nb_inplace_divide; __idiv__
  2210. 2210 0, //binaryfunc nb_inplace_remainder; __imod__
  2211. 2211 0, //ternaryfunc nb_inplace_power; __ipow__
  2212. 2212 0, //binaryfunc nb_inplace_lshift; __ilshift__
  2213. 2213 0, //binaryfunc nb_inplace_rshift; __irshift__
  2214. 2214 0, //binaryfunc nb_inplace_and; __iand__
  2215. 2215 0, //binaryfunc nb_inplace_xor; __ixor__
  2216. 2216 0, //binaryfunc nb_inplace_or; __ior__
  2217. 2217
  2218. 2218 /* Added in release 2.2 */
  2219. 2219 0, //binaryfunc nb_floor_divide; __floordiv__
  2220. 2220 0, //binaryfunc nb_true_divide; __truediv__
  2221. 2221 0, //binaryfunc nb_inplace_floor_divide; __ifloordiv__
  2222. 2222 0, //binaryfunc nb_inplace_true_divide; __itruediv__
  2223. 2223
  2224. 2224 #if PY_MINOR_VERSION > 4
  2225. 2225 /* Added in release 2.5 */
  2226. 2226 0 //unaryfunc nb_index; __index__
  2227. 2227 #endif
  2228. 2228 };
  2229. 2229 #endif
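// Illustrative sketch (not part of the module): the number-protocol slots above
// are what the interpreter dispatches to for arithmetic on CudaNdarray objects.
// From C, the same paths can be exercised through the abstract number API,
// e.g. with a and b being CudaNdarrays of matching shape (error handling
// omitted):
//
//     PyObject* s = PyNumber_Add(a, b);        // -> CudaNdarray_add
//     PyObject* t = PyNumber_InPlaceAdd(a, b); // -> CudaNdarray_inplace_add
//
// The in-place divide is wired analogously: nb_inplace_divide on Python 2,
// nb_inplace_true_divide on Python 3, both pointing to CudaNdarray_inplace_div.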
  2230. 2230
  2231. 2231
  2232. 2232 /////////////////////
  2233. 2233 // Mapping protocol
  2234. 2234 /////////////////////
  2235. 2235
2236. 2236 // Will be called by __len__ in Python
  2237. 2237 static Py_ssize_t
  2238. 2238 CudaNdarray_len(PyObject * py_self)
  2239. 2239 {
  2240. 2240 CudaNdarray * self = (CudaNdarray*) py_self;
  2241. 2241 if (self->nd <= 0)
  2242. 2242 {
  2243. 2243 return (Py_ssize_t) 0;
  2244. 2244 }
  2245. 2245 else
  2246. 2246 {
  2247. 2247 return (Py_ssize_t) CudaNdarray_HOST_DIMS(self)[0];
  2248. 2248 }
  2249. 2249 }
  2250. 2250
2251. 2251 // Will be called by __getitem__ in Python
  2252. 2252 PyObject *
  2253. 2253 CudaNdarray_Subscript(PyObject * py_self, PyObject * key)
  2254. 2254 {
  2255. 2255 int verbose = 0;
  2256. 2256 if (verbose) fprintf(stderr, "Subscript .... \n");
  2257. 2257 CudaNdarray * self = (CudaNdarray*) py_self;
  2258. 2258 PyObject * py_rval = NULL;
  2259. 2259 CudaNdarray * rval = NULL;
  2260. 2260 PyObject * intobj = NULL;
  2261. 2261
  2262. 2262 //PyObject_Print(key, stderr, 0);
  2263. 2263
  2264. 2264 if (key == Py_Ellipsis)
  2265. 2265 {
  2266. 2266 Py_INCREF(py_self);
  2267. 2267 return py_self;
  2268. 2268 }
  2269. 2269 if ((intobj=PyNumber_Int(key))) //INDEXING BY INTEGER
  2270. 2270 //else if (PyInt_Check(key)) //INDEXING BY INTEGER
  2271. 2271 {
  2272. 2272 int d_idx = PyInt_AsLong(intobj);
  2273. 2273 Py_DECREF(intobj); intobj=NULL;
  2274. 2274 //int d_idx = PyInt_AsLong(key);
  2275. 2275 if (self->nd == 0)
  2276. 2276 {
  2277. 2277 PyErr_SetString(PyExc_IndexError, "0-d arrays can't be indexed");
  2278. 2278 return NULL;
  2279. 2279 }
  2280. 2280 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
  2281. 2281 int offset = 0;
  2282. 2282
  2283. 2283 if ((d_idx >= 0) && (d_idx < d_dim))
  2284. 2284 {
  2285. 2285 //normal indexing
  2286. 2286 offset += d_idx * CudaNdarray_HOST_STRIDES(self)[0];
  2287. 2287 }
  2288. 2288 else if ((d_idx < 0) && (d_idx >= -d_dim))
  2289. 2289 {
  2290. 2290 //end-based indexing
  2291. 2291 // d_idx is negative
  2292. 2292 offset += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[0];
  2293. 2293 }
  2294. 2294 else
  2295. 2295 {
  2296. 2296 PyErr_Format(PyExc_IndexError,
  2297. 2297 "index out of bounds. Asked %d, but size of %d",
  2298. 2298 d_idx, d_dim);
  2299. 2299 return NULL;
  2300. 2300 }
  2301. 2301
  2302. 2302 //allocate our subtensor view
  2303. 2303 py_rval = CudaNdarray_new_nd(self->nd - 1);
  2304. 2304 rval = (CudaNdarray*) py_rval;
  2305. 2305 if (!rval) return NULL;
  2306. 2306 assert (0 == rval->data_allocated);
  2307. 2307
  2308. 2308 //initialize the view's data pointer to our own.
  2309. 2309 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self) + offset, self))
  2310. 2310 {
  2311. 2311 Py_DECREF(rval);
  2312. 2312 return NULL;
  2313. 2313 }
  2314. 2314 for (int d = 1; d < self->nd; ++d)
  2315. 2315 {
  2316. 2316 CudaNdarray_set_stride(rval, d-1, CudaNdarray_HOST_STRIDES(self)[d]);
  2317. 2317 CudaNdarray_set_dim(rval, d-1, CudaNdarray_HOST_DIMS(self)[d]);
  2318. 2318 }
  2319. 2319 }
  2320. 2320 else
  2321. 2321 {
  2322. 2322 PyErr_Clear();
  2323. 2323 }
  2324. 2324 if (PySlice_Check(key)) //INDEXING BY SLICE
  2325. 2325 {
  2326. 2326 if (verbose) fprintf(stderr, "by slice\n");
  2327. 2327 if (self->nd == 0)
  2328. 2328 {
  2329. 2329 PyErr_SetString(PyExc_ValueError, "cannot slice a 0-d array");
  2330. 2330 return NULL;
  2331. 2331 }
  2332. 2332
  2333. 2333 int d_dim = CudaNdarray_HOST_DIMS(self)[0];
  2334. 2334 Py_ssize_t start, stop, step, slen;
  2335. 2335 if (PySlice_GetIndicesEx(SLICE_CAST(key), d_dim, &start, &stop, &step, &slen))
  2336. 2336 {
  2337. 2337 if (verbose)
  2338. 2338 fprintf(stderr, "PySlice_GetIndicesEx failed\n");
  2339. 2339 return NULL;
  2340. 2340 }
  2341. 2341 if (verbose)
  2342. 2342 {
  2343. 2343 std::cerr << "start " << start << "\n";
  2344. 2344 std::cerr << "stop " << stop << "\n";
  2345. 2345 std::cerr << "step " << step << "\n";
  2346. 2346 std::cerr << "slen " << slen << "\n";
  2347. 2347 }
  2348. 2348
  2349. 2349 //allocate our subtensor view
  2350. 2350 py_rval = CudaNdarray_new_nd(self->nd);
  2351. 2351 rval = (CudaNdarray*) py_rval;
  2352. 2352 if (!rval) return NULL;
  2353. 2353 assert (0 == rval->data_allocated);
  2354. 2354
  2355. 2355
  2356. 2356 //initialize the view's data pointer to our own.
  2357. 2357 if (CudaNdarray_set_device_data(rval,
  2358. 2358 CudaNdarray_DEV_DATA(self) + start * CudaNdarray_HOST_STRIDES(self)[0],
  2359. 2359 self))
  2360. 2360 {
  2361. 2361 Py_DECREF(rval);
  2362. 2362 return NULL;
  2363. 2363 }
  2364. 2364 //initialize dimension 0 of rval
  2365. 2365 CudaNdarray_set_stride(rval, 0,
  2366. 2366 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[0]);
  2367. 2367 CudaNdarray_set_dim(rval, 0, slen);
  2368. 2368 if (verbose) std::cerr << "rval stride " << CudaNdarray_HOST_STRIDES(rval)[0] << "\n";
  2369. 2369 // initialize dimensions > 0 of rval
  2370. 2370 for (int d = 1; d < self->nd; ++d)
  2371. 2371 {
  2372. 2372 CudaNdarray_set_stride(rval, d, CudaNdarray_HOST_STRIDES(self)[d]);
  2373. 2373 CudaNdarray_set_dim(rval, d, CudaNdarray_HOST_DIMS(self)[d]);
  2374. 2374 }
  2375. 2375 }
  2376. 2376 if (PyTuple_Check(key)) //INDEXING BY TUPLE
  2377. 2377 {
  2378. 2378 if (verbose) fprintf(stderr, "by tuple\n");
  2379. 2379 //elements of the tuple can be either integers or slices
  2380. 2380 //the dimensionality of the view we will return is diminished for each slice in the tuple
  2381. 2381
  2382. 2382 if (PyTuple_Size(key) > self->nd)
  2383. 2383 {
  2384. 2384 PyErr_SetString(PyExc_IndexError, "index error");
  2385. 2385 return NULL;
  2386. 2386 }
  2387. 2387
  2388. 2388 //calculate the number of dimensions in the return value
  2389. 2389 int rval_nd = self->nd;
  2390. 2390 for (int d = 0; d < PyTuple_Size(key); ++d)
  2391. 2391 {
2392. 2392 //On some platforms PyInt_Check(<type 'numpy.int64'>) returns true, on others it returns false.
2393. 2393 //So we use PyArray_IsAnyScalar, which should cover everything.
  2394. 2394 rval_nd -= PyArray_IsAnyScalar(PyTuple_GetItem(key, d));
  2395. 2395 }
  2396. 2396
  2397. 2397 //allocate our subtensor view
  2398. 2398 py_rval = CudaNdarray_new_nd(rval_nd);
  2399. 2399 rval = (CudaNdarray*) py_rval;
  2400. 2400 if (!rval) return NULL;
  2401. 2401 assert (0 == rval->data_allocated);
  2402. 2402
  2403. 2403 //initialize the view's data pointer to our own.
  2404. 2404 if (CudaNdarray_set_device_data(rval, CudaNdarray_DEV_DATA(self), self))
  2405. 2405 {
  2406. 2406 Py_DECREF(rval);
  2407. 2407 return NULL;
  2408. 2408 }
  2409. 2409
  2410. 2410 // rval_d will refer to the current dimension in the rval.
  2411. 2411 // It will not be incremented for integer keys, but will be incremented for slice
  2412. 2412 // keys
  2413. 2413 int rval_d = 0;
  2414. 2414
  2415. 2415 for (int d = 0; d < self->nd; ++d)
  2416. 2416 {
  2417. 2417 // keys can be shorter than self->nd.
  2418. 2418 // when that happens, it means that the remaining dimensions are "full slices"
  2419. 2419 if (d >=PyTuple_Size(key))
  2420. 2420 {
  2421. 2421 CudaNdarray_set_stride(rval, rval_d, CudaNdarray_HOST_STRIDES(self)[d]);
  2422. 2422 CudaNdarray_set_dim(rval, rval_d, CudaNdarray_HOST_DIMS(self)[d]);
  2423. 2423 ++rval_d;
  2424. 2424 }
  2425. 2425 else
  2426. 2426 {
  2427. 2427 PyObject * key_d = PyTuple_GetItem(key, d);
  2428. 2428
  2429. 2429 if (PySlice_Check(key_d))
  2430. 2430 {
  2431. 2431 Py_ssize_t start, stop, step, slen;
  2432. 2432 if (PySlice_GetIndicesEx(SLICE_CAST(key_d), CudaNdarray_HOST_DIMS(self)[d], &start, &stop, &step, &slen))
  2433. 2433 {
  2434. 2434 Py_DECREF(rval);
  2435. 2435 return NULL;
  2436. 2436 }
  2437. 2437 rval->devdata += start * CudaNdarray_HOST_STRIDES(self)[d];
  2438. 2438 CudaNdarray_set_stride(rval, rval_d,
  2439. 2439 (slen == 1) ? 0 : step * CudaNdarray_HOST_STRIDES(self)[d]);
  2440. 2440 CudaNdarray_set_dim(rval, rval_d, slen);
  2441. 2441 if (0)
  2442. 2442 {
  2443. 2443 std::cerr << "start " << start << "\n";
  2444. 2444 std::cerr << "stop " << stop << "\n";
  2445. 2445 std::cerr << "step " << step << "\n";
  2446. 2446 std::cerr << "slen " << slen << "\n";
  2447. 2447 }
  2448. 2448 ++rval_d;
  2449. 2449 }
  2450. 2450 else if ((intobj=PyNumber_Int(key_d)))
  2451. 2451 {
  2452. 2452 assert(PyArray_IsAnyScalar(key_d));
  2453. 2453 int d_idx = PyInt_AsLong(intobj);
  2454. 2454 Py_DECREF(intobj);
  2455. 2455 intobj = NULL;
  2456. 2456 int d_dim = CudaNdarray_HOST_DIMS(self)[d];
  2457. 2457
  2458. 2458 if ((d_idx >= 0) && (d_idx < d_dim))
  2459. 2459 {
  2460. 2460 //normal indexing
  2461. 2461 rval->devdata += d_idx * CudaNdarray_HOST_STRIDES(self)[d];
  2462. 2462 }
  2463. 2463 else if ((d_idx < 0) && (d_idx >= -d_dim))
  2464. 2464 {
  2465. 2465 //end-based indexing
  2466. 2466 rval->devdata += (d_dim + d_idx) * CudaNdarray_HOST_STRIDES(self)[d];
  2467. 2467 }
  2468. 2468 else
  2469. 2469 {
  2470. 2470 PyErr_Format(PyExc_IndexError,
  2471. 2471 "index out of bounds. Asked %d for dimensions %d, but size of %d",
  2472. 2472 d_idx, d, d_dim);
  2473. 2473 Py_DECREF(rval);
  2474. 2474 return NULL;
  2475. 2475 }
  2476. 2476 }
  2477. 2477 else
  2478. 2478 {
  2479. 2479 PyErr_Clear(); // clear the error set by PyNumber_Int
  2480. 2480 PyErr_SetString(PyExc_IndexError, "index must be either int or slice");
  2481. 2481 Py_DECREF(rval);
  2482. 2482 return NULL;
  2483. 2483 }
  2484. 2484 }
  2485. 2485 }
  2486. 2486 }
  2487. 2487 if (py_rval)
  2488. 2488 {
  2489. 2489 if (verbose) fprint_CudaNdarray(stderr, self);
  2490. 2490 if (verbose) fprint_CudaNdarray(stderr, rval);
  2491. 2491 }
  2492. 2492 else
  2493. 2493 {
  2494. 2494 PyErr_SetString(PyExc_NotImplementedError, "Unknown key type");
  2495. 2495 return NULL;
  2496. 2496 }
  2497. 2497 return py_rval;
  2498. 2498 }
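// Worked example (illustrative only): for the view a[1, 0:4:2] of a CudaNdarray
// with dims {3, 8} and element strides {8, 1}, the tuple branch above computes
//     rval_nd        = 2 - 1 = 1              (each integer key removes a dim)
//     devdata offset = 1 * 8                  (integer index on axis 0)
//     rval dim[0]    = 2, stride[0] = 2 * 1   (slice 0:4:2 on axis 1, slen == 2)
// so the result is a length-2 view that shares the parent's device storage.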
  2499. 2499
2500. 2500 // Will be called by __setitem__ in Python
2501. 2501 // See http://docs.python.org/dev/py3k/c-api/object.html#PyObject_SetItem
2502. 2502 // Doesn't handle broadcasting, e.g. a[:] = 5
2503. 2503 // The right-hand side can only be a CudaNdarray,
2504. 2504 // an ndarray,
2505. 2505 // or a python scalar with value 0 when the left-hand side is c contiguous.
  2506. 2506 static int
  2507. 2507 CudaNdarray_setitem(PyObject *o, PyObject *key, PyObject *value)
  2508. 2508 {
  2509. 2509 int verbose = 0;
  2510. 2510 if (verbose) fprintf(stderr, "CudaNdarray_setitem start\n");
  2511. 2511 // We try to copy directly into this CudaNdarray from the ndarray
  2512. 2512 CudaNdarray* rval = (CudaNdarray*)CudaNdarray_Subscript(o, key);
  2513. 2513 CudaNdarray* new_value = NULL;
  2514. 2514
  2515. 2515 if(!rval){
  2516. 2516 // CudaNdarray_Subscript failed and set the error msg.
  2517. 2517 Py_XDECREF(rval);
  2518. 2518 return -1;
  2519. 2519 }
  2520. 2520
  2521. 2521 if(rval != (CudaNdarray*)o &&
  2522. 2522 (rval->data_allocated ||
  2523. 2523 // The new array should have a base
  2524. 2524 !(((CudaNdarray*)rval)->base) ||
  2525. 2525 // If the original array has no base, the base of the new
  2526. 2526 // array should be the original one
  2527. 2527 (!((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != o) ||
  2528. 2528 // Else, the two arrays should have the same base
  2529. 2529 (((CudaNdarray*)o)->base && ((CudaNdarray*)rval)->base != ((CudaNdarray*)o)->base)))
  2530. 2530 {
  2531. 2531 // This case shouldn't happen, based on what I see in Subscript
  2532. 2532 // but just in case it happens sometime in the future
  2533. 2533
  2534. 2534 PyErr_Format(PyExc_RuntimeError,
  2535. 2535 "__getitem__ must return a CudaNdarray that refers to"
  2536. 2536 " the original CudaNdarray, not a copy. rval.base=%p"
  2537. 2537 " o.base=%p o=%p",
  2538. 2538 (((CudaNdarray*)rval)->base), ((CudaNdarray*)o)->base, o);
  2539. 2539 Py_DECREF(rval);
  2540. 2540 return -1;
  2541. 2541 }
  2542. 2542
  2543. 2543 PyObject * intobj = NULL;
  2544. 2544 if (CudaNdarray_Check(o) && PyArray_Check(value)){
  2545. 2545 if (verbose)
  2546. 2546 fprintf(stderr,
  2547. 2547 "CudaNdarray_setitem dest is a CudaNdarray and"
  2548. 2548 " value is a ndarray\n");
  2549. 2549 new_value = (CudaNdarray*) CudaNdarray_New();
  2550. 2550 if(!new_value)
  2551. 2551 {
  2552. 2552 return -1;
  2553. 2553 }
  2554. 2554 if (CudaNdarray_CopyFromArray(new_value, (PyArrayObject *) value))
  2555. 2555 {
  2556. 2556 Py_XDECREF(new_value);
  2557. 2557 Py_XDECREF(rval);
  2558. 2558 return -1;
  2559. 2559 }
  2560. 2560 value = (PyObject *) new_value;
  2561. 2561 }
  2562. 2562 else if ((intobj=PyNumber_Int(value)))
  2563. 2563 {
  2564. 2564 if (verbose)
  2565. 2565 fprintf(stderr,
  2566. 2566 "CudaNdarray_setitem dest and value is a python number\n");
  2567. 2567 if(! CudaNdarray_is_c_contiguous(rval)){
  2568. 2568 PyErr_SetString(PyExc_NotImplementedError,
  2569. 2569 "CudaNdarray.__setitem__: When the new value is a scalar"
  2570. 2570 " of value 0 the part where we copy to must be c contiguous.");
  2571. 2571 Py_XDECREF(rval);
  2572. 2572 return -1;
  2573. 2573 }
  2574. 2574
  2575. 2575 long val = PyInt_AsLong(intobj);
  2576. 2576 Py_DECREF(intobj); intobj=NULL;
  2577. 2577 if (val == 0)
  2578. 2578 {
  2579. 2579 cudaError_t err = cudaMemset(rval->devdata, 0,
  2580. 2580 CudaNdarray_SIZE(rval) * sizeof(real));
  2581. 2581 Py_XDECREF(rval);
  2582. 2582 if (err)
  2583. 2583 {
  2584. 2584 // Clear the error flag, cudaMemset doesn't do it.
  2585. 2585 // Currently this returns the same thing as err, but if in future
  2586. 2586 // it returns something else I still don't see why we should ignore
  2587. 2587 // it. All we want to do here is reset the flag.
  2588. 2588 cudaGetLastError();
  2589. 2589 PyErr_SetString(PyExc_RuntimeError,
  2590. 2590 "CudaNdarray.__setitem__: cudaMemset failed");
  2591. 2591 return -1;
  2592. 2592 }
  2593. 2593 return 0;
  2594. 2594 } else {
  2595. 2595 Py_XDECREF(rval);
  2596. 2596 PyErr_SetString(PyExc_NotImplementedError,
  2597. 2597 "CudaNdarray.__setitem__: we support setting only python"
  2598. 2598 " scalar of value 0, numpy nd array and CudaNdarray.");
  2599. 2599 return -1;
  2600. 2600 }
  2601. 2601 }
  2602. 2602
  2603. 2603 PyErr_Clear(); // clear PyNumber_Int error.
  2604. 2604
  2605. 2605 if(!CudaNdarray_Check(o) || !CudaNdarray_Check(value))
  2606. 2606 {
  2607. 2607 PyErr_SetString(PyExc_TypeError,
  2608. 2608 "CudaNdarray.__setitem__: left must be a CudaNdarrays and right"
  2609. 2609 " must be a CudaNdarrays, an ndarray or a python scalar of value 0.");
  2610. 2610 Py_XDECREF(new_value);
  2611. 2611 return -1;
  2612. 2612 }
  2613. 2613
  2614. 2614 if (verbose)
  2615. 2615 fprintf(stderr, "CudaNdarray_setitem dest and value are CudaNdarray\n");
  2616. 2616
  2617. 2617 if (cnda_copy_structure_to_device(rval))
  2618. 2618 {
  2619. 2619 PyErr_SetString(PyExc_RuntimeError,
  2620. 2620 "CudaNdarray.__setitem__: syncing structure to device failed");
  2621. 2621 Py_DECREF(rval);
  2622. 2622 Py_XDECREF(new_value);
  2623. 2623
  2624. 2624 if (verbose)
  2625. 2625 fprintf(stderr, "CudaNdarray_setitem error end\n");
  2626. 2626 return -1;
  2627. 2627 }
  2628. 2628
  2629. 2629 PyObject *baseSavedForComparison = rval->base;
  2630. 2630
  2631. 2631 if (CudaNdarray_CopyFromCudaNdarray(rval, (CudaNdarray*)value, true))
  2632. 2632 {
  2633. 2633 Py_DECREF((PyObject*)rval);
  2634. 2634 Py_XDECREF(new_value);
  2635. 2635
  2636. 2636 if (verbose)
  2637. 2637 fprintf(stderr, "CudaNdarray_setitem error end\n");
  2638. 2638 return -1;
  2639. 2639 }
  2640. 2640
  2641. 2641 assert (rval->base == baseSavedForComparison);
  2642. 2642 assert (rval->dev_structure_fresh);
  2643. 2643
  2644. 2644 // Clean up locally-created references
  2645. 2645 Py_DECREF(rval);
  2646. 2646 Py_XDECREF(new_value);
  2647. 2647
  2648. 2648 return 0;
  2649. 2649 }
  2650. 2650
  2651. 2651
  2652. 2652 PyMappingMethods CudaNdarrayMappingMethods = {
  2653. 2653 CudaNdarray_len, //lenfunc mp_length; __len__
  2654. 2654 CudaNdarray_Subscript, //binaryfunc mp_subscript; __getitem__
  2655. 2655 CudaNdarray_setitem //objobjargproc mp_ass_subscript; __setitem__
  2656. 2656 };
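// Illustrative sketch (not part of the module): the mapping slots above are
// reached from C through the abstract object API, e.g.
//     Py_ssize_t n   = PyObject_Size(a);            // -> CudaNdarray_len
//     PyObject*  row = PyObject_GetItem(a, key);    // -> CudaNdarray_Subscript
//     int        err = PyObject_SetItem(a, key, v); // -> CudaNdarray_setitem
// where a is a CudaNdarray and key is an int, a slice or a tuple, as handled above.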
  2657. 2657
  2658. 2658 ////////////////////
  2659. 2659 //
  2660. 2660 ////////////////////
  2661. 2661
  2662. 2662 static PyObject *
  2663. 2663 CudaNdarray_get_shape(CudaNdarray *self, void *closure)
  2664. 2664 {
  2665. 2665 if (self->nd < 0)
  2666. 2666 {
  2667. 2667 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
  2668. 2668 return NULL;
  2669. 2669 }
  2670. 2670 PyObject * rval = PyTuple_New(self->nd);
  2671. 2671 for (int i = 0; i < self->nd; ++i)
  2672. 2672 {
  2673. 2673 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_DIMS(self)[i])))
  2674. 2674 {
  2675. 2675 Py_XDECREF(rval);
  2676. 2676 return NULL;
  2677. 2677 }
  2678. 2678
  2679. 2679 }
  2680. 2680 return rval;
  2681. 2681 }
  2682. 2682
  2683. 2683 static int
  2684. 2684 CudaNdarray_set_shape(CudaNdarray *self, PyObject *value, void *closure)
  2685. 2685 {
  2686. 2686 PyErr_SetString(PyExc_NotImplementedError, "TODO: call reshape");
  2687. 2687 return -1;
  2688. 2688 }
  2689. 2689
  2690. 2690 static PyObject *
  2691. 2691 CudaNdarray_get_strides(CudaNdarray *self, void *closure)
  2692. 2692 {
  2693. 2693 if (self->nd < 0)
  2694. 2694 {
  2695. 2695 PyErr_SetString(PyExc_ValueError, "CudaNdarray not initialized");
  2696. 2696 return NULL;
  2697. 2697 }
  2698. 2698 PyObject * rval = PyTuple_New(self->nd);
  2699. 2699 for (int i = 0; i < self->nd; ++i)
  2700. 2700 {
  2701. 2701 if (!rval || PyTuple_SetItem(rval, i, PyInt_FromLong(CudaNdarray_HOST_STRIDES(self)[i])))
  2702. 2702 {
  2703. 2703 Py_XDECREF(rval);
  2704. 2704 return NULL;
  2705. 2705 }
  2706. 2706
  2707. 2707 }
  2708. 2708 return rval;
  2709. 2709 }
  2710. 2710
  2711. 2711 static int
  2712. 2712 CudaNdarray_set_strides(CudaNdarray *self, PyObject *value, void *closure)
  2713. 2713 {
  2714. 2714 //npy_intp newstrides_bytes[PyTuple_Size(value)];
  2715. 2715 if (PyTuple_Check(value)){
  2716. 2716 if (PyTuple_Size(value) != CudaNdarray_NDIM(self)){
  2717. 2717 PyErr_SetString(PyExc_ValueError,
  2718. 2718 "The new strides tuple must have the same length"
  2719. 2719 " as the number of dimensions");
  2720. 2720 return -1;
  2721. 2721 }
  2722. 2722 }else if (PyList_Check(value)){
  2723. 2723 if (PyList_Size(value) != CudaNdarray_NDIM(self)){
  2724. 2724 PyErr_SetString(PyExc_ValueError,
  2725. 2725 "The new strides list must have the same length"
  2726. 2726 " as the number of dimensions");
  2727. 2727 return -1;
  2728. 2728 }
  2729. 2729 }else{
  2730. 2730 PyErr_SetString(PyExc_ValueError,
  2731. 2731 "The new strides need to be encoded in a tuple or list");
  2732. 2732 return -1;
  2733. 2733 }
  2734. 2734 npy_intp* newstrides = (npy_intp*) alloca(CudaNdarray_NDIM(self) * sizeof(npy_intp));
  2735. 2735 if (PyTuple_Check(value)){
  2736. 2736 for(int i=0; i < CudaNdarray_NDIM(self); i++){
  2737. 2737 newstrides[i] = PyInt_AsLong(PyTuple_GetItem(value, Py_ssize_t(i)));
  2738. 2738 //newstrides_bytes[i] = newstrides[i] * 4;
  2739. 2739 }
  2740. 2740 }else if (PyList_Check(value)){
  2741. 2741 for(int i=0; i < CudaNdarray_NDIM(self); i++){
  2742. 2742 newstrides[i] = PyInt_AsLong(PyList_GetItem(value, Py_ssize_t(i)));
  2743. 2743 //newstrides_bytes[i] = newstrides[i] * 4;
  2744. 2744 }
  2745. 2745 }
  2746. 2746 /*
  2747. 2747 // Do not do this check, as ExtractDiag needs that, and NumPy does not seem
  2748. 2748 // to do it.
  2749. 2749 npy_intp dims[PyTuple_Size(value)];
  2750. 2750 for(int i=0; i < CudaNdarray_NDIM(self); i++){
  2751. 2751 dims[i] = CudaNdarray_HOST_DIMS(self)[i];
  2752. 2752 }
  2753. 2753 if (!PyArray_CheckStrides(4,
  2754. 2754 CudaNdarray_NDIM(self),
  2755. 2755 0, 0,
  2756. 2756 dims,
  2757. 2757 newstrides_bytes)){
  2758. 2758 PyErr_SetString(PyExc_ValueError, "bad new strides");
  2759. 2759 return -1;
  2760. 2760 }
  2761. 2761 */
  2762. 2762 for(int i=0; i < CudaNdarray_NDIM(self); i++){
  2763. 2763 CudaNdarray_set_stride(self, i, newstrides[i]);
  2764. 2764 }
  2765. 2765 return 0;
  2766. 2766 }
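// Illustrative note (not part of the module): strides here are counted in
// float elements, not bytes.  As a worked example of a "non-standard" stride,
// the main diagonal of a c-contiguous (n, n) matrix can be exposed as a 1-d
// view of length n with a single stride of n + 1, since consecutive diagonal
// elements sit n + 1 floats apart; supporting such views is why the
// PyArray_CheckStrides test above is left disabled (see the ExtractDiag note).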
  2767. 2767
  2768. 2768 static PyObject *
  2769. 2769 CudaNdarray_get_dev_data(CudaNdarray *self, void *closure)
  2770. 2770 {
  2771. 2771 float * p = CudaNdarray_DEV_DATA(self);
  2772. 2772 //printf("get_dev_data %p %li \n", p, (long int)p );
  2773. 2773 return PyInt_FromSize_t((size_t) CudaNdarray_DEV_DATA(self));
  2774. 2774 }
  2775. 2775
  2776. 2776 static int
  2777. 2777 CudaNdarray_set_dev_data(CudaNdarray *self, PyObject *value, void *closure)
  2778. 2778 {
  2779. 2779 Py_ssize_t newdevdata = PyInt_AsSsize_t(value);
  2780. 2780 //printf("set_dev_data %p %li \n",(float*)newdevdata ,newdevdata);
  2781. 2781 if (PyErr_Occurred())
  2782. 2782 {
  2783. 2783 return -1;
  2784. 2784 }
  2785. 2785 return CudaNdarray_set_device_data(self, (float*)newdevdata, (CudaNdarray*)self->base);
  2786. 2786 }
  2787. 2787
  2788. 2788 static PyObject *
  2789. 2789 CudaNdarray_get_dtype(CudaNdarray *self, void *closure)
  2790. 2790 {
  2791. 2791 return PyString_FromString("float32");
  2792. 2792 }
  2793. 2793
  2794. 2794 static PyObject *
  2795. 2795 CudaNdarray_get_ndim(CudaNdarray *self, void *closure)
  2796. 2796 {
  2797. 2797 return PyInt_FromLong(self->nd);
  2798. 2798 }
  2799. 2799
  2800. 2800 static PyObject *
  2801. 2801 CudaNdarray_get_base(CudaNdarray *self, void *closure)
  2802. 2802 {
  2803. 2803 PyObject * base = self->base;
  2804. 2804 if (!base)
  2805. 2805 {
  2806. 2806 // We cannot return a NULL pointer, use None instead
  2807. 2807 base = Py_None;
  2808. 2808 }
  2809. 2809 Py_INCREF(base);
  2810. 2810 return base;
  2811. 2811 }
  2812. 2812
  2813. 2813 void put_in_dict(PyObject * dict, const char * key, int val)
  2814. 2814 {
  2815. 2815 PyObject * k = PyString_FromString(key);
  2816. 2816 PyObject * v = PyInt_FromLong(val);
  2817. 2817 PyDict_SetItem(dict, k, v);
  2818. 2818 Py_DECREF(k);
  2819. 2819 Py_DECREF(v);
  2820. 2820 }
  2821. 2821
  2822. 2822 PyObject *
  2823. 2823 GetDeviceProperties(PyObject* _unused, PyObject* args)
  2824. 2824 {
  2825. 2825 int dev_id = -1;
  2826. 2826 if (! PyArg_ParseTuple(args, "i", &dev_id))
  2827. 2827 return NULL;
  2828. 2828 cudaDeviceProp deviceProp;
  2829. 2829 cudaGetDeviceProperties(&deviceProp, dev_id);
  2830. 2830
  2831. 2831 PyObject * dict = PyDict_New();
  2832. 2832 PyObject * str= PyString_FromString("name");
  2833. 2833 PyObject * i = PyString_FromString(deviceProp.name);
  2834. 2834 PyDict_SetItem(dict, str, i);
  2835. 2835 Py_DECREF(str);
  2836. 2836 Py_DECREF(i);
  2837. 2837
  2838. 2838 put_in_dict(dict, "major", deviceProp.major);
  2839. 2839 put_in_dict(dict, "minor", deviceProp.minor);
  2840. 2840 #if CUDART_VERSION >= 2020
  2841. 2841 int driverVersion = 0, runtimeVersion = 0;
  2842. 2842 cudaDriverGetVersion(&driverVersion);
  2843. 2843 cudaRuntimeGetVersion(&runtimeVersion);
  2844. 2844 put_in_dict(dict, "driverVersion", driverVersion);
  2845. 2845 put_in_dict(dict, "runtimeVersion", runtimeVersion);
  2846. 2846 #endif
  2847. 2847 #if CUDART_VERSION >= 2000
  2848. 2848
  2849. 2849 put_in_dict(dict, "multiProcessorCount", deviceProp.multiProcessorCount);
  2850. 2850 //if ConvertSMVer2Cores is not defined in cuda_runtime_api.h, the run time is too old.
  2851. 2851 int sm_cores = -1;
  2852. 2852 if(deviceProp.major==1)
  2853. 2853 sm_cores = 32;
  2854. 2854 else if(deviceProp.major==2 && deviceProp.minor==0)
  2855. 2855 sm_cores = 32;
  2856. 2856 else if(deviceProp.major==2 && deviceProp.minor==1)
  2857. 2857 sm_cores = 48;
  2858. 2858 put_in_dict(dict, "coresCount", sm_cores * deviceProp.multiProcessorCount);
  2859. 2859 #endif
  2860. 2860 put_in_dict(dict, "totalConstMem", deviceProp.totalConstMem);
  2861. 2861 put_in_dict(dict, "sharedMemPerBlock", deviceProp.sharedMemPerBlock);
  2862. 2862 put_in_dict(dict, "regsPerBlock", deviceProp.regsPerBlock);
  2863. 2863 put_in_dict(dict, "warpSize", deviceProp.warpSize);
  2864. 2864 put_in_dict(dict, "maxThreadsPerBlock", deviceProp.maxThreadsPerBlock);
  2865. 2865 put_in_dict(dict, "maxThreadsDim0", deviceProp.maxThreadsDim[0]);
  2866. 2866 put_in_dict(dict, "maxThreadsDim1", deviceProp.maxThreadsDim[1]);
  2867. 2867 put_in_dict(dict, "maxThreadsDim2", deviceProp.maxThreadsDim[2]);
  2868. 2868 put_in_dict(dict, "maxGridSize0", deviceProp.maxGridSize[0]);
  2869. 2869 put_in_dict(dict, "maxGridSize1", deviceProp.maxGridSize[1]);
  2870. 2870 put_in_dict(dict, "maxGridSize2", deviceProp.maxGridSize[2]);
  2871. 2871 put_in_dict(dict, "memPitch", deviceProp.memPitch);
  2872. 2872 put_in_dict(dict, "textureAlignment", deviceProp.textureAlignment);
  2873. 2873 put_in_dict(dict, "clockRate", deviceProp.clockRate);
  2874. 2874 #if CUDART_VERSION >= 2000
  2875. 2875 put_in_dict(dict, "deviceOverlap", deviceProp.deviceOverlap);
  2876. 2876 #endif
  2877. 2877 #if CUDART_VERSION >= 2020
  2878. 2878 put_in_dict(dict, "kernelExecTimeoutEnabled", deviceProp.kernelExecTimeoutEnabled);
  2879. 2879 put_in_dict(dict, "integrated", deviceProp.integrated);
  2880. 2880 put_in_dict(dict, "canMapHostMemory", deviceProp.canMapHostMemory);
  2881. 2881 put_in_dict(dict, "computeMode", deviceProp.computeMode);
2882. 2882 //The CUDA documentation for computeMode says: 0 - Normal mode, 1 - only 1 context, 2 - no context
  2883. 2883 #endif
  2884. 2884 #if CUDART_VERSION >= 3000
  2885. 2885 put_in_dict(dict, "concurrentKernels", deviceProp.concurrentKernels);
  2886. 2886 #endif
  2887. 2887 #if CUDART_VERSION >= 3010
  2888. 2888 put_in_dict(dict, "ECCEnabled", deviceProp.ECCEnabled);
  2889. 2889 #endif
  2890. 2890 #if CUDART_VERSION >= 3020
  2891. 2891 put_in_dict(dict, "tccDriver", deviceProp.tccDriver);
  2892. 2892 #endif
  2893. 2893
  2894. 2894 return dict;
  2895. 2895 }
  2896. 2896
  2897. 2897 /*
  2898. 2898 * Returns in *free and *total respectively, the free and total amount of memory available for allocation by the device in bytes.
  2899. 2899 */
  2900. 2900 PyObject *
  2901. 2901 GetDeviceMemInfo(PyObject* _unused, PyObject* dummy)
  2902. 2902 {
  2903. 2903 size_t free = 0, total = 0;
  2904. 2904 if(g_gpu_context_active == 0){
  2905. 2905 PyErr_Format(PyExc_RuntimeError, "No gpu device selected yet. Please make sure the gpu device was initialized by Theano before.");
  2906. 2906 return NULL;
  2907. 2907 }
  2908. 2908
  2909. 2909 cudaError_t err = cudaMemGetInfo(&free, &total);
  2910. 2910 if (err != cudaSuccess){
  2911. 2911 // Clear the error flag, cudaMemGetInfo doesn't do it.
  2912. 2912 // Currently this returns the same thing as err, but if in future
  2913. 2913 // it returns something else I still don't see why we should ignore
  2914. 2914 // it. All we want to do here is reset the flag.
  2915. 2915 cudaGetLastError();
  2916. 2916 PyErr_Format(PyExc_RuntimeError,
  2917. 2917 "Error while getting memory info about the gpu: %s",
  2918. 2918 cudaGetErrorString(err));
  2919. 2919 return NULL;
  2920. 2920 }
  2921. 2921 return PyTuple_Pack(2, PyLong_FromLong(free), PyLong_FromLong(total));
  2922. 2922 }
  2923. 2923
  2924. 2924 /*
2925. 2925 * Synchronize with all the gpu device streams.
  2926. 2926 */
  2927. 2927 PyObject *
  2928. 2928 CudaNdarray_synchronize(PyObject* _unused, PyObject* dummy)
  2929. 2929 {
  2930. 2930 CNDA_BEGIN_ALLOW_THREADS
  2931. 2931 cudaThreadSynchronize();
  2932. 2932 CNDA_END_ALLOW_THREADS
  2933. 2933 Py_INCREF(Py_None);
  2934. 2934 return Py_None;
  2935. 2935 }
  2936. 2936
  2937. 2937 /*
2938. 2938 * Exists and returns True if we are linked against cublas v2.
  2939. 2939 */
  2940. 2940 PyObject *
  2941. 2941 CudaNdarray_cublasv2(PyObject* _unused, PyObject* dummy)
  2942. 2942 {
  2943. 2943 Py_INCREF(Py_True);
  2944. 2944 return Py_True;
  2945. 2945 }
  2946. 2946
  2947. 2947 PyObject *
  2948. 2948 CudaNdarray_select_a_gpu(PyObject* _unused, PyObject* dummy)
  2949. 2949 {
  2950. 2950 void * rval = NULL;
  2951. 2951 cudaError_t err;
  2952. 2952 int num_gpus = 0;
  2953. 2953
  2954. 2954 err = cudaGetDeviceCount(&num_gpus);
  2955. 2955 if (cudaSuccess != err){
  2956. 2956 printf("ERR!\\n");
  2957. 2957 PyErr_Format(PyExc_RuntimeError,
  2958. 2958 "Not able to get number of GPUs (%s).",
  2959. 2959 cudaGetErrorString(err));
  2960. 2960 return NULL;
  2961. 2961 }
  2962. 2962
  2963. 2963 for (int device = 0; device < num_gpus; device++) {
  2964. 2964 cudaSetDevice(device);
  2965. 2965 err = cudaDeviceSynchronize(); // << CUDA context gets created here.
  2966. 2966 cudaGetLastError(); // reset the error state
  2967. 2967 if (cudaSuccess == err)
  2968. 2968 break;
  2969. 2969 }
  2970. 2970
  2971. 2971 if (cudaSuccess != err){
2972. 2972 printf("ERR!\n");
  2973. 2973 PyErr_Format(PyExc_RuntimeError,
  2974. 2974 "Not able to select available GPU from %d cards (%s).",
  2975. 2975 num_gpus, cudaGetErrorString(err));
  2976. 2976 return NULL;
  2977. 2977 }
  2978. 2978
  2979. 2979 Py_INCREF(Py_None);
  2980. 2980 return Py_None;
  2981. 2981 }
  2982. 2982
  2983. 2983 #if COMPUTE_GPU_MEM_USED
  2984. 2984 /*
2985. 2985 * Return the number of bytes Theano currently has allocated on the gpu, and the maximum ever allocated, as a (current, max) tuple.
  2986. 2986 */
  2987. 2987 PyObject *
  2988. 2988 GetTheanoAllocInfo(PyObject* _unused, PyObject* dummy)
  2989. 2989 {
  2990. 2990 PyObject* a = PyLong_FromLong(_allocated_size);
  2991. 2991 PyObject* b = PyLong_FromLong(_max_allocated_size);
  2992. 2992
  2993. 2993 PyObject* tuple = PyTuple_New(2);
  2994. 2994 PyTuple_SetItem(tuple, 0, a);
  2995. 2995 PyTuple_SetItem(tuple, 1, b);
  2996. 2996 return tuple;
  2997. 2997 }
  2998. 2998 #endif
  2999. 2999
  3000. 3000 static PyGetSetDef CudaNdarray_getset[] = {
  3001. 3001 {"shape",
  3002. 3002 (getter)CudaNdarray_get_shape,
  3003. 3003 (setter)CudaNdarray_set_shape,
  3004. 3004 "shape of this ndarray (tuple)",
  3005. 3005 NULL},
  3006. 3006 {"_strides",
  3007. 3007 (getter)CudaNdarray_get_strides,
  3008. 3008 (setter)CudaNdarray_set_strides,
  3009. 3009 "data pointer strides (in elements)",
  3010. 3010 NULL},
  3011. 3011 {"strides",
  3012. 3012 (getter)CudaNdarray_get_strides,
  3013. 3013 (setter)CudaNdarray_set_strides,
  3014. 3014 "data pointer strides (in elements)",
  3015. 3015 NULL},
3016. 3016 //gpudata is needed to allow calling pycuda functions with CudaNdarray inputs.
  3017. 3017 {"gpudata",
  3018. 3018 (getter)CudaNdarray_get_dev_data,
  3019. 3019 NULL,
  3020. 3020 "device data pointer",
  3021. 3021 NULL},
  3022. 3022 {"_dev_data",
  3023. 3023 (getter)CudaNdarray_get_dev_data,
  3024. 3024 (setter)CudaNdarray_set_dev_data,
  3025. 3025 "device data pointer",
  3026. 3026 NULL},
  3027. 3027 {"dtype",
  3028. 3028 (getter)CudaNdarray_get_dtype,
  3029. 3029 NULL,
3030. 3030 "The dtype of the elements. Currently always float32",
  3031. 3031 NULL},
  3032. 3032 {"size",
  3033. 3033 (getter)CudaNdarray_SIZE_Object,
  3034. 3034 NULL,
  3035. 3035 "The number of elements in this object.",
  3036. 3036 NULL},
3037. 3037 //mem_size is needed for pycuda.elementwise.ElementwiseKernel. Why do they use size and mem_size with the same value?
  3038. 3038 {"mem_size",
  3039. 3039 (getter)CudaNdarray_SIZE_Object,
  3040. 3040 NULL,
  3041. 3041 "The number of elements in this object.",
  3042. 3042 NULL},
  3043. 3043 {"ndim",
  3044. 3044 (getter)CudaNdarray_get_ndim,
  3045. 3045 NULL,
  3046. 3046 "The number of dimensions in this object.",
  3047. 3047 NULL},
  3048. 3048 {"base",
  3049. 3049 (getter)CudaNdarray_get_base,
  3050. 3050 NULL,
  3051. 3051 "If this ndarray is a view, base is the original ndarray.",
  3052. 3052 NULL},
  3053. 3053
  3054. 3054 {NULL, NULL, NULL, NULL} /* Sentinel */
  3055. 3055 };
  3056. 3056
  3057. 3057 PyObject *CudaNdarray_repr(PyObject *self)
  3058. 3058 {
  3059. 3059 CudaNdarray *object = (CudaNdarray *)self;
  3060. 3060 PyObject * np_object = CudaNdarray_CreateArrayObj(object);
  3061. 3061 PyObject * str = PyObject_Str((PyObject *) np_object);
  3062. 3062 char * cstr = PyString_AsString(str);
  3063. 3063 PyObject * out = PyString_FromFormat("%s%s%s",
  3064. 3064 "CudaNdarray(",
  3065. 3065 cstr,
  3066. 3066 ")");
  3067. 3067 Py_DECREF(str);
  3068. 3068 Py_DECREF(np_object);
  3069. 3069 #if PY_MAJOR_VERSION >= 3
3070. 3070 // In Python 3, PyString_FromFormat returns a bytes object
  3071. 3071 PyObject* out2 = PyObject_Str(out);
  3072. 3072 Py_DECREF(out);
  3073. 3073 return out2;
  3074. 3074 #endif
  3075. 3075 return out;
  3076. 3076 }
  3077. 3077
  3078. 3078 static PyTypeObject CudaNdarrayType =
  3079. 3079 {
  3080. 3080 #if PY_MAJOR_VERSION >= 3
  3081. 3081 PyVarObject_HEAD_INIT(NULL, 0)
  3082. 3082 #else
  3083. 3083 PyObject_HEAD_INIT(NULL)
  3084. 3084 0, /*ob_size*/
  3085. 3085 #endif
  3086. 3086 "CudaNdarray", /*tp_name*/
  3087. 3087 sizeof(CudaNdarray), /*tp_basicsize*/
  3088. 3088 0, /*tp_itemsize*/
  3089. 3089 (destructor)CudaNdarray_dealloc, /*tp_dealloc*/
  3090. 3090 0, /*tp_print*/
  3091. 3091 0, /*tp_getattr*/
  3092. 3092 0, /*tp_setattr*/
  3093. 3093 0, /*tp_compare*/
  3094. 3094 CudaNdarray_repr, /*tp_repr*/
  3095. 3095 &CudaNdarrayNumberMethods, /*tp_as_number*/
  3096. 3096 0, /*tp_as_sequence*/
  3097. 3097 &CudaNdarrayMappingMethods,/*tp_as_mapping*/
  3098. 3098 0, /*tp_hash */
  3099. 3099 0, /*tp_call*/
  3100. 3100 0, /*tp_str*/
  3101. 3101 0, /*tp_getattro*/
  3102. 3102 0, /*tp_setattro*/
  3103. 3103 0, /*tp_as_buffer*/
  3104. 3104 #if PY_MAJOR_VERSION >= 3
  3105. 3105 // Py_TPFLAGS_CHECKTYPES is always true and was removed in Python 3.
  3106. 3106 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
  3107. 3107 #else
  3108. 3108 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_CHECKTYPES, /*tp_flags*/
  3109. 3109 #endif
  3110. 3110 "CudaNdarray objects", /* tp_doc */
  3111. 3111 0, /* tp_traverse */
  3112. 3112 0, /* tp_clear */
  3113. 3113 0, /* tp_richcompare */
  3114. 3114 0, /* tp_weaklistoffset */
  3115. 3115 0, /* tp_iter */
  3116. 3116 0, /* tp_iternext */
  3117. 3117 CudaNdarray_methods, /* tp_methods */
  3118. 3118 CudaNdarray_members, /* tp_members */
  3119. 3119 CudaNdarray_getset, /* tp_getset */
  3120. 3120 0, /* tp_base */
  3121. 3121 0, /* tp_dict */
  3122. 3122 0, /* tp_descr_get */
  3123. 3123 0, /* tp_descr_set */
  3124. 3124 0, /* tp_dictoffset */
  3125. 3125 (initproc)CudaNdarray_init,/* tp_init */
  3126. 3126 0, /* tp_alloc */
  3127. 3127 CudaNdarray_new, /* tp_new */
  3128. 3128 };
  3129. 3129
  3130. 3130 static __global__ void get_gpu_ptr_size(int* dst)
  3131. 3131 {
  3132. 3132 dst[0] = sizeof(float*);
  3133. 3133 dst[1] = sizeof(int);
  3134. 3134 }
  3135. 3135
  3136. 3136 PyObject *
  3137. 3137 CudaNdarray_ptr_int_size(PyObject* _unused, PyObject* args)
  3138. 3138 {
  3139. 3139 int *gpu_data = (int*)device_malloc(sizeof(int)*2);
  3140. 3140 if(gpu_data == NULL){
  3141. 3141 return NULL;
  3142. 3142 }
  3143. 3143 get_gpu_ptr_size<<<1,1>>>(gpu_data);
  3144. 3144
  3145. 3145 cudaError_t cudaErr = cudaGetLastError();
  3146. 3146 if (cudaSuccess != cudaErr){
  3147. 3147
  3148. 3148 device_free(gpu_data);
  3149. 3149 return PyErr_Format(PyExc_RuntimeError,
  3150. 3150 "CudaNdarray_ptr_int_size: error when calling the gpu code. (%s)",
  3151. 3151 cudaGetErrorString(cudaErr));
  3152. 3152 }
  3153. 3153
  3154. 3154 // Transfer the result to cpu
  3155. 3155 int gpu_sizes[] = {-1,-1};
  3156. 3156 cublasStatus_t err;
  3157. 3157 err = cublasGetVector(2, sizeof(int), gpu_data, 1, gpu_sizes, 1);
  3158. 3158 device_free(gpu_data);
  3159. 3159
  3160. 3160 if (CUBLAS_STATUS_SUCCESS != err){
3161. 3161 PyErr_SetString(PyExc_RuntimeError, "error copying data from device to host memory");
  3162. 3162 return NULL;
  3163. 3163 }
  3164. 3164 return Py_BuildValue("iiii", (int) gpu_sizes[0], (int)sizeof(float*),
  3165. 3165 (int)sizeof(int), (int) gpu_sizes[1]);
  3166. 3166 }
  3167. 3167
  3168. 3168 static int cublas_init();
  3169. 3169 static void cublas_shutdown();
  3170. 3170 // Initialize the gpu.
3171. 3171 // Takes two optional parameters: the device number and the cnmem flag value.
  3172. 3172 // If the device number is provided, it sets that device to be the active device.
  3173. 3173 // If not provided (usually just to test whether the gpu is available at all),
  3174. 3174 // it does not set an active device.
  3175. 3175 // Raises EnvironmentError or ValueError (as appropriate) if the initialization failed.
3176. 3176 // cnmem is treated like a bool: if it converts to 0, don't use cnmem; otherwise, use it (its value also selects the pool size, see below).
  3177. 3177 PyObject *
  3178. 3178 CudaNdarray_gpu_init(PyObject* _unused, PyObject* args)
  3179. 3179 {
  3180. 3180 int card_nb = 0;
  3181. 3181 int card_number_provided = 1;
  3182. 3182 float cnmem = 0; // Theano flag lib.cnmem
  3183. 3183 // if we're given something wildly invalid, this will throw a TypeError
  3184. 3184 if(!PyArg_ParseTuple(args, "|if", &card_nb, &cnmem))
  3185. 3185 return NULL;
  3186. 3186 if(cnmem)
  3187. 3187 g_use_cnmem = true;
  3188. 3188
  3189. 3189 if(PyTuple_Size(args) == 0) {
  3190. 3190 card_number_provided = 0;
  3191. 3191 card_nb = 0;
  3192. 3192 }
  3193. 3193
  3194. 3194 int deviceCount;
  3195. 3195 cudaError err = cudaGetDeviceCount(&deviceCount);
  3196. 3196 if(cudaSuccess != err) {
  3197. 3197 return PyErr_Format(PyExc_EnvironmentError,
  3198. 3198 "Unable to get the number of gpus available: %s",
  3199. 3199 cudaGetErrorString(cudaGetLastError()));
  3200. 3200 }
  3201. 3201
  3202. 3202 // as soon as the first successful call to a cuda* function is made, a
  3203. 3203 // gpu context has been created
  3204. 3204 g_gpu_context_active = 1;
  3205. 3205
  3206. 3206 if(deviceCount <= 0) {
  3207. 3207 return PyErr_Format(PyExc_EnvironmentError,
  3208. 3208 "Can't use the GPU, no devices support CUDA");
  3209. 3209 }
  3210. 3210 if(card_number_provided && (card_nb < 0 || card_nb > (deviceCount - 1))) {
  3211. 3211 return PyErr_Format(PyExc_ValueError,
  3212. 3212 "Bad device number %d. Only %d devices available.",
  3213. 3213 card_nb,
  3214. 3214 deviceCount);
  3215. 3215 }
  3216. 3216
  3217. 3217 cudaDeviceProp deviceProp;
  3218. 3218 err = cudaGetDeviceProperties(&deviceProp, card_nb);
  3219. 3219 if(cudaSuccess != err) {
  3220. 3220 return PyErr_Format(PyExc_EnvironmentError,
  3221. 3221 "Unable to get properties of gpu %i: %s",
  3222. 3222 card_nb,
  3223. 3223 cudaGetErrorString(cudaGetLastError()));
  3224. 3224 }
  3225. 3225
  3226. 3226 if(deviceProp.major == 9999 && deviceProp.minor == 9999 ){
  3227. 3227 return PyErr_Format(PyExc_EnvironmentError,
  3228. 3228 "There is no device that supports CUDA");
  3229. 3229 }
  3230. 3230
  3231. 3231 if(card_number_provided) {
  3232. 3232 err = cudaSetDevice(card_nb);
  3233. 3233 if(cudaSuccess != err) {
  3234. 3234 return PyErr_Format(PyExc_EnvironmentError,
  3235. 3235 "Unable to set device %i: %s",
  3236. 3236 card_nb,
  3237. 3237 cudaGetErrorString(cudaGetLastError()));
  3238. 3238 }
  3239. 3239 if (cublas_init() == -1)
  3240. 3240 return NULL;
  3241. 3241 }
  3242. 3242 if(card_number_provided && g_use_cnmem) {
  3243. 3243 size_t mem = 0;
  3244. 3244 if (cnmem > 1)
  3245. 3245 mem = cnmem * 1024 * 1024;
  3246. 3246 else{
3247. 3247 // Clip to 95% to leave memory for the driver.
3248. 3248 // 98% didn't work in some cases.
  3249. 3249 if (cnmem > .95){
  3250. 3250 cnmem = .95;
  3251. 3251 }
  3252. 3252 size_t free = 0, total = 0;
  3253. 3253 cudaError_t err = cudaMemGetInfo(&free, &total);
  3254. 3254 if (err != cudaSuccess){
  3255. 3255 // Clear the error flag, cudaMemGetInfo doesn't do it.
  3256. 3256 // Currently this returns the same thing as err, but if in future
  3257. 3257 // it returns something else I still don't see why we should ignore
  3258. 3258 // it. All we want to do here is reset the flag.
  3259. 3259 cudaGetLastError();
  3260. 3260 PyErr_Format(PyExc_RuntimeError,
  3261. 3261 "Error while getting memory info about the gpu: %s",
  3262. 3262 cudaGetErrorString(err));
  3263. 3263 return NULL;
  3264. 3264 }
  3265. 3265 mem = total * cnmem;
  3266. 3266 }
  3267. 3267 if(initCnmem(card_number_provided, card_nb, mem) == -1){
  3268. 3268 return NULL;
  3269. 3269 }
  3270. 3270 }
  3271. 3271
  3272. 3272 Py_INCREF(Py_None);
  3273. 3273 return Py_None;
  3274. 3274 }
  3275. 3275
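/*
 * Illustrative sketch, not part of the original module: the rule used in
 * CudaNdarray_gpu_init above to turn the lib.cnmem flag into a pool size in
 * bytes. A value > 1 is read as megabytes; a value in (0, 1] is read as a
 * fraction of total device memory, clipped to 0.95 to leave room for the
 * driver. "total_bytes" is assumed to come from cudaMemGetInfo(). Guarded
 * by #if 0 so it does not affect the build.
 */
#if 0
static size_t example_cnmem_pool_bytes(float cnmem_flag, size_t total_bytes)
{
    if (cnmem_flag > 1)
        return (size_t)(cnmem_flag * 1024 * 1024);  // e.g. 2048 -> 2 GiB pool
    if (cnmem_flag > .95f)
        cnmem_flag = .95f;                          // e.g. 1.0 is clipped to 95%
    return (size_t)(total_bytes * cnmem_flag);      // e.g. 0.25 of a 4 GiB card -> 1 GiB
}
#endif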
  3276. 3276 PyObject *
  3277. 3277 CudaNdarray_active_device_number(PyObject* _unused, PyObject* _unused_args) {
  3278. 3278 // NB: No cuda error checking here; keeps things simple, and it's not
  3279. 3279 // really necessary.
  3280. 3280 int currentDevice;
  3281. 3281 cudaGetDevice(&currentDevice);
  3282. 3282 return PyInt_FromLong(currentDevice);
  3283. 3283 }
  3284. 3284
  3285. 3285 PyObject *
  3286. 3286 CudaNdarray_active_device_name(PyObject* _unused, PyObject* _unused_args) {
  3287. 3287 // NB: No cuda error checking here; keeps things simple, and it's not
  3288. 3288 // really necessary.
  3289. 3289 int currentDevice;
  3290. 3290 cudaGetDevice(&currentDevice);
  3291. 3291
  3292. 3292 cudaDeviceProp deviceProp;
  3293. 3293 cudaGetDeviceProperties(&deviceProp, currentDevice);
  3294. 3294 return PyString_FromString(deviceProp.name);
  3295. 3295 }
  3296. 3296
  3297. 3297 PyObject *
  3298. 3298 CudaNdarray_gpu_shutdown(PyObject* _unused, PyObject* _unused_args) {
  3299. 3299 // Don't handle errors here
  3300. 3300 cublas_shutdown();
  3301. 3301 g_gpu_context_active = 0; // context has now been closed down
  3302. 3302 if(g_use_cnmem) {
  3303. 3303 cnmemStatus_t status = cnmemFinalize();
  3304. 3304 if(status != CNMEM_STATUS_SUCCESS) {
  3305. 3305 fprintf(stderr, "CudaNdarray_gpu_shutdown: cnmemFinalize failed! Reason=%s\n",
  3306. 3306 cnmemGetErrorString(status));
  3307. 3307 if(status == CNMEM_STATUS_CUDA_ERROR) {
  3308. 3308 fprintf(stderr, " Cuda-Reason=%s\n",
  3309. 3309 cudaGetErrorString(cudaGetLastError()));
  3310. 3310 }
  3311. 3311 }
  3312. 3312 }
  3313. 3313 cudaThreadExit();
  3314. 3314
  3315. 3315 Py_INCREF(Py_None);
  3316. 3316 return Py_None;
  3317. 3317 }
  3318. 3318
  3319. 3319 /*
  3320. 3320 * This function is tested in theano/misc/test_pycuda_theano_simple.py
  3321. 3321 */
  3322. 3322 PyObject *
  3323. 3323 CudaNdarray_from_gpu_pointer(PyObject* _unused, PyObject* args)
  3324. 3324 {
  3325. 3325 int verbose = 0;
  3326. 3326 PyObject *gpu_ptr = NULL;
  3327. 3327 PyObject *shapes = NULL;
  3328. 3328 PyObject *strides = NULL;
  3329. 3329 PyObject *base = NULL;
  3330. 3330 PyObject *rval = NULL;
  3331. 3331
3332. 3332 //args should consist of 4 python objects
3333. 3333 //The first is the gpu ptr
3334. 3334 //The second is the shape, the third is the strides
3335. 3335 //The fourth is the base object that owns the memory
  3336. 3336 if (! PyArg_ParseTuple(args, "OOOO", &gpu_ptr, &shapes, &strides, &base))
  3337. 3337 return NULL;
  3338. 3338
  3339. 3339 if (verbose) printf("In CudaNdarray_from_gpu_pointer\n");
  3340. 3340 if (!PyLong_Check(gpu_ptr))
  3341. 3341 {
3342. 3342 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: The gpu pointer is not a long");
  3343. 3343 return NULL;
  3344. 3344 }
  3345. 3345
  3346. 3346 Py_ssize_t nd = PyObject_Length(shapes);
  3347. 3347 if (nd < 0)
  3348. 3348 {
  3349. 3349 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of second argument");
  3350. 3350 return NULL;
  3351. 3351 }
  3352. 3352 Py_ssize_t nd_stride = PyObject_Length(strides);
  3353. 3353 if (nd_stride < 0)
  3354. 3354 {
  3355. 3355 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Couldn't get length of third argument");
  3356. 3356 return NULL;
  3357. 3357 }
  3358. 3358
  3359. 3359 if (nd != nd_stride)
  3360. 3360 {
  3361. 3361 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: We need the same number of shapes and strides");
  3362. 3362 return NULL;
  3363. 3363 }
  3364. 3364
  3365. 3365 rval = CudaNdarray_New();
  3366. 3366
  3367. 3367 if (CudaNdarray_set_nd((CudaNdarray *)rval, nd))
  3368. 3368 {
  3369. 3369 //CudaNdarray_set_nd set the error msg
  3370. 3370 return NULL;
  3371. 3371 }
3372. 3372 // set gpu pointer
  3373. 3373 assert(((CudaNdarray *)rval)->data_allocated == 0);
  3374. 3374 if (CudaNdarray_set_device_data((CudaNdarray *)rval, (float *)PyInt_AsLong(gpu_ptr), base))
  3375. 3375 {
3376. 3376 PyErr_SetString(PyExc_TypeError, "CudaNdarray_from_gpu_pointer: Error while setting the gpu pointer");
  3377. 3377 return NULL;
  3378. 3378
  3379. 3379 }
  3380. 3380
  3381. 3381 // Set dims and strides
  3382. 3382 for (int i = nd-1; i >= 0; --i)
  3383. 3383 {
  3384. 3384 PyObject * idx = PyLong_FromLong(i);
  3385. 3385 if (idx == NULL)
  3386. 3386 {
  3387. 3387 PyErr_SetString(PyExc_Exception, "CudaNdarray_from_gpu_pointer: Couldn't make long object to loop over list/tuple");
  3388. 3388 return NULL;
  3389. 3389 }
  3390. 3390 PyObject* dim_ = PyObject_GetItem(shapes, idx);
  3391. 3391 PyObject* strd_ = PyObject_GetItem(strides, idx);
  3392. 3392 if (!PyInt_Check(dim_))
  3393. 3393 {
  3394. 3394 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: shapes[%d] is not an int", i);
  3395. 3395 return NULL;
  3396. 3396 }
  3397. 3397 if (!PyInt_Check(strd_))
  3398. 3398 {
  3399. 3399 PyErr_Format(PyExc_Exception, "CudaNdarray_from_gpu_pointer: strides[%d] is not an int", i);
  3400. 3400 return NULL;
  3401. 3401 }
  3402. 3402 int dim = PyInt_AsLong(dim_);
  3403. 3403 int strd = PyInt_AsLong(strd_);
  3404. 3404 CudaNdarray_set_stride((CudaNdarray *)rval, i, strd);
  3405. 3405 CudaNdarray_set_dim((CudaNdarray *)rval, i, dim);
  3406. 3406 Py_DECREF(idx);
  3407. 3407 Py_DECREF(dim_);
  3408. 3408 Py_DECREF(strd_);
  3409. 3409 }
  3410. 3410 if (verbose) printf("CudaNdarray_from_gpu_pointer normal return\n");
  3411. 3411 return rval;
  3412. 3412 }
  3413. 3413
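/*
 * Illustrative sketch, not part of the original module: how a caller could
 * pack a raw device pointer, shape and strides into the argument tuple
 * expected by CudaNdarray_from_gpu_pointer above. Strides are counted in
 * float elements, following the convention of this module. "dev_ptr" and
 * "owner" are assumed to be supplied by the caller (owner is the object
 * that keeps the allocation alive). Guarded by #if 0 so it does not affect
 * the build.
 */
#if 0
static PyObject * example_wrap_device_pointer(float * dev_ptr, PyObject * owner)
{
    // Describe a 2x3 row-major block: shape (2, 3), element strides (3, 1).
    PyObject * args = Py_BuildValue("(N(ii)(ii)O)",
                                    PyLong_FromVoidPtr((void *)dev_ptr),
                                    2, 3,     // shape
                                    3, 1,     // strides, in elements
                                    owner);
    if (args == NULL)
        return NULL;
    PyObject * rval = CudaNdarray_from_gpu_pointer(NULL, args);
    Py_DECREF(args);
    return rval;
}
#endif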
  3414. 3414 PyObject *
  3415. 3415 CudaNdarray_Dot(PyObject* _unused, PyObject* args)
  3416. 3416 {
  3417. 3417 PyObject *l=NULL;
  3418. 3418 PyObject *r=NULL;
  3419. 3419 PyObject * rval = NULL;
  3420. 3420
  3421. 3421 //args should consist of two python objects ("OO")
  3422. 3422 if (! PyArg_ParseTuple(args, "OO", &l, &r))
  3423. 3423 return NULL;
  3424. 3424
  3425. 3425 if (!CudaNdarray_Check(l) || !CudaNdarray_Check(r))
  3426. 3426 {
  3427. 3427 PyErr_SetString(PyExc_TypeError, "CudaNdarray arguments required ");
  3428. 3428 goto CudaNdarray_dot_fail;
  3429. 3429 }
  3430. 3430 if (((CudaNdarray*)l)->nd != 2)
  3431. 3431 {
  3432. 3432 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
  3433. 3433 goto CudaNdarray_dot_fail;
  3434. 3434 }
  3435. 3435 if (((CudaNdarray*)r)->nd != 2)
  3436. 3436 {
  3437. 3437 PyErr_SetString(PyExc_TypeError, "need 2d CudaNdarray arg for now");
  3438. 3438 goto CudaNdarray_dot_fail;
  3439. 3439 }
  3440. 3440 rval = CudaNdarray_New();
  3441. 3441 if (!rval)
  3442. 3442 {
  3443. 3443 goto CudaNdarray_dot_fail;
  3444. 3444 }
  3445. 3445 int dims[2];
  3446. 3446 dims[0] = CudaNdarray_HOST_DIMS((CudaNdarray*)l)[0];
  3447. 3447 dims[1] = CudaNdarray_HOST_DIMS((CudaNdarray*)r)[1];
  3448. 3448 if (CudaNdarray_alloc_contiguous((CudaNdarray*)rval, 2, dims))
  3449. 3449 {
  3450. 3450 goto CudaNdarray_dot_fail;
  3451. 3451 }
  3452. 3452 if (CudaNdarray_gemm(1.0, (CudaNdarray*)l, (CudaNdarray*)r, 0.0, (CudaNdarray*)rval))
  3453. 3453 {
  3454. 3454 goto CudaNdarray_dot_fail;
  3455. 3455 }
  3456. 3456
  3457. 3457 return rval;
  3458. 3458
  3459. 3459 CudaNdarray_dot_fail:
  3460. 3460 Py_XDECREF(rval);
  3461. 3461 return NULL;
  3462. 3462 }
  3463. 3463
  3464. 3464 static PyObject *
  3465. 3465 filter(PyObject* __unsed_self, PyObject *args) // args = (data, broadcastable, strict, storage)
  3466. 3466 {
  3467. 3467 /*
  3468. 3468 * TODO: DOC what this function should do in the various cases of
  3469. 3469 * What is 'strict' supposed to mean in the context of this function?
  3470. 3470 * What do we do with input that could be interpreted as matching the broadcastable pattern in strict vs. non-strict cases?
  3471. 3471 *
  3472. 3472 */
  3473. 3473 PyObject *py_data=NULL;
  3474. 3474 PyArrayObject * data = NULL;
  3475. 3475 int strict = 0;
  3476. 3476 PyObject * broadcastable=NULL;
  3477. 3477 PyObject * storage=NULL;
  3478. 3478 CudaNdarray * rval=NULL;
  3479. 3479
  3480. 3480 //Python object references which are provided to the caller are borrowed references
  3481. 3481 if (!PyArg_ParseTuple(args, "OOiO", &py_data, &broadcastable, &strict, &storage)) return NULL;
  3482. 3482
  3483. 3483 if (!PyTuple_Check(broadcastable)){
  3484. 3484 PyErr_SetString(PyExc_TypeError, "broadcastable arg should be a tuple of int.");
  3485. 3485 return NULL;
  3486. 3486 }
  3487. 3487 Py_INCREF(py_data);
  3488. 3488 Py_INCREF(broadcastable);
  3489. 3489
  3490. 3490 CudaNdarray * cnda = (CudaNdarray*)py_data;
  3491. 3491
  3492. 3492 if (strict || CudaNdarray_Check(py_data))
  3493. 3493 {
  3494. 3494 //TODO: support non-strict "casting" from a vt to the broadcastable/type/size that we need.
  3495. 3495 if (!CudaNdarray_Check(py_data))
  3496. 3496 {
  3497. 3497 Py_DECREF(py_data);
  3498. 3498 Py_DECREF(broadcastable);
  3499. 3499 PyErr_SetString(PyExc_TypeError, "strict mode requires CudaNdarray");
  3500. 3500 return NULL;
  3501. 3501 }
  3502. 3502 if (cnda->nd != PyTuple_Size(broadcastable))
  3503. 3503 {
  3504. 3504 Py_DECREF(py_data);
  3505. 3505 Py_DECREF(broadcastable);
  3506. 3506 PyErr_Format(PyExc_TypeError, "Wrong rank: %i vs %li", cnda->nd, (long)PyTuple_Size(broadcastable));
  3507. 3507 return NULL;
  3508. 3508 }
  3509. 3509 for (int i = 0; i < cnda->nd; ++i)
  3510. 3510 {
  3511. 3511 if ((CudaNdarray_HOST_DIMS(cnda)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
  3512. 3512 {
  3513. 3513 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable vt dimension %i", i);
  3514. 3514 Py_DECREF(py_data);
  3515. 3515 Py_DECREF(broadcastable);
  3516. 3516 return NULL;
  3517. 3517 }else if (CudaNdarray_HOST_DIMS(cnda)[i] == 1 && CudaNdarray_HOST_STRIDES(cnda)[i] != 0){
3518. 3518 PyErr_Format(PyExc_TypeError, "Non-zero stride (%d) on dimension %d of size 1",
  3519. 3519 CudaNdarray_HOST_STRIDES(cnda)[i], i);
  3520. 3520 Py_DECREF(py_data);
  3521. 3521 Py_DECREF(broadcastable);
  3522. 3522 return NULL;
  3523. 3523 }
  3524. 3524 }
  3525. 3525 Py_DECREF(broadcastable);
  3526. 3526 return py_data;
  3527. 3527 }
  3528. 3528 else
  3529. 3529 {
  3530. 3530 data = (PyArrayObject*)PyArray_FromObject(py_data, REAL_TYPENUM, PyTuple_Size(broadcastable), PyTuple_Size(broadcastable));
  3531. 3531 if (!data)
  3532. 3532 {
  3533. 3533 //err message already defined
  3534. 3534 Py_DECREF(py_data);
  3535. 3535 Py_DECREF(broadcastable);
  3536. 3536 return NULL;
  3537. 3537 }
  3538. 3538 for (int i = 0; i < PyArray_NDIM(data); ++i)
  3539. 3539 {
  3540. 3540 if ((PyArray_DIMS(data)[i] > 1) && PyInt_AsLong(PyTuple_GetItem(broadcastable, Py_ssize_t(i))))
  3541. 3541 {
  3542. 3542 PyErr_Format(PyExc_TypeError, "Non-unit size in broadcastable dimension %i", i);
  3543. 3543 Py_DECREF(data);
  3544. 3544 Py_DECREF(py_data);
  3545. 3545 Py_DECREF(broadcastable);
  3546. 3546 return NULL;
  3547. 3547 }
  3548. 3548 }
  3549. 3549 if (storage && CudaNdarray_Check(storage))
  3550. 3550 {
  3551. 3551 rval = (CudaNdarray*) storage;
  3552. 3552 Py_INCREF(rval);
  3553. 3553 }
  3554. 3554 else
  3555. 3555 {
  3556. 3556 rval = (CudaNdarray*) CudaNdarray_New();
  3557. 3557 }
  3558. 3558 if (rval)
  3559. 3559 {
  3560. 3560 if (CudaNdarray_CopyFromArray(rval, data))
  3561. 3561 {
  3562. 3562 Py_DECREF(rval);
  3563. 3563 rval = NULL;
  3564. 3564 }
  3565. 3565 }
  3566. 3566 Py_DECREF(data);
  3567. 3567 Py_DECREF(py_data);
  3568. 3568 Py_DECREF(broadcastable);
  3569. 3569 return (PyObject*)rval;
  3570. 3570 }
  3571. 3571 }
  3572. 3572
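/*
 * Illustrative sketch, not part of the original module: the core shape
 * check performed by filter() above, written as a plain helper. A
 * dimension flagged as broadcastable must have size 1: shape (5, 1) with
 * broadcastable (0, 1) is accepted, shape (5, 3) with broadcastable (0, 1)
 * is rejected. Guarded by #if 0 so it does not affect the build.
 */
#if 0
static int example_broadcastable_ok(const int * dims, const int * broadcastable, int nd)
{
    for (int i = 0; i < nd; ++i)
    {
        if (dims[i] > 1 && broadcastable[i])
            return 0;   // non-unit size in a broadcastable dimension
    }
    return 1;
}
#endif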
  3573. 3573 //TODO-- CudaNdarray_Dot and CudaNdarray_active_device_name are following different capitalization conventions.
  3574. 3574 // Pick one and standardize it, this file is already annoying enough to grep through
  3575. 3575 static PyMethodDef module_methods[] = {
  3576. 3576 {"dimshuffle", CudaNdarray_Dimshuffle, METH_VARARGS, "Returns the dimshuffle of a CudaNdarray."},
  3577. 3577 {"dot", CudaNdarray_Dot, METH_VARARGS, "Returns the matrix product of two CudaNdarray arguments."},
  3578. 3578 {"gpu_init", CudaNdarray_gpu_init, METH_VARARGS, "Select the gpu card to use; also usable to test whether CUDA is available."},
3579. 3579 {"select_a_gpu", CudaNdarray_select_a_gpu, METH_NOARGS, "Call this method if you want to select a GPU before calling gpu_init and let the driver choose the GPU."},
  3580. 3580 {"active_device_name", CudaNdarray_active_device_name, METH_VARARGS, "Get the name of the active device."},
  3581. 3581 {"active_device_number", CudaNdarray_active_device_number, METH_VARARGS, "Get the number of the active device."},
  3582. 3582 {"gpu_shutdown", CudaNdarray_gpu_shutdown, METH_VARARGS, "Shut down the gpu."},
  3583. 3583 {"device_properties", GetDeviceProperties, METH_VARARGS, "Return a dictionary with the device properties."},
  3584. 3584 {"mem_info", GetDeviceMemInfo, METH_NOARGS, "Return a tuple with the free and total memory on the gpu in bytes."},
  3585. 3585 #if COMPUTE_GPU_MEM_USED
3586. 3586 {"theano_allocated", GetTheanoAllocInfo, METH_NOARGS, "Return the size in bytes of the memory Theano currently has allocated on the gpu."},
  3587. 3587 #endif
3588. 3588 {"ptr_int_size", CudaNdarray_ptr_int_size, METH_VARARGS, "Return a tuple with the sizes of a gpu pointer, a cpu pointer, a cpu int and a gpu int, in bytes."},
  3589. 3589 {"filter", filter, METH_VARARGS, "filter(obj, broadcastable, strict, storage) returns a CudaNdarray initialized to obj if it matches the constraints of broadcastable. strict=True prevents any numeric casting. If storage is a CudaNdarray it may be overwritten and used as the return value."},
3590. 3590 {"outstanding_mallocs", outstanding_mallocs, METH_VARARGS, "how many more mallocs have been called than frees"},
3591. 3591 {"from_gpu_pointer", CudaNdarray_from_gpu_pointer, METH_VARARGS, "Used to create a CudaNdarray from memory already allocated on the gpu (e.g. by pycuda)."},
  3592. 3592 {"synchronize", CudaNdarray_synchronize, METH_NOARGS, "Used to synchronize the device"},
  3593. 3593 {"cublas_v2", CudaNdarray_cublasv2, METH_NOARGS,
  3594. 3594 "Used to know if this version of cuda_ndarray is linked with cublas v2."},
  3595. 3595 {NULL, NULL, NULL, NULL} /* Sentinel */
  3596. 3596 };
  3597. 3597
  3598. 3598 #define CNDA_MOD_NAME "cuda_ndarray"
  3599. 3599 #define CNDA_DOCSTRING "CUDA implementation of a numpy ndarray-like object."
  3600. 3600
  3601. 3601 #if PY_MAJOR_VERSION == 3
  3602. 3602 static struct PyModuleDef cuda_ndarray_moduledef =
  3603. 3603 {
  3604. 3604 PyModuleDef_HEAD_INIT,
  3605. 3605 CNDA_MOD_NAME,
  3606. 3606 CNDA_DOCSTRING,
  3607. 3607 -1, /* size of per-interpreter state of the module,
  3608. 3608 or -1 if the module keeps state in global variables. */
  3609. 3609 module_methods
  3610. 3610 };
  3611. 3611
  3612. 3612 PyMODINIT_FUNC
  3613. 3613 PyInit_cuda_ndarray(void)
  3614. 3614 #else
  3615. 3615 PyMODINIT_FUNC
  3616. 3616 initcuda_ndarray(void)
  3617. 3617 #endif
  3618. 3618 {
  3619. 3619 import_array();
  3620. 3620
  3621. 3621 PyObject* m;
  3622. 3622
  3623. 3623 if (PyType_Ready(&CudaNdarrayType) < 0) {
  3624. 3624 #if PY_MAJOR_VERSION == 3
  3625. 3625 return NULL;
  3626. 3626 #else
  3627. 3627 return;
  3628. 3628 #endif
  3629. 3629 }
  3630. 3630
  3631. 3631 #if PY_MAJOR_VERSION == 3
  3632. 3632 m = PyModule_Create(&cuda_ndarray_moduledef);
  3633. 3633 #else
  3634. 3634 m = Py_InitModule3(CNDA_MOD_NAME, module_methods, CNDA_DOCSTRING);
  3635. 3635 #endif
  3636. 3636
  3637. 3637 if (m == NULL) {
  3638. 3638 #if PY_MAJOR_VERSION == 3
  3639. 3639 return NULL;
  3640. 3640 #else
  3641. 3641 return;
  3642. 3642 #endif
  3643. 3643 }
  3644. 3644
  3645. 3645 Py_INCREF(&CudaNdarrayType);
  3646. 3646 PyModule_AddObject(m, "CudaNdarray", (PyObject *)&CudaNdarrayType);
  3647. 3647 #if COMPUTE_GPU_MEM_USED
  3648. 3648 for(int i=0;i<TABLE_SIZE;i++){
  3649. 3649 _alloc_size_table[i].ptr=NULL;
  3650. 3650 _alloc_size_table[i].size=0;
  3651. 3651 }
  3652. 3652 #endif
  3653. 3653 // cublasInit();
  3654. 3654 //if (0&&CUBLAS_STATUS_SUCCESS != cublasGetError())
  3655. 3655 //{
  3656. 3656 //std::cerr << "WARNING: initcuda_ndarray: error initializing device\n";
  3657. 3657 //}
  3658. 3658 if (0) //TODO: is this necessary?
  3659. 3659 {
  3660. 3660 int deviceId = 0; // TODO: what number goes here?
  3661. 3661 cudaSetDevice(deviceId);
  3662. 3662 cudaError_t err = cudaGetLastError();
  3663. 3663 if( cudaSuccess != err)
  3664. 3664 {
  3665. 3665 std::cerr << "Error in SetDevice:" << cudaGetErrorString(err) << "\n";
  3666. 3666 }
  3667. 3667 }
  3668. 3668
  3669. 3669 #if PY_MAJOR_VERSION == 3
  3670. 3670 return m;
  3671. 3671 #endif
  3672. 3672 }
  3673. 3673
  3674. 3674
  3675. 3675 //////////////////////////////////////
  3676. 3676 //
  3677. 3677 // C API FOR CudaNdarray
  3678. 3678 //
  3679. 3679 //////////////////////////////////////
  3680. 3680
  3681. 3681 int
  3682. 3682 CudaNdarray_Check(const PyObject * ob)
  3683. 3683 {
  3684. 3684 //TODO: doesn't work with inheritance
  3685. 3685 return CudaNdarray_CheckExact(ob);
  3686. 3686 }
  3687. 3687 int
  3688. 3688 CudaNdarray_CheckExact(const PyObject * ob)
  3689. 3689 {
  3690. 3690 return ((Py_TYPE(ob) == &CudaNdarrayType) ? 1 : 0);
  3691. 3691 }
  3692. 3692
  3693. 3693 PyObject *
  3694. 3694 CudaNdarray_New(int nd)
  3695. 3695 {
  3696. 3696 CudaNdarray *self = (CudaNdarray *)CudaNdarrayType.tp_alloc(&CudaNdarrayType, 0);
  3697. 3697 if (self == NULL)
  3698. 3698 {
  3699. 3699 PyErr_SetString(PyExc_RuntimeError, "CudaNdarray_New failed to allocate self");
  3700. 3700 return NULL;
  3701. 3701 }
  3702. 3702 CudaNdarray_null_init(self);
  3703. 3703
  3704. 3704 if (nd == 0)
  3705. 3705 {
  3706. 3706 self->nd = 0;
  3707. 3707 }
  3708. 3708 else if (nd > 0)
  3709. 3709 {
  3710. 3710 if (CudaNdarray_set_nd(self, nd))
  3711. 3711 {
  3712. 3712 Py_DECREF(self);
  3713. 3713 return NULL;
  3714. 3714 }
  3715. 3715 }
  3716. 3716 ++_outstanding_mallocs[1];
  3717. 3717 return (PyObject *)self;
  3718. 3718 }
  3719. 3719
  3720. 3720
  3721. 3721
  3722. 3722 //////////////////////////////
  3723. 3723 //
  3724. 3724 // Published helper functions
  3725. 3725 //
  3726. 3726 //////////////////////////////
  3727. 3727
  3728. 3728 static int
  3729. 3729 cublas_init()
  3730. 3730 {
  3731. 3731 cublasStatus_t err;
  3732. 3732 err = cublasCreate(&handle);
  3733. 3733 if (CUBLAS_STATUS_SUCCESS != err)
  3734. 3734 {
  3735. 3735 if(CUBLAS_STATUS_NOT_INITIALIZED == err)
  3736. 3736 PyErr_SetString(PyExc_RuntimeError,
  3737. 3737 "cublasCreate() returned this error "
  3738. 3738 "'the CUDA Runtime initialization failed'");
  3739. 3739 else if(CUBLAS_STATUS_ALLOC_FAILED == err)
  3740. 3740 PyErr_SetString(PyExc_RuntimeError,
  3741. 3741 "cublasCreate() returned this error "
  3742. 3742 "'the resources could not be allocated'");
  3743. 3743 else
  3744. 3744 PyErr_SetString(PyExc_RuntimeError,
3745. 3745 "unknown error returned by cublasCreate()");
  3746. 3746 return -1;
  3747. 3747 }
  3748. 3748 // Set the default stream as the one to execute on (default)
  3749. 3749 cublasSetStream(handle, NULL);
  3750. 3750 // Pointer to scalars are on the host (also default)
  3751. 3751 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
  3752. 3752 #if CUDA_VERSION >= 5000
  3753. 3753 // atomics can be used in kernels to speed up operations (not default)
  3754. 3754 // This may lead to a slight variance from run to run in some operations
  3755. 3755 cublasSetAtomicsMode(handle, CUBLAS_ATOMICS_ALLOWED);
  3756. 3756 #endif
  3757. 3757 return 0;
  3758. 3758 }
  3759. 3759
  3760. 3760 static void
  3761. 3761 cublas_shutdown()
  3762. 3762 {
  3763. 3763 if (handle != NULL)
  3764. 3764 cublasDestroy(handle);
  3765. 3765 // No point in handling any errors here
  3766. 3766 handle = NULL;
  3767. 3767 }
  3768. 3768
  3769. 3769 int
  3770. 3770 CudaNdarray_CopyFromArray(CudaNdarray * self, PyArrayObject*obj)
  3771. 3771 {
  3772. 3772 int err = CudaNdarray_alloc_contiguous(self, PyArray_NDIM(obj),
  3773. 3773 PyArray_DIMS(obj));
  3774. 3774 if (err) {
  3775. 3775 return err;
  3776. 3776 }
  3777. 3777
  3778. 3778 int typenum = PyArray_TYPE(obj);
  3779. 3779 if (typenum != REAL_TYPENUM)
  3780. 3780 {
  3781. 3781 PyErr_SetString(PyExc_TypeError, "can only copy from float arrays");
  3782. 3782 return -1;
  3783. 3783 }
  3784. 3784 assert( 4 == PyArray_ITEMSIZE(obj));
  3785. 3785 PyArrayObject * py_src = (PyArrayObject *)PyArray_ContiguousFromAny(
  3786. 3786 (PyObject*)obj, typenum, self->nd, self->nd);
  3787. 3787 if (!py_src) {
  3788. 3788 return -1;
  3789. 3789 }
  3790. 3790 npy_intp py_src_size = PyArray_SIZE(py_src);
  3791. 3791 void *py_src_data = PyArray_DATA(py_src);
  3792. 3792 cudaError_t cerr;
  3793. 3793 CNDA_BEGIN_ALLOW_THREADS;
  3794. 3794 cerr = cudaMemcpy(self->devdata, py_src_data,
  3795. 3795 py_src_size * sizeof(real),
  3796. 3796 cudaMemcpyHostToDevice);
  3797. 3797 //CNDA_THREAD_SYNC; // unneeded because cudaMemcpy is blocking anyway
  3798. 3798 CNDA_END_ALLOW_THREADS;
  3799. 3799 if (cudaSuccess != cerr)
  3800. 3800 {
  3801. 3801 PyErr_Format(PyExc_RuntimeError,
  3802. 3802 "Cuda error '%s' while copying %lli data element"
  3803. 3803 " to device memory",
  3804. 3804 cudaGetErrorString(cerr),
  3805. 3805 (long long)py_src_size);
  3806. 3806 Py_DECREF(py_src);
  3807. 3807 return -1;
  3808. 3808 }
  3809. 3809 Py_DECREF(py_src);
  3810. 3810 return 0;
  3811. 3811 }
  3812. 3812
  3813. 3813 PyObject *
  3814. 3814 CudaNdarray_new_nd(int nd)
  3815. 3815 {
  3816. 3816 CudaNdarray * rval = (CudaNdarray*) CudaNdarray_New();
  3817. 3817 if (!rval || CudaNdarray_set_nd(rval, nd))
  3818. 3818 {
  3819. 3819 Py_XDECREF(rval);
  3820. 3820 rval = NULL;
  3821. 3821 }
  3822. 3822 return (PyObject *) rval;
  3823. 3823 }
  3824. 3824
  3825. 3825
  3826. 3826 /**
  3827. 3827 * Initialize 'self' as a view of 'base', with memory storage 'data'
  3828. 3828 */
  3829. 3829
  3830. 3830 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, PyObject * base)
  3831. 3831 {
  3832. 3832 if (self->data_allocated)
  3833. 3833 {
  3834. 3834 assert(self->devdata);
  3835. 3835 if (device_free(self->devdata))
  3836. 3836 {
  3837. 3837 self->devdata = NULL;
  3838. 3838 self->data_allocated = 0;
  3839. 3839 return -1;
  3840. 3840 }
  3841. 3841 }
  3842. 3842 // Get the original base object (base.base.base...)
  3843. 3843 PyObject * orig_base = base;
  3844. 3844 // base is not always a CudaNdarray. It can be a GpuArray from pycuda, ...
  3845. 3845 while (orig_base && CudaNdarray_Check(orig_base) && ((CudaNdarray*) orig_base)->base)
  3846. 3846 {
3847. 3847 // orig_base is itself a view
  3848. 3848 orig_base = ((CudaNdarray*) orig_base)->base;
  3849. 3849 }
  3850. 3850 //N.B. XDECREF and XINCREF are no-ops for NULL pointers
  3851. 3851 if (self->base != orig_base)
  3852. 3852 {
  3853. 3853 Py_XDECREF(self->base);
  3854. 3854 self->base = orig_base;
  3855. 3855 Py_XINCREF(self->base);
  3856. 3856 }
  3857. 3857 self->data_allocated = 0;
  3858. 3858 self->devdata = data;
  3859. 3859 return 0;
  3860. 3860 }
  3861. 3861
  3862. 3862 static __global__ void k_copy_1d(const int N, const float * x, const int sx, float * y, const int sy)
  3863. 3863 {
  3864. 3864 for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < N; i += gridDim.x*blockDim.x)
  3865. 3865 {
  3866. 3866 y[i*sy] = x[i*sx];
  3867. 3867 }
  3868. 3868 }
  3869. 3869
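/*
 * Illustrative sketch, not part of the original module: launching k_copy_1d
 * by hand. The grid-stride loop above lets any (n_blocks, n_threads)
 * configuration cover all N elements; strides are counted in float
 * elements, so sx = 2, sy = 1 packs every second element of x contiguously
 * into y. "x_dev" and "y_dev" are assumed to be valid device pointers.
 * Guarded by #if 0 so it does not affect the build.
 */
#if 0
static void example_strided_copy(const float * x_dev, float * y_dev, int N)
{
    k_copy_1d<<<32, 256>>>(N, x_dev, 2, y_dev, 1);
    cudaDeviceSynchronize();   // block so any launch error surfaces here
}
#endif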
  3870. 3870 // N1 through N4 are the size of y
  3871. 3871 static __global__ void k_copy_4d(const int N1,
  3872. 3872 const int N2, const int N3, const int N4,
  3873. 3873 const float * x, const int sx1, const int sx2, const int sx3,
  3874. 3874 const int sx4, float * y, const int sy1, const int sy2,
  3875. 3875 const int sy3, const int sy4)
  3876. 3876 {
  3877. 3877 // These must be made int instead of unsigned int due to a bug in nvcc
  3878. 3878 int bx = blockIdx.x;
  3879. 3879 int by = blockIdx.y;
  3880. 3880
  3881. 3881 for (int i = bx; i < N1; i += gridDim.x)
  3882. 3882 {
  3883. 3883 for (int j = by; j < N2; j += gridDim.y)
  3884. 3884 {
  3885. 3885 for (int k = threadIdx.x; k < N3; k += (int) blockDim.x)
  3886. 3886 {
  3887. 3887 for (int l = threadIdx.y; l < N4; l += (int) blockDim.y)
  3888. 3888 {
  3889. 3889 y[i * sy1 + j * sy2 + k * sy3 + l * sy4] =
  3890. 3890 x[i * sx1 + j * sx2 + k * sx3 + l * sx4];
  3891. 3891 }
  3892. 3892 }
  3893. 3893 }
  3894. 3894 }
  3895. 3895 }
  3896. 3896
  3897. 3897 //copy from other into self
  3898. 3898 int CudaNdarray_CopyFromCudaNdarray(CudaNdarray * self,
  3899. 3899 const CudaNdarray * other,
  3900. 3900 bool unbroadcast)
  3901. 3901 {
  3902. 3902 int verbose = 0;
  3903. 3903 if (verbose>1) fprintf(stderr, "CudaNdarray_CopyFromCudaNdarray\n");
  3904. 3904
  3905. 3905 //standard elemwise size checks
  3906. 3906 if (self->nd == -1)
  3907. 3907 {
  3908. 3908 PyErr_SetString(PyExc_TypeError,
  3909. 3909 "can't copy into un-initialized CudaNdarray");
  3910. 3910 return -1;
  3911. 3911 }
  3912. 3912 CudaNdarray * new_other = NULL;
  3913. 3913
  3914. 3914 if (self->nd < other->nd)
  3915. 3915 {
  3916. 3916 PyErr_Format(PyExc_NotImplementedError,
  3917. 3917 "CudaNdarray_CopyFromCudaNdarray: The number of dimensions of the "
  3918. 3918 "destination needs to be >= the number of dimensions of the "
  3919. 3919 "source. Got %d and %d.", self->nd, other->nd);
  3920. 3920 return -1;
  3921. 3921 }
  3922. 3922 else if (self->nd != other->nd)
  3923. 3923 {
  3924. 3924 new_other = (CudaNdarray *) CudaNdarray_View(other);
  3925. 3925 int added_dims = self->nd - other->nd;
  3926. 3926 int* pattern = (int*) alloca(self->nd * sizeof(int));
  3927. 3927 for(int i = 0; i < added_dims; i++)
  3928. 3928 pattern[i] = -1;
  3929. 3929 for(int i = 0; i < other->nd; i++)
  3930. 3930 pattern[i + added_dims] = i;
  3931. 3931 CudaNdarray_dimshuffle(new_other, self->nd, pattern);
  3932. 3932 other = new_other;
  3933. 3933 }
  3934. 3934 assert(self->nd == other->nd);
  3935. 3935 //standard elemwise dim checks (also compute total size)
  3936. 3936 unsigned int size = 1;
  3937. 3937 unsigned int size_source = 1;
  3938. 3938 for (int i = 0; i< self->nd; ++i)
  3939. 3939 {
  3940. 3940 if ((CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(other)[i])
  3941. 3941 && (1!=CudaNdarray_HOST_DIMS(other)[i] || !unbroadcast) )
  3942. 3942 {
  3943. 3943 PyErr_Format(PyExc_ValueError,
  3944. 3944 "CudaNdarray_CopyFromCudaNdarray:"
  3945. 3945 " need same dimensions for dim %d,"
  3946. 3946 " destination=%d, source=%d",
  3947. 3947 i, CudaNdarray_HOST_DIMS(self)[i],
  3948. 3948 CudaNdarray_HOST_DIMS(other)[i]);
  3949. 3949 Py_XDECREF(new_other);
  3950. 3950 return -1;
  3951. 3951 }
  3952. 3952 size *= (unsigned int) CudaNdarray_HOST_DIMS(self)[i];
  3953. 3953 size_source *= (unsigned int) CudaNdarray_HOST_DIMS(other)[i];
  3954. 3954 }
  3955. 3955 if (0 == size)
  3956. 3956 {
  3957. 3957 Py_XDECREF(new_other);
  3958. 3958 return 0; //nothing to copy, we're done.
  3959. 3959 }
  3960. 3960 if (CudaNdarray_is_c_contiguous(self) &&
  3961. 3961 CudaNdarray_is_c_contiguous(other) &&
  3962. 3962 size == size_source)
  3963. 3963 {
  3964. 3964 if (verbose)
  3965. 3965 fprintf(stderr, "Copying contiguous vector with cublasScopy\n");
  3966. 3966
  3967. 3967 cublasStatus_t err;
  3968. 3968 err = cublasScopy(handle, size, CudaNdarray_DEV_DATA(other), 1,
  3969. 3969 CudaNdarray_DEV_DATA(self), 1);
  3970. 3970 CNDA_THREAD_SYNC;
  3971. 3971 Py_XDECREF(new_other);
  3972. 3972 if (CUBLAS_STATUS_SUCCESS != err)
  3973. 3973 {
  3974. 3974 PyErr_SetString(PyExc_RuntimeError, "Error copying memory");
  3975. 3975 return -1;
  3976. 3976 }
  3977. 3977 return 0;
  3978. 3978 }
  3979. 3979 //TODO: rewrite these copy operations to be more efficient
3980. 3980 // See, for example, the transpose example in the CUDA SDK.
  3981. 3981 switch (self->nd)
  3982. 3982 {
  3983. 3983 case 0: // scalar
  3984. 3984 {
  3985. 3985 // THIS CASE SHOULD NEVER HAPPEN BECAUSE SCALARS ARE ALWAYS C CONTIGUOUS
  3986. 3986 assert(0);
  3987. 3987 }; break;
  3988. 3988 case 1: // vector
  3989. 3989 {
  3990. 3990 if (verbose) fprintf(stderr, "Copying non-contiguous vector\n");
  3991. 3991 if (verbose) fprint_CudaNdarray(stderr, other);
  3992. 3992 unsigned int n_blocks = std::min(size,
  3993. 3993 (unsigned int)NUM_VECTOR_OP_BLOCKS);
  3994. 3994 unsigned int n_threads = std::min(ceil_intdiv(size, n_blocks),
  3995. 3995 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
  3996. 3996 k_copy_1d<<<n_blocks, n_threads>>>(size,
  3997. 3997 CudaNdarray_DEV_DATA(other),
  3998. 3998 CudaNdarray_HOST_STRIDES(other)[0],
  3999. 3999 CudaNdarray_DEV_DATA(self),
  4000. 4000 CudaNdarray_HOST_STRIDES(self)[0]);
  4001. 4001 CNDA_THREAD_SYNC;
  4002. 4002 cudaError_t err = cudaGetLastError();
  4003. 4003 if( cudaSuccess != err)
  4004. 4004 {
  4005. 4005 PyErr_Format(PyExc_RuntimeError,
  4006. 4006 "Cuda error: %s: %s. (n_blocks=%i,"
  4007. 4007 " n_threads_per_block=%i)\n", "k_copy_1d",
  4008. 4008 cudaGetErrorString(err), n_blocks, n_threads);
  4009. 4009 Py_XDECREF(new_other);
  4010. 4010 return -1;
  4011. 4011 }
  4012. 4012 }; break;
  4013. 4013 case 4: // 4-tensor
  4014. 4014 {
  4015. 4015 if (verbose)
  4016. 4016 {
  4017. 4017 if (0 != fprint_CudaNdarray(stderr, other))
  4018. 4018 {
  4019. 4019 Py_XDECREF(new_other);
  4020. 4020 return -1;
  4021. 4021 }
  4022. 4022 }
  4023. 4023
  4024. 4024 // The blocks implement the looping over the first two axes so
  4025. 4025 // this needs to be (N1, N2)
  4026. 4026 dim3 n_blocks( std::min(CudaNdarray_HOST_DIMS(self)[0],
  4027. 4027 NUM_VECTOR_OP_BLOCKS),
  4028. 4028 std::min(CudaNdarray_HOST_DIMS(self)[1],
  4029. 4029 NUM_VECTOR_OP_BLOCKS));
  4030. 4030 // For the threads, just make as many as possible
  4031. 4031 dim3 n_threads( std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[2],
  4032. 4032 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK),
  4033. 4033 std::min( (unsigned int) CudaNdarray_HOST_DIMS(self)[3],
  4034. 4034 (unsigned int) NUM_VECTOR_OP_THREADS_PER_BLOCK));
  4035. 4035
  4036. 4036 n_threads.x = std::min( (unsigned int) 32, (unsigned int) n_threads.x);
  4037. 4037 n_threads.y = std::min( n_threads.y, NUM_VECTOR_OP_THREADS_PER_BLOCK / n_threads.x);
  4038. 4038
  4039. 4039 k_copy_4d<<<n_blocks, n_threads>>>(
  4040. 4040 // size of y
  4041. 4041 (unsigned int) CudaNdarray_HOST_DIMS(self)[0], // N1
  4042. 4042 (unsigned int) CudaNdarray_HOST_DIMS(self)[1], // N2
  4043. 4043 (unsigned int) CudaNdarray_HOST_DIMS(self)[2], // N3
  4044. 4044 (unsigned int) CudaNdarray_HOST_DIMS(self)[3], // N4
  4045. 4045 CudaNdarray_DEV_DATA(other), // x
  4046. 4046 // x strides
  4047. 4047 CudaNdarray_HOST_STRIDES(other)[0],
  4048. 4048 CudaNdarray_HOST_STRIDES(other)[1],
  4049. 4049 CudaNdarray_HOST_STRIDES(other)[2],
  4050. 4050 CudaNdarray_HOST_STRIDES(other)[3],
  4051. 4051 CudaNdarray_DEV_DATA(self), // y
  4052. 4052 // y strides
  4053. 4053 CudaNdarray_HOST_STRIDES(self)[0],
  4054. 4054 CudaNdarray_HOST_STRIDES(self)[1],
  4055. 4055 CudaNdarray_HOST_STRIDES(self)[2],
  4056. 4056 CudaNdarray_HOST_STRIDES(self)[3]
  4057. 4057 );
  4058. 4058 CNDA_THREAD_SYNC;
  4059. 4059 cudaError_t err = cudaGetLastError();
  4060. 4060 if( cudaSuccess != err)
  4061. 4061 {
  4062. 4062 PyErr_Format(PyExc_RuntimeError,
  4063. 4063 "Cuda error: %s: %s.",
  4064. 4064 "k_copy_4d",
  4065. 4065 cudaGetErrorString(err));
  4066. 4066 Py_XDECREF(new_other);
  4067. 4067 return -1;
  4068. 4068 }
  4069. 4069 }; break;
  4070. 4070 default:
  4071. 4071 {
  4072. 4072 cudaError_t err = cudaGetLastError();
  4073. 4073 if(cudaSuccess != err){
  4074. 4074 PyErr_Format(PyExc_RuntimeError,
  4075. 4075 "Unexpected Cuda error: %s: %s\n",
  4076. 4076 "CudaNdarray_CopyFromCudaNdarray",
  4077. 4077 cudaGetErrorString(err));
  4078. 4078 Py_XDECREF(new_other);
  4079. 4079 return -1;
  4080. 4080 }
  4081. 4081
  4082. 4082 if (verbose)
  4083. 4083 fprintf(stderr,
  4084. 4084 "Copying with default version unbroadcast=%d\n",
  4085. 4085 unbroadcast);
  4086. 4086 // call worker routine
  4087. 4087 unsigned int threads_per_block = std::min(size,
  4088. 4088 (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
  4089. 4089 unsigned int n_blocks = std::min(ceil_intdiv(size, threads_per_block),
  4090. 4090 (unsigned int)NUM_VECTOR_OP_BLOCKS);
  4091. 4091 const CudaNdarray * cuda_dims = other;
  4092. 4092 if(unbroadcast)
  4093. 4093 cuda_dims = self;
  4094. 4094 //copy from other into self
  4095. 4095 k_elemwise_unary_rowmajor_copy<<<n_blocks, threads_per_block>>>(
  4096. 4096 size,
  4097. 4097 (unsigned int)other->nd,
  4098. 4098 (const int *)CudaNdarray_DEV_DIMS(cuda_dims),
  4099. 4099 (const float*)CudaNdarray_DEV_DATA(other),
  4100. 4100 (const int *)CudaNdarray_DEV_STRIDES(other),
  4101. 4101 CudaNdarray_DEV_DATA(self),
  4102. 4102 (const int *)CudaNdarray_DEV_STRIDES(self));
  4103. 4103 CNDA_THREAD_SYNC;
  4104. 4104 err = cudaGetLastError();
  4105. 4105 if(verbose>1)
  4106. 4106 fprintf(stderr,
  4107. 4107 "INFO k_elemwise_unary_rowmaj (n_blocks=%i,"
  4108. 4108 " n_threads_per_block=%i)\n",
  4109. 4109 n_blocks, threads_per_block);
  4110. 4110 if( cudaSuccess != err)
  4111. 4111 {
  4112. 4112 //fprint_CudaNdarray(stderr, self);
  4113. 4113 //fprint_CudaNdarray(stderr, other);
  4114. 4114 PyErr_Format(PyExc_RuntimeError,
  4115. 4115 "Cuda error: %s: %s. (n_blocks=%i,"
  4116. 4116 " n_threads_per_block=%i)\n",
  4117. 4117 "k_elemwise_unary_rowmajor_copy",
  4118. 4118 cudaGetErrorString(err), n_blocks,
  4119. 4119 threads_per_block);
  4120. 4120 Py_XDECREF(new_other);
  4121. 4121 return -1;
  4122. 4122 }
  4123. 4123 }
  4124. 4124 };
  4125. 4125 Py_XDECREF(new_other);
  4126. 4126 return 0;
  4127. 4127 }
  4128. 4128
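/*
 * Illustrative sketch, not part of the original module: broadcasting a row
 * into a matrix with the routine above. With unbroadcast = true, a source
 * dimension of size 1 is allowed to expand to the destination size, so a
 * (1, n) row can fill every row of an (m, n) destination. "mat" and "row"
 * are assumed to be already-allocated CudaNdarrays of those shapes.
 * Guarded by #if 0 so it does not affect the build.
 */
#if 0
static int example_broadcast_row(CudaNdarray * mat, const CudaNdarray * row)
{
    // Copies row into every row of mat; returns 0 on success, -1 on error.
    return CudaNdarray_CopyFromCudaNdarray(mat, row, true);
}
#endif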
  4129. 4129 int CudaNdarray_gemm(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
  4130. 4130 {
  4131. 4131 if (A->nd != 2)
  4132. 4132 {
  4133. 4133 PyErr_SetString(PyExc_ValueError, "non-matrix arg A to gemm");
  4134. 4134 return -1;
  4135. 4135 }
  4136. 4136 if (B->nd != 2)
  4137. 4137 {
  4138. 4138 PyErr_SetString(PyExc_ValueError, "non-matrix arg B to gemm");
  4139. 4139 return -1;
  4140. 4140 }
  4141. 4141 if (C->nd != 2)
  4142. 4142 {
  4143. 4143 PyErr_SetString(PyExc_ValueError, "non-matrix arg C to gemm");
  4144. 4144 return -1;
  4145. 4145 }
  4146. 4146
4147. 4147 // We must allow dimensions to be zero.
  4148. 4148 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
  4149. 4149 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0])
  4150. 4150 || (CudaNdarray_HOST_DIMS(B)[1] != CudaNdarray_HOST_DIMS(C)[1]))
  4151. 4151 {
  4152. 4152 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemm (%i,%i)x(%i,%i)->(%i,%i)",
  4153. 4153 CudaNdarray_HOST_DIMS(A)[0],
  4154. 4154 CudaNdarray_HOST_DIMS(A)[1],
  4155. 4155 CudaNdarray_HOST_DIMS(B)[0],
  4156. 4156 CudaNdarray_HOST_DIMS(B)[1],
  4157. 4157 CudaNdarray_HOST_DIMS(C)[0],
  4158. 4158 CudaNdarray_HOST_DIMS(C)[1]);
  4159. 4159 return -1;
  4160. 4160 }
  4161. 4161
4162. 4162 // If matrix A or B has non-unit size and non-unit stride in both
4163. 4163 // dimensions, or negative strides, we cannot use it directly, so we make a copy.
  4164. 4164 CudaNdarray * A_new = NULL;
  4165. 4165 CudaNdarray * B_new = NULL;
  4166. 4166 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
  4167. 4167 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
  4168. 4168 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
  4169. 4169 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
  4170. 4170 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
  4171. 4171 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
  4172. 4172 {
  4173. 4173 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
  4174. 4174 if (!A_new)
  4175. 4175 return -1;
  4176. 4176 A = A_new;
  4177. 4177 }
  4178. 4178
  4179. 4179 if (((CudaNdarray_HOST_DIMS(B)[0] > 1)
  4180. 4180 && (CudaNdarray_HOST_STRIDES(B)[0] != 1)
  4181. 4181 && (CudaNdarray_HOST_DIMS(B)[1] > 1)
  4182. 4182 && (CudaNdarray_HOST_STRIDES(B)[1] != 1))
  4183. 4183 || (CudaNdarray_HOST_STRIDES(B)[0] < 0)
  4184. 4184 || (CudaNdarray_HOST_STRIDES(B)[1] < 0))
  4185. 4185 {
  4186. 4186 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
  4187. 4187 if (!B_new)
  4188. 4188 {
4189. 4189 // If A_new is NULL (meaning A was not copied), nothing happens
  4190. 4190 Py_XDECREF(A_new);
  4191. 4191 return -1;
  4192. 4192 }
  4193. 4193 B = B_new;
  4194. 4194 }
  4195. 4195
  4196. 4196 // If matrix C has non-unit size and non-unit stride in both
  4197. 4197 // dimensions, or negative strides, we can't operate. We cannot copy
  4198. 4198 // C either, because the calling code will expect the result to be
  4199. 4199 // in the original C container.
  4200. 4200 if (((CudaNdarray_HOST_DIMS(C)[0] > 1)
  4201. 4201 && (CudaNdarray_HOST_STRIDES(C)[0] != 1)
  4202. 4202 && (CudaNdarray_HOST_DIMS(C)[1] > 1)
  4203. 4203 && (CudaNdarray_HOST_STRIDES(C)[1] != 1))
  4204. 4204 || (CudaNdarray_HOST_STRIDES(C)[0] < 0)
  4205. 4205 || (CudaNdarray_HOST_STRIDES(C)[1] < 0))
  4206. 4206 {
  4207. 4207 PyErr_Format(PyExc_AssertionError,
  4208. 4208 "non-unit or negative stride in gemm arg C (%i,%i) of shape (%i,%i)",
  4209. 4209 CudaNdarray_HOST_STRIDES(C)[0],
  4210. 4210 CudaNdarray_HOST_STRIDES(C)[1],
  4211. 4211 CudaNdarray_HOST_DIMS(C)[0],
  4212. 4212 CudaNdarray_HOST_DIMS(C)[1]);
  4213. 4213 Py_XDECREF(A_new);
  4214. 4214 Py_XDECREF(B_new);
  4215. 4215 return -1;
  4216. 4216 }
  4217. 4217
  4218. 4218 // the unit integer is divided logically into three fields of 4 bits
  4219. 4219 // the lowermost 4 bits encode the stride pattern of the output
  4220. 4220 // the next higher 4 bits encode the B variable (or y)
4221. 4221 // the highest 4 bits encode the A variable (or x)
  4222. 4222 //
  4223. 4223 // the stride pattern for each input is encoded as 0 for unit stride from col to col (Row major)
  4224. 4224 // 1 for unit stride from row to row (Col major)
  4225. 4225
  4226. 4226 // a stride of 0 implies a dimension of 1 - so we can actually define
  4227. 4227 // a stride of 0 as a 'unit' stride because gemm will never use it.
  4228. 4228 // If a dimension is 0, its stride will not be used either, so we can
  4229. 4229 // consider it a 'unit' stride too.
  4230. 4230 int unit = 0;
  4231. 4231 if (CudaNdarray_HOST_STRIDES(A)[1] == 1 || CudaNdarray_HOST_DIMS(A)[1] <= 1) {
  4232. 4232 unit |= (0x0 << 8);
  4233. 4233 } else if (CudaNdarray_HOST_STRIDES(A)[0] == 1 || CudaNdarray_HOST_DIMS(A)[0] <= 1) {
  4234. 4234 unit |= (0x1 << 8);
  4235. 4235 } else {
  4236. 4236 unit |= (0x2 << 8);
  4237. 4237 }
  4238. 4238 if (CudaNdarray_HOST_STRIDES(B)[1] == 1 || CudaNdarray_HOST_DIMS(B)[1] <= 1) {
  4239. 4239 unit |= (0x0 << 4);
  4240. 4240 } else if (CudaNdarray_HOST_STRIDES(B)[0] == 1 || CudaNdarray_HOST_DIMS(B)[0] <= 1) {
  4241. 4241 unit |= (0x1 << 4);
  4242. 4242 } else {
  4243. 4243 unit |= (0x2 << 4);
  4244. 4244 }
  4245. 4245 if (CudaNdarray_HOST_STRIDES(C)[1] == 1 || CudaNdarray_HOST_DIMS(C)[1] <= 1) {
  4246. 4246 unit |= (0x0 << 0);
  4247. 4247 } else if (CudaNdarray_HOST_STRIDES(C)[0] == 1 || CudaNdarray_HOST_DIMS(C)[0] <= 1) {
  4248. 4248 unit |= (0x1 << 0);
  4249. 4249 } else {
  4250. 4250 unit |= (0x2 << 0);
  4251. 4251 }
  4252. 4252
  4253. 4253 /* create appropriate strides for malformed matrices that are row or column
  4254. 4254 * vectors
  4255. 4255 */
  4256. 4256 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
  4257. 4257 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
  4258. 4258 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : CudaNdarray_HOST_DIMS(B)[1];
  4259. 4259 int sb_1 = (CudaNdarray_HOST_DIMS(B)[1] > 1) ? CudaNdarray_HOST_STRIDES(B)[1] : CudaNdarray_HOST_DIMS(B)[0];
  4260. 4260 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : CudaNdarray_HOST_DIMS(C)[1];
  4261. 4261 int sc_1 = (CudaNdarray_HOST_DIMS(C)[1] > 1) ? CudaNdarray_HOST_STRIDES(C)[1] : CudaNdarray_HOST_DIMS(C)[0];
  4262. 4262
  4263. 4263 float* a = CudaNdarray_DEV_DATA(A);
  4264. 4264 float* b = CudaNdarray_DEV_DATA(B);
  4265. 4265 float* c = CudaNdarray_DEV_DATA(C);
  4266. 4266 cublasOperation_t N = CUBLAS_OP_N;
  4267. 4267 cublasOperation_t T = CUBLAS_OP_T;
  4268. 4268 //std::cerr << (unit/256) MOD 16 << (unit / 16) MOD 16 << unit MOD 16<< '\\n';
  4269. 4269 // There should be no negative stride at that point
  4270. 4270 #define CHK_STRIDE_SGEMM(T0, T1, D0, D1, D2, a, x, sx, y, sy, b, z, sz) \
  4271. 4271 if (sx == 0){sx = 1;}\
  4272. 4272 if (sy == 0){sy = 1;}\
  4273. 4273 if (sz == 0){sz = 1;}\
  4274. 4274 if ((sx > 0) && (sy > 0) && (sz > 0)) { \
  4275. 4275 err = cublasSgemm(handle, T0, T1, D0, D1, D2, &a, x, sx, y, sy, &b, z, sz); \
  4276. 4276 } else { \
  4277. 4277 PyErr_SetString(PyExc_AssertionError, "negative stride to sGemm");\
  4278. 4278 Py_XDECREF(A_new);\
  4279. 4279 Py_XDECREF(B_new);\
  4280. 4280 return -1; \
  4281. 4281 }
  4282. 4282
  4283. 4283 cublasStatus_t err;
  4284. 4284 switch(unit)
  4285. 4285 {
  4286. 4286 case 0x000: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_0, beta, c, sc_0); break;
  4287. 4287 case 0x100: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_0, a, sa_1, beta, c, sc_0); break;
  4288. 4288 case 0x010: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_0, beta, c, sc_0); break;
  4289. 4289 case 0x110: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(A)[1], alpha, b, sb_1, a, sa_1, beta, c, sc_0); break;
  4290. 4290 case 0x001: CHK_STRIDE_SGEMM(T, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_0, beta, c, sc_1); break;
  4291. 4291 case 0x101: CHK_STRIDE_SGEMM(N, T, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_0, beta, c, sc_1); break;
  4292. 4292 case 0x011: CHK_STRIDE_SGEMM(T, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_0, b, sb_1, beta, c, sc_1); break;
  4293. 4293 case 0x111: CHK_STRIDE_SGEMM(N, N, CudaNdarray_HOST_DIMS(C)[0], CudaNdarray_HOST_DIMS(C)[1], CudaNdarray_HOST_DIMS(A)[1], alpha, a, sa_1, b, sb_1, beta, c, sc_1); break;
  4294. 4294 default: PyErr_Format(PyExc_ValueError, "some matrix has no unit stride (unit=%x)", unit);
  4295. 4295 return -1;
  4296. 4296 };
  4297. 4297 CNDA_THREAD_SYNC;
  4298. 4298 Py_XDECREF(A_new);
  4299. 4299 Py_XDECREF(B_new);
  4300. 4300
  4301. 4301 if (CUBLAS_STATUS_SUCCESS != err)
  4302. 4302 {
  4303. 4303 PyErr_Format(PyExc_RuntimeError,
  4304. 4304 "cublasSgemm failed (%i) %s\n"
  4305. 4305 " unit=%x N=%d, c.dims=[%d %d], a.dim=[%d %d], alpha=%f, beta=%f, a=%p, b=%p, c=%p"
  4306. 4306 " sa_0=%d, sa_1=%d, sb_0=%d, sb_1=%d, sc_0=%d, sc_1=%d",
  4307. 4307 err, cublasGetErrorString(err),
  4308. 4308 unit, N,
  4309. 4309 CudaNdarray_HOST_DIMS(C)[0],
  4310. 4310 CudaNdarray_HOST_DIMS(C)[1],
  4311. 4311 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
  4312. 4312 alpha, beta, a, b, c, sa_0, sa_1, sb_0, sb_1, sc_0, sc_1);
  4313. 4313
  4314. 4314 return -1;
  4315. 4315 }
  4316. 4316 return 0;
  4317. 4317 }
  4318. 4318
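/*
 * Illustrative sketch, not part of the original module: the 4-bit field
 * encoding computed inside CudaNdarray_gemm above, written out as a plain
 * helper. cublas assumes column-major storage, so for the all row-major
 * case (unit == 0x000) the switch in CudaNdarray_gemm computes
 * C^T = B^T * A^T by passing b before a; the other cases fold the
 * difference into the transpose flags instead of copying. Guarded by #if 0
 * so it does not affect the build.
 */
#if 0
static int example_unit_field(int stride0, int stride1, int dim0, int dim1)
{
    if (stride1 == 1 || dim1 <= 1) return 0x0;  // row major: unit stride along columns
    if (stride0 == 1 || dim0 <= 1) return 0x1;  // col major: unit stride along rows
    return 0x2;                                 // neither: rejected as "no unit stride"
}
// unit = (field(A) << 8) | (field(B) << 4) | field(C);
// three C-contiguous matrices give 0x000, a transposed view of A gives 0x100.
#endif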
  4319. 4319 int CudaNdarray_sgemv(float alpha, const CudaNdarray * A, const CudaNdarray * B, float beta, CudaNdarray * C)
  4320. 4320 {
  4321. 4321 /**
  4322. 4322 * C <- alpha A B + beta C
  4323. 4323 * A : matrix
  4324. 4324 * B, C: vector
  4325. 4325 * alpha, beta: scalars
  4326. 4326 */
  4327. 4327 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg to gemv"); return -1; }
  4328. 4328 if (B->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
  4329. 4329 if (C->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg to gemv"); return -1; }
  4330. 4330
4331. 4331 // We must allow dimensions to be zero.
  4332. 4332 if ((CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(B)[0])
  4333. 4333 || (CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(C)[0]))
  4334. 4334 {
  4335. 4335 PyErr_Format(PyExc_ValueError, "dimension mismatch in args to gemv (%i,%i)x(%i)->(%i)",
  4336. 4336 CudaNdarray_HOST_DIMS(A)[0],
  4337. 4337 CudaNdarray_HOST_DIMS(A)[1],
  4338. 4338 CudaNdarray_HOST_DIMS(B)[0],
  4339. 4339 CudaNdarray_HOST_DIMS(C)[0]);
  4340. 4340 return -1;
  4341. 4341 }
  4342. 4342
  4343. 4343 // If matrix A has non-unit size and non-unit stride in both
  4344. 4344 // dimensions, or negative strides, we cannot operate, but we can
  4345. 4345 // make a copy.
  4346. 4346 CudaNdarray * A_new = NULL;
  4347. 4347 CudaNdarray * B_new = NULL;
  4348. 4348 if (((CudaNdarray_HOST_DIMS(A)[0] > 1)
  4349. 4349 && (CudaNdarray_HOST_STRIDES(A)[0] != 1)
  4350. 4350 && (CudaNdarray_HOST_DIMS(A)[1] > 1)
  4351. 4351 && (CudaNdarray_HOST_STRIDES(A)[1] != 1))
  4352. 4352 || (CudaNdarray_HOST_STRIDES(A)[0] < 0)
  4353. 4353 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
  4354. 4354 {
  4355. 4355 A_new = (CudaNdarray*) CudaNdarray_Copy(A);
  4356. 4356 if (!A_new)
  4357. 4357 return -1;
  4358. 4358 A = A_new;
  4359. 4359 }
  4360. 4360
  4361. 4361 // If vector B has a negative stride, we also have to make a copy.
  4362. 4362 if (CudaNdarray_HOST_STRIDES(B)[0] < 0)
  4363. 4363 {
  4364. 4364 B_new = (CudaNdarray*) CudaNdarray_Copy(B);
  4365. 4365 if (!B_new)
  4366. 4366 {
  4367. 4367 // If A was not copied, A_new is NULL, and Py_XDECREF does not
  4368. 4368 // do anything
  4369. 4369 Py_XDECREF(A_new);
  4370. 4370 return -1;
  4371. 4371 }
  4372. 4372 B = B_new;
  4373. 4373 }
  4374. 4374
  4375. 4375 // cuBLAS does not handle negative strides as expected
  4376. 4376 if ( (CudaNdarray_HOST_STRIDES(A)[0] < 0)
  4377. 4377 || (CudaNdarray_HOST_STRIDES(A)[1] < 0))
  4378. 4378 {
  4379. 4379 PyErr_Format(PyExc_ValueError, "illegal strides in args to gemv (%i,%i)",
  4380. 4380 CudaNdarray_HOST_STRIDES(A)[0],
  4381. 4381 CudaNdarray_HOST_STRIDES(A)[1]);
  4382. 4382 Py_XDECREF(A_new);
  4383. 4383 Py_XDECREF(B_new);
  4384. 4384 return -1;
  4385. 4385 }
  4386. 4386
  4387. 4387 /* create appropriate strides for malformed matrices that are row or column
  4388. 4388 * vectors
  4389. 4389 */
  4390. 4390 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0] : CudaNdarray_HOST_DIMS(A)[1];
  4391. 4391 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1] : CudaNdarray_HOST_DIMS(A)[0];
  4392. 4392 int sb_0 = (CudaNdarray_HOST_DIMS(B)[0] > 1) ? CudaNdarray_HOST_STRIDES(B)[0] : 1;
  4393. 4393 int sc_0 = (CudaNdarray_HOST_DIMS(C)[0] > 1) ? CudaNdarray_HOST_STRIDES(C)[0] : 1;
  4394. 4394
  4395. 4395 if (sa_0 == 0)
  4396. 4396 sa_0 = 1;
  4397. 4397 if (sa_1 == 0)
  4398. 4398 sa_1 = 1;
  4399. 4399
  4400. 4400 // This is important because we can end up not calling Sgemv at all
  4401. 4401 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
  4402. 4402 if (CudaNdarray_SIZE(C)) {
  4403. 4403 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
  4404. 4404 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
  4405. 4405 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
  4406. 4406 {
  4407. 4407 err = cublasSgemv(handle, CUBLAS_OP_N,
  4408. 4408 CudaNdarray_HOST_DIMS(A)[0], CudaNdarray_HOST_DIMS(A)[1],
  4409. 4409 &alpha,
  4410. 4410 CudaNdarray_DEV_DATA(A), sa_1,
  4411. 4411 CudaNdarray_DEV_DATA(B), sb_0,
  4412. 4412 &beta,
  4413. 4413 CudaNdarray_DEV_DATA(C), sc_0);
  4414. 4414 }
  4415. 4415 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
  4416. 4416 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
  4417. 4417 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
  4418. 4418 {
  4419. 4419 err = cublasSgemv(handle, CUBLAS_OP_T,
  4420. 4420 CudaNdarray_HOST_DIMS(A)[1], CudaNdarray_HOST_DIMS(A)[0],
  4421. 4421 &alpha,
  4422. 4422 CudaNdarray_DEV_DATA(A), sa_0,
  4423. 4423 CudaNdarray_DEV_DATA(B), sb_0,
  4424. 4424 &beta,
  4425. 4425 CudaNdarray_DEV_DATA(C), sc_0);
  4426. 4426 }
  4427. 4427 else
  4428. 4428 {
  4429. 4429 PyErr_Format(PyExc_AssertionError,
  4430. 4430 "Unexpected stride pattern in gemv: (%i, %i) x %i -> %i.\n"
  4431. 4431 "Shapes are: (%i, %i) x %i -> %i\n",
  4432. 4432 CudaNdarray_HOST_STRIDES(A)[0],
  4433. 4433 CudaNdarray_HOST_STRIDES(A)[1],
  4434. 4434 CudaNdarray_HOST_STRIDES(B)[0],
  4435. 4435 CudaNdarray_HOST_STRIDES(C)[0],
  4436. 4436 CudaNdarray_HOST_DIMS(A)[0],
  4437. 4437 CudaNdarray_HOST_DIMS(A)[1],
  4438. 4438 CudaNdarray_HOST_DIMS(B)[0],
  4439. 4439 CudaNdarray_HOST_DIMS(C)[0]);
  4440. 4440 Py_XDECREF(A_new);
  4441. 4441 Py_XDECREF(B_new);
  4442. 4442 return -1;
  4443. 4443 }
  4444. 4444 }
  4445. 4445
  4446. 4446 CNDA_THREAD_SYNC;
  4447. 4447 Py_XDECREF(A_new);
  4448. 4448 Py_XDECREF(B_new);
  4449. 4449
  4450. 4450 if (CUBLAS_STATUS_SUCCESS != err)
  4451. 4451 {
  4452. 4452 PyErr_Format(PyExc_RuntimeError,
  4453. 4453 "cublasSgemv failed (%i)",
  4454. 4454 err);
  4455. 4455 return -1;
  4456. 4456 }
  4457. 4457 return 0;
  4458. 4458 }
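// --- Illustrative usage sketch (added; not part of the original file) ---
// CudaNdarray_sgemv above computes C <- alpha*A*B + beta*C for a matrix A and
// vectors B, C. The hypothetical helper below shows a plain matrix-vector
// product; it assumes the caller owns correctly shaped arrays and that the
// module's global cuBLAS `handle` has already been initialized.
static int example_sgemv_usage(const CudaNdarray * A,
                               const CudaNdarray * x,
                               CudaNdarray * y)
{
    // y <- 1.0f * A x + 0.0f * y; returns 0 on success,
    // or -1 with a Python exception set on failure.
    return CudaNdarray_sgemv(1.0f, A, x, 0.0f, y);
}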
  4459. 4459
  4460. 4460 int CudaNdarray_sger(float alpha, const CudaNdarray * x, const CudaNdarray * y, CudaNdarray * A) {
  4461. 4461 if (x->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg x to sger"); return -1; }
  4462. 4462 if (y->nd != 1) { PyErr_SetString(PyExc_ValueError, "non-vector arg y to sger"); return -1; }
  4463. 4463 if (A->nd != 2) { PyErr_SetString(PyExc_ValueError, "non-matrix arg A to sger"); return -1; }
  4464. 4464
  4465. 4465 if ((CudaNdarray_HOST_DIMS(A)[0] != CudaNdarray_HOST_DIMS(x)[0])
  4466. 4466 || (CudaNdarray_HOST_DIMS(A)[1] != CudaNdarray_HOST_DIMS(y)[0])) {
  4467. 4467 PyErr_Format(PyExc_ValueError,
  4468. 4468 "dimension mismatch in args to sger (%i)x(%i)->(%i,%i)",
  4469. 4469 CudaNdarray_HOST_DIMS(x)[0],
  4470. 4470 CudaNdarray_HOST_DIMS(y)[0],
  4471. 4471 CudaNdarray_HOST_DIMS(A)[0],
  4472. 4472 CudaNdarray_HOST_DIMS(A)[1]);
  4473. 4473 return -1;
  4474. 4474 }
  4475. 4475
  4476. 4476 int x_strides = CudaNdarray_HOST_STRIDES(x)[0];
  4477. 4477 CudaNdarray * x_new = NULL;
  4478. 4478 if(x_strides == 0){
  4479. 4479 if(CudaNdarray_HOST_DIMS(x)[0] != 1){
  4480. 4480 PyErr_Format(PyExc_RuntimeError,
  4481. 4481 "CudaNdarray_sger: Invalid input x (should not happen)."
  4482. 4482 " We received a CudaNdarray vector with a stride of 0"
  4483. 4483 " that has more than 1 element!");
  4484. 4484 return -1;
  4485. 4485 }
  4486. 4486 x_strides = 1;
  4487. 4487 } else if(x_strides < 0){
  4488. 4488 x_new = (CudaNdarray*) CudaNdarray_Copy(x);
  4489. 4489 x = x_new;
  4490. 4490 x_strides = CudaNdarray_HOST_STRIDES(x)[0];
  4491. 4491 }
  4492. 4492
  4493. 4493 int y_strides = CudaNdarray_HOST_STRIDES(y)[0];
  4494. 4494 CudaNdarray * y_new = NULL;
  4495. 4495 if(y_strides == 0){
  4496. 4496 if(CudaNdarray_HOST_DIMS(y)[0] != 1){
  4497. 4497 PyErr_Format(PyExc_RuntimeError,
  4498. 4498 "CudaNdarray_sger: Invalid input y (should not happen)."
  4499. 4499 " We received a CudaNdarray vector with a stride of 0"
  4500. 4500 " that has more than 1 elements!");
  4501. 4501 Py_XDECREF(x_new);
  4502. 4502 return -1;
  4503. 4503 }
  4504. 4504 y_strides = 1;
  4505. 4505 } else if(y_strides < 0){
  4506. 4506 y_new = (CudaNdarray*) CudaNdarray_Copy(y);
  4507. 4507 y = y_new;
  4508. 4508 y_strides = CudaNdarray_HOST_STRIDES(y)[0];
  4509. 4509 }
  4510. 4510
  4511. 4511 // Create appropriate strides if A is a row or column vector
  4512. 4512 int sa_0 = (CudaNdarray_HOST_DIMS(A)[0] > 1) ? CudaNdarray_HOST_STRIDES(A)[0]
  4513. 4513 : CudaNdarray_HOST_DIMS(A)[1];
  4514. 4514 int sa_1 = (CudaNdarray_HOST_DIMS(A)[1] > 1) ? CudaNdarray_HOST_STRIDES(A)[1]
  4515. 4515 : CudaNdarray_HOST_DIMS(A)[0];
  4516. 4516
  4517. 4517 // This is important because we can end up not calling Sger at all
  4518. 4518 cublasStatus_t err = CUBLAS_STATUS_SUCCESS;
  4519. 4519 if(CudaNdarray_SIZE(A)){
  4520. 4520 // If A is in col-major
  4521. 4521 if ((CudaNdarray_HOST_DIMS(A)[0] <= 1)
  4522. 4522 || ((CudaNdarray_HOST_STRIDES(A)[0] == 1)
  4523. 4523 && (CudaNdarray_HOST_STRIDES(A)[1] > 0)))
  4524. 4524 {
  4525. 4525 err = cublasSger(handle, CudaNdarray_HOST_DIMS(x)[0], CudaNdarray_HOST_DIMS(y)[0], &alpha,
  4526. 4526 CudaNdarray_DEV_DATA(x), x_strides,
  4527. 4527 CudaNdarray_DEV_DATA(y), y_strides,
  4528. 4528 CudaNdarray_DEV_DATA(A), sa_1);
  4529. 4529 }
  4530. 4530 // Since Sger expects A in col-major, we invert x and y to fake this.
  4531. 4531 else if ((CudaNdarray_HOST_DIMS(A)[1] <= 1)
  4532. 4532 || ((CudaNdarray_HOST_STRIDES(A)[1] == 1)
  4533. 4533 && (CudaNdarray_HOST_STRIDES(A)[0] > 0)))
  4534. 4534 {
  4535. 4535 err = cublasSger(handle, CudaNdarray_HOST_DIMS(y)[0], CudaNdarray_HOST_DIMS(x)[0], &alpha,
  4536. 4536 CudaNdarray_DEV_DATA(y), y_strides,
  4537. 4537 CudaNdarray_DEV_DATA(x), x_strides,
  4538. 4538 CudaNdarray_DEV_DATA(A), sa_0);
  4539. 4539 }
  4540. 4540 // A has to be either c- or f-contiguous, with no negative strides
  4541. 4541 else
  4542. 4542 {
  4543. 4543 PyErr_SetString(PyExc_NotImplementedError,
  4544. 4544 "non-contiguous A, or negative strides, in sger");
  4545. 4545 Py_XDECREF(x_new);
  4546. 4546 Py_XDECREF(y_new);
  4547. 4547 return -1;
  4548. 4548 }
  4549. 4549 }
  4550. 4550 CNDA_THREAD_SYNC;
  4551. 4551 Py_XDECREF(x_new);
  4552. 4552 Py_XDECREF(y_new);
  4553. 4553
  4554. 4554 if (CUBLAS_STATUS_SUCCESS != err)
  4555. 4555 {
  4556. 4556 PyErr_Format(PyExc_RuntimeError,
  4557. 4557 "cublasSger failed (%i)",
  4558. 4558 err);
  4559. 4559 return -1;
  4560. 4560 }
  4561. 4561
  4562. 4562 return 0;
  4563. 4563 }
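// --- Illustrative usage sketch (added; not part of the original file) ---
// CudaNdarray_sger above performs the BLAS rank-1 update A <- alpha*x*y^T + A.
// The hypothetical helper below accumulates an outer product into an existing
// matrix, assuming shapes A:(m,n), x:(m,), y:(n,).
static int example_sger_usage(const CudaNdarray * x,
                              const CudaNdarray * y,
                              CudaNdarray * A)
{
    // A += 2.0f * x y^T; returns 0 on success, -1 with a Python error set.
    return CudaNdarray_sger(2.0f, x, y, A);
}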
  4564. 4564
  4565. 4565 /**
  4566. 4566 *
  4567. 4567 * Precondition:
  4568. 4568 * a->dim[d] == (dims_a[d]==0) ? (1 << log2_dims_a[d]) : dims_a[d]
  4569. 4569 * z->dim[d] == (z_str[d]==0) ? 1 : dims_a[d];
  4570. 4570 *
  4571. 4571 * TODO: templatize this function to support other reductions.
  4572. 4572 * All that needs to change is the initial value for sum, and the reduction operator.
  4573. 4573 */
  4574. 4574
  4575. 4575 static __global__ void kernel_reduce_sum(const unsigned int size_z,
  4576. 4576 const unsigned int nd,
  4577. 4577 const int * dims_a,
  4578. 4578 const int * log2_dims_a,
  4579. 4579 const int * a_str,
  4580. 4580 const float * a_data,
  4581. 4581 const int * z_str,
  4582. 4582 float * z_data)
  4583. 4583 {
  4584. 4584 const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  4585. 4585 const unsigned int numThreads = blockDim.x * gridDim.x;
  4586. 4586
  4587. 4587 //structure data contains the strides and dimensions of both a and z
  4588. 4588 // a_dim[0], a_dim[1], ... a_dim[nd-1],
  4589. 4589 // a_log2dim[0], a_log2dim[1], ... a_log2dim[nd-1],
  4590. 4590 // a_str[0], ... a_str[nd-1],
  4591. 4591 // z_str[0], ... z_str[nd-1]
  4592. 4592 extern __shared__ int structure_data[];
  4593. 4593 for (unsigned int i = threadIdx.x; i < nd; i += blockDim.x)
  4594. 4594 {
  4595. 4595 structure_data[i+0*nd] = dims_a[i];
  4596. 4596 structure_data[i+1*nd] = log2_dims_a[i];
  4597. 4597 structure_data[i+2*nd] = a_str[i];
  4598. 4598 structure_data[i+3*nd] = z_str[i];
  4599. 4599 }
  4600. 4600 dims_a = structure_data;
  4601. 4601 log2_dims_a = structure_data + nd;
  4602. 4602 a_str = structure_data + 2*nd;
  4603. 4603 z_str = structure_data + 3*nd;
  4604. 4604
  4605. 4605 __syncthreads(); //wait for all the shared structure to be loaded
  4606. 4606
  4607. 4607 for (unsigned int i = idx; i < size_z; i += numThreads)
  4608. 4608 {
  4609. 4609 unsigned int ii = i;
  4610. 4610 const float * a_data_i = a_data;
  4611. 4611 float * z_data_i = z_data;
  4612. 4612 unsigned int n_reduce_elements = 1;
  4613. 4613 unsigned int n_reduce_dims = 0;
  4614. 4614 unsigned int reduce_dim0 = nd-1;
  4615. 4615
  4616. 4616
  4617. 4617 //In this loop, we locate the initial element of the slice that we'd like to reduce with this thread
  4618. 4618 // At the same time, we [re]calculate the size of that slice (n_reduce_elements)
  4619. 4619 for (unsigned int d = 0; d < nd; ++d)
  4620. 4620 {
  4621. 4621 if (a_str[d] && (!z_str[d])) // this means 'd' is a dimension we are reducing over
  4622. 4622 {
  4623. 4623 n_reduce_elements *= dims_a[d];
  4624. 4624 n_reduce_dims += 1;
  4625. 4625 reduce_dim0 = (d < reduce_dim0) ? d : reduce_dim0;
  4626. 4626 }
  4627. 4627 else //'d' is not a dimension that we are reducing over
  4628. 4628 {
  4629. 4629 unsigned int pos_d;
  4630. 4630 if (log2_dims_a[d]==-1) //TODO: when things are working, use this switch
  4631. 4631 {
  4632. 4632 // this branch is not preferred,
  4633. 4633 // because the manual said that integer mod and div operations are slow on gpu
  4634. 4634 pos_d = (ii % dims_a[d]);
  4635. 4635 ii = (ii / dims_a[d]);
  4636. 4636 }
  4637. 4637 else
  4638. 4638 {
  4639. 4639 pos_d = (ii & ((1 << log2_dims_a[d])-1)); //take the lower log2_dims bits
  4640. 4640 ii = (ii >> log2_dims_a[d]); //shift those lower log2_dims bits off of ii
  4641. 4641 }
  4642. 4642 a_data_i += pos_d * a_str[d];
  4643. 4643 z_data_i += pos_d * z_str[d];
  4644. 4644 }
  4645. 4645 }
  4646. 4646 // now we've got pointers a_data_i and z_data_i into element 0 of the slice over which we are reducing
  4647. 4647 // do a similar loop
  4648. 4648
  4649. 4649 float sum = 0.0f;
  4650. 4650 switch(n_reduce_dims)
  4651. 4651 {
  4652. 4652 case 0:
  4653. 4653 {
  4654. 4654 sum = a_data_i[0];
  4655. 4655 }
  4656. 4656 break;
  4657. 4657 case 1:
  4658. 4658 {
  4659. 4659 const int stride = a_str[reduce_dim0];
  4660. 4660 const float * a_data_i_max = a_data_i + dims_a[reduce_dim0] * stride;
  4661. 4661 while (a_data_i != a_data_i_max)
  4662. 4662 {
  4663. 4663 sum += a_data_i[0];
  4664. 4664 a_data_i += stride;
  4665. 4665 }
  4666. 4666 }
  4667. 4667 break;
  4668. 4668 case 2:
  4669. 4669 {
  4670. 4670 int rd = reduce_dim0+1;
  4671. 4671 for (; rd < nd; ++rd)
  4672. 4672 {
  4673. 4673 if (a_str[rd] && (!z_str[rd])) // this means 'rd' is a dimension we are reducing over
  4674. 4674 break;
  4675. 4675 }
  4676. 4676 const int stride0 = a_str[reduce_dim0];
  4677. 4677 const int stride1 = a_str[rd];
  4678. 4678 for (int ii = 0; ii < dims_a[rd]; ++ii)
  4679. 4679 {
  4680. 4680 const float * a_data_ri = a_data_i + ii * stride1;
  4681. 4681 const float * a_data_ri_max = a_data_ri + dims_a[reduce_dim0] * stride0;
  4682. 4682 while (a_data_ri != a_data_ri_max)
  4683. 4683 {
  4684. 4684 sum += a_data_ri[0];
  4685. 4685 a_data_ri += stride0;
  4686. 4686 }
  4687. 4687 }
  4688. 4688 };
  4689. 4689 break;
  4690. 4690 default:
  4691. 4691 {
  4692. 4692 for (unsigned int reduce_i = 0; reduce_i < n_reduce_elements; ++reduce_i)
  4693. 4693 {
  4694. 4694 //TODO: optimize this loop to work more like theano's Elemwise. It's serial code.
  4695. 4695 unsigned int reduce_ii = reduce_i;
  4696. 4696 const float * a_data_ri = a_data_i;
  4697. 4697
  4698. 4698 //This loop finds the element in the a slice to add.
  4699. 4699 for (unsigned int rd = reduce_dim0; rd < nd; ++rd)
  4700. 4700 {
  4701. 4701 unsigned int pos_d;
  4702. 4702 if (a_str[rd] && (!z_str[rd])) // this means 'd' is a dimension we are reducing over
  4703. 4703 {
  4704. 4704 if (log2_dims_a[rd]==-1)
  4705. 4705 {
  4706. 4706 // this branch is not preferred,
  4707. 4707 // because the manual said that integer mod and div operations are slow on gpu
  4708. 4708 pos_d = (reduce_ii % dims_a[rd]);
  4709. 4709 reduce_ii = (reduce_ii / dims_a[rd]);
  4710. 4710 }
  4711. 4711 else
  4712. 4712 {
  4713. 4713 pos_d = (reduce_ii & ((1 << log2_dims_a[rd])-1)); //take the lower log2_dims bits
  4714. 4714 reduce_ii = (reduce_ii >> log2_dims_a[rd]); //shift those lower log2_dims bits off of ii
  4715. 4715 }
  4716. 4716 a_data_ri += pos_d * a_str[rd];
  4717. 4717 }
  4718. 4718 }
  4719. 4719 sum += a_data_ri[0];
  4720. 4720 }
  4721. 4721 }
  4722. 4722 }
  4723. 4723 z_data_i[0] = sum;
  4724. 4724 }
  4725. 4725 }
  4726. 4726
  4727. 4727 static __global__ void kernel_reduce_sum_1011(
  4728. 4728 const unsigned int d0,
  4729. 4729 const unsigned int d1,
  4730. 4730 const unsigned int d2,
  4731. 4731 const unsigned int d3,
  4732. 4732 const float *A, const int sA0, const int sA1, const int sA2, const int sA3,
  4733. 4733 float * Z, const int sZ0)
  4734. 4734 {
  4735. 4735 const int threadCount = blockDim.x * blockDim.y * blockDim.z;
  4736. 4736 const int threadNum = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x;
  4737. 4737 extern __shared__ float buf[];
  4738. 4738 float mysum = 0.0f;
  4739. 4739
  4740. 4740 if (warpSize != 32)
  4741. 4741 {
  4742. 4742 return; //TODO: set error code
  4743. 4743 }
  4744. 4744
  4745. 4745 for (int i0 = threadIdx.z; i0 < d0; i0 += blockDim.z)
  4746. 4746 {
  4747. 4747 float Ai = A[i0 * sA0 + blockIdx.x * sA1 + threadIdx.y * sA2 + threadIdx.x * sA3];
  4748. 4748 mysum += Ai;
  4749. 4749 }
  4750. 4750 buf[threadNum] = mysum;
  4751. 4751 __syncthreads();
  4752. 4752
  4753. 4753 // rest of function is handled by one warp
  4754. 4754 if (threadNum < warpSize)
  4755. 4755 {
  4756. 4756 for (int i = threadNum + warpSize; i < threadCount; i += warpSize)
  4757. 4757 {
  4758. 4758 mysum += buf[i];
  4759. 4759 }
  4760. 4760 buf[threadNum] = mysum;
  4761. 4761 if (threadNum < 16)
  4762. 4762 {
  4763. 4763 //reduce so that threadNum 0 has the sum of everything
  4764. 4764 if(threadNum + 16 < threadCount) buf[threadNum] += buf[threadNum+16];
  4765. 4765 if(threadNum + 8 < threadCount) buf[threadNum] += buf[threadNum+8];
  4766. 4766 if(threadNum + 4 < threadCount) buf[threadNum] += buf[threadNum+4];
  4767. 4767 if(threadNum + 2 < threadCount) buf[threadNum] += buf[threadNum+2];
  4768. 4768 if(threadNum + 1 < threadCount) buf[threadNum] += buf[threadNum+1];
  4769. 4769 if (threadNum == 0)
  4770. 4770 {
  4771. 4771 Z[blockIdx.x*sZ0] = buf[0];
  4772. 4772 }
  4773. 4773 }
  4774. 4774 }
  4775. 4775 }
  4776. 4776 /**
  4777. 4777 * Dimensions in which self has size 1 and A has size > 1 are considered summing dimensions
  4778. 4778 * Dimensions in which self has size > 1 and A has size > 1 are considered non-summing dimensions, and in this case their sizes must be equal.
  4779. 4779 */
  4780. 4780 int
  4781. 4781 CudaNdarray_reduce_sum(CudaNdarray * self, CudaNdarray * A)
  4782. 4782 {
  4783. 4783 int verbose = 0;
  4784. 4784 //check input rank
  4785. 4785 if (self->nd != A->nd)
  4786. 4786 {
  4787. 4787 PyErr_Format(PyExc_TypeError, "Rank mismatch in CudaNdarray_sum: %i vs %i", self->nd, A->nd);
  4788. 4788 return -1;
  4789. 4789 }
  4790. 4790 for (int i = 0; i < self->nd; ++i)
  4791. 4791 {
  4792. 4792 if ((CudaNdarray_HOST_DIMS(self)[i] > 1) && (CudaNdarray_HOST_DIMS(self)[i] != CudaNdarray_HOST_DIMS(A)[i]))
  4793. 4793 {
  4794. 4794 PyErr_Format(PyExc_TypeError, "Dimension mismatch in CudaNdarray_sum: self->dim[%i] == %i , A->dim[%i] = %i",
  4795. 4795 i, CudaNdarray_HOST_DIMS(self)[i], i, CudaNdarray_HOST_DIMS(A)[i]);
  4796. 4796 return -1;
  4797. 4797 }
  4798. 4798 }
  4799. 4799
  4800. 4800 int n_summations = (unsigned int)CudaNdarray_SIZE(self);
  4801. 4801 if (verbose)
  4802. 4802 {
  4803. 4803 std::cerr << "reduce_sum n_summations " << n_summations << '\n';
  4804. 4804 std::cerr << "reduce_sum nd " << self->nd << '\n';
  4805. 4805 fprint_CudaNdarray(stderr, A);
  4806. 4806 fprint_CudaNdarray(stderr, self);
  4807. 4807 }
  4808. 4808 if (0 && (A->nd == 4) //specialized kernel_reduce_sum_1011 path (currently disabled by the leading '0 &&')
  4809. 4809 && (CudaNdarray_HOST_DIMS(self)[0] == 1)
  4810. 4810 && (CudaNdarray_HOST_DIMS(self)[2] == 1)
  4811. 4811 && (CudaNdarray_HOST_DIMS(self)[3] == 1)
  4812. 4812 )
  4813. 4813 {
  4814. 4814 dim3 n_threads(CudaNdarray_HOST_DIMS(A)[3], CudaNdarray_HOST_DIMS(A)[2]);
  4815. 4815 dim3 n_blocks(CudaNdarray_HOST_DIMS(A)[1]);
  4816. 4816 while (n_threads.x * n_threads.y * n_threads.z < NUM_VECTOR_OP_THREADS_PER_BLOCK) ++n_threads.z;
  4817. 4817 n_threads.z -= 1;
  4818. 4818 if (n_threads.z > 64) n_threads.z = 64;
  4819. 4819 if (n_threads.z)
  4820. 4820 {
  4821. 4821 if (verbose) printf("trying kernel_reduce_sum_1011\n");
  4822. 4822 int n_shared = sizeof(float) * n_threads.x * n_threads.y * n_threads.z;
  4823. 4823 kernel_reduce_sum_1011<<<n_blocks, n_threads, n_shared>>>(
  4824. 4824 CudaNdarray_HOST_DIMS(A)[0],
  4825. 4825 CudaNdarray_HOST_DIMS(A)[1],
  4826. 4826 CudaNdarray_HOST_DIMS(A)[2],
  4827. 4827 CudaNdarray_HOST_DIMS(A)[3],
  4828. 4828 CudaNdarray_DEV_DATA(A),
  4829. 4829 CudaNdarray_HOST_STRIDES(A)[0],
  4830. 4830 CudaNdarray_HOST_STRIDES(A)[1],
  4831. 4831 CudaNdarray_HOST_STRIDES(A)[2],
  4832. 4832 CudaNdarray_HOST_STRIDES(A)[3],
  4833. 4833 CudaNdarray_DEV_DATA(self),
  4834. 4834 CudaNdarray_HOST_STRIDES(self)[1]);
  4835. 4835 CNDA_THREAD_SYNC;
  4836. 4836 if (cudaSuccess == cudaGetLastError()) return 0;
  4837. 4837 if (verbose) printf("failed, falling back to kernel_reduce_sum\n");
  4838. 4838 }
  4839. 4839 }
  4840. 4840
  4841. 4841 int n_threads_per_block = std::min(n_summations,
  4842. 4842 NUM_VECTOR_OP_THREADS_PER_BLOCK);
  4843. 4843 int n_blocks = std::min(ceil_intdiv(n_summations,n_threads_per_block),
  4844. 4844 NUM_VECTOR_OP_BLOCKS);
  4845. 4845 int n_structure_cache = self->nd * 4 * sizeof(int);
  4846. 4846
  4847. 4847 if (verbose)
  4848. 4848 {
  4849. 4849 std::cerr << "n_blocks, n_threads_per_block " << n_blocks << ' ' << n_threads_per_block << '\n';
  4850. 4850 }
  4851. 4851 assert (self->nd > 0);
  4852. 4852 assert (self->nd == A->nd);
  4853. 4853 kernel_reduce_sum<<<n_blocks, n_threads_per_block, n_structure_cache>>>(
  4854. 4854 n_summations,
  4855. 4855 self->nd,
  4856. 4856 CudaNdarray_DEV_DIMS(A),
  4857. 4857 CudaNdarray_DEV_LOG2DIMS(A),
  4858. 4858 CudaNdarray_DEV_STRIDES(A),
  4859. 4859 CudaNdarray_DEV_DATA(A),
  4860. 4860 CudaNdarray_DEV_STRIDES(self),
  4861. 4861 CudaNdarray_DEV_DATA(self));
  4862. 4862 CNDA_THREAD_SYNC;
  4863. 4863 cudaError_t err = cudaGetLastError();
  4864. 4864 if (cudaSuccess != err)
  4865. 4865 {
  4866. 4866 PyErr_Format(PyExc_RuntimeError, "Cuda error: %s: %s.\n", "kernel_reduce_sum", cudaGetErrorString(err));
  4867. 4867 return -1;
  4868. 4868 }
  4869. 4869 return 0;
  4870. 4870 }
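// --- Illustrative usage sketch (added; not part of the original file) ---
// CudaNdarray_reduce_sum above sums A into self over every dimension where
// self has size 1 and A has size > 1 (both must have the same rank). As a
// hypothetical example, with A of shape (16, 32) and self of shape (1, 32),
// the call below sums over axis 0, producing one total per column.
static int example_reduce_sum_usage(CudaNdarray * self, CudaNdarray * A)
{
    // self must already be allocated with size-1 dims on the reduced axes;
    // returns 0 on success, -1 with a Python exception set on error.
    return CudaNdarray_reduce_sum(self, A);
}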
  4871. 4871 int
  4872. 4872 CudaNdarray_reduce_prod(CudaNdarray * self, const CudaNdarray * A)
  4873. 4873 {
  4874. 4874 PyErr_SetString(PyExc_NotImplementedError, "");
  4875. 4875 return -1;
  4876. 4876 }
  4877. 4877 int
  4878. 4878 CudaNdarray_reduce_min(CudaNdarray * self, const CudaNdarray * A)
  4879. 4879 {
  4880. 4880 PyErr_SetString(PyExc_NotImplementedError, "");
  4881. 4881 return -1;
  4882. 4882 }
  4883. 4883 int
  4884. 4884 CudaNdarray_reduce_max(CudaNdarray * self, const CudaNdarray * A)
  4885. 4885 {
  4886. 4886 PyErr_SetString(PyExc_NotImplementedError, "");
  4887. 4887 return -1;
  4888. 4888 }
  4889. 4889
  4890. 4890
  4891. 4891 /**
  4892. 4892 *
  4893. 4893 * pattern is a permutation of [0, 1, ... self->nd-1] with the following twists:
  4894. 4894 * - an element 'd' of the permutation can be dropped if CudaNdarray_HOST_DIMS(self)[d] == 1
  4895. 4895 * - any number of '-1' elements can be in the pattern, and they will cause new ranks (with dim==1) to be inserted.
  4896. 4896 *
  4897. 4897 * For example, if CudaNdarray_HOST_DIMS(self) == [4, 5, 1, 6], and pattern = [0,3,-1,-1, 1], then CudaNdarray_HOST_DIMS(self) would be modified to become:
  4898. 4898 * [4, 6, 1, 1, 5] (we dropped the original dim[2]==1, and inserted two singleton dimensions with the -1s).
  4899. 4899 */
  4900. 4900 int
  4901. 4901 CudaNdarray_dimshuffle(CudaNdarray * self, unsigned int len, const int * pattern)
  4902. 4902 {
  4903. 4903 //TODO: pass a workspace pointer to avoid the internal malloc
  4904. 4904 int * newdims = (int *)malloc(sizeof(int) * (len + len + self->nd)); //we tack the dims_taken buffer on here so we only have to malloc once.
  4905. 4905 int * newstrides = newdims + len;
  4906. 4906 int * dims_taken = newstrides + len;
  4907. 4907 if (!newdims)
  4908. 4908 {
  4909. 4909 PyErr_SetString(PyExc_MemoryError, "CudaNdarray_dimshuffle: Failed to allocate temporary space");
  4910. 4910 return -1;
  4911. 4911 }
  4912. 4912 for (int i = 0; i < self->nd; ++i)
  4913. 4913 {
  4914. 4914 dims_taken[i] = 0;
  4915. 4915 }
  4916. 4916 for (int i = 0; i < len; ++i)
  4917. 4917 {
  4918. 4918 if (pattern[i] < 0)
  4919. 4919 {
  4920. 4920 newdims[i] = 1;
  4921. 4921 newstrides[i] = 0;
  4922. 4922 }
  4923. 4923 else if(dims_taken[pattern[i]])
  4924. 4924 {
  4925. 4925 PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern. You used dimension %d multiple times",
  4926. 4926 pattern[i]);
  4927. 4927 free(newdims);
  4928. 4928 return -1;
  4929. 4929 }
  4930. 4930 else if (pattern[i]>= self->nd)
  4931. 4931 {
  4932. 4932 PyErr_Format(PyExc_ValueError, "CudaNdarray_dimshuffle: invalid pattern. You asked for dimension %d, which does not exist in a %d-dimensional CudaNdarray",
  4933. 4933 pattern[i], self->nd);
  4934. 4934 free(newdims);
  4935. 4935 return -1;
  4936. 4936 }
  4937. 4937 else
  4938. 4938 {
  4939. 4939 newdims[i] = CudaNdarray_HOST_DIMS(self)[pattern[i]];
  4940. 4940 newstrides[i] = CudaNdarray_HOST_STRIDES(self)[pattern[i]];
  4941. 4941 dims_taken[pattern[i]] = 1;
  4942. 4942 }
  4943. 4943 }
  4944. 4944 //Check that we did not drop any non-broadcastable dims
  4945. 4945 for (int i = 0; i < self->nd; ++i)
  4946. 4946 {
  4947. 4947 if (dims_taken[i]==0 && CudaNdarray_HOST_DIMS(self)[i]!=1)
  4948. 4948 {
  4949. 4949 PyErr_SetString(PyExc_ValueError, "CudaNdarray_dimshuffle: You cannot drop a non-broadcastable dimension.");
  4950. 4950 free(newdims);
  4951. 4951 return -1;
  4952. 4952 }
  4953. 4953 }
  4954. 4954 //swap this structure in for the one in self, and sync to the card
  4955. 4955 if (CudaNdarray_set_nd(self, len))
  4956. 4956 {
  4957. 4957 free(newdims);
  4958. 4958 return -1;
  4959. 4959 }
  4960. 4960 for (int i = 0; i < len; ++i)
  4961. 4961 {
  4962. 4962 CudaNdarray_set_dim(self, i, newdims[i]);
  4963. 4963 CudaNdarray_set_stride(self, i, newstrides[i]);
  4964. 4964 }
  4965. 4965 if (cnda_copy_structure_to_device(self))
  4966. 4966 {
  4967. 4967 free(newdims);
  4968. 4968 return -1;
  4969. 4969 }
  4970. 4970 free(newdims);
  4971. 4971 return 0;
  4972. 4972 }
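// --- Illustrative usage sketch (added; not part of the original file) ---
// CudaNdarray_dimshuffle above rewrites self's dims/strides in place following
// the pattern semantics described in its doc comment (-1 inserts a new
// broadcastable dimension). Reproducing that comment's example: dims
// (4, 5, 1, 6) with pattern [0, 3, -1, -1, 1] become (4, 6, 1, 1, 5).
static int example_dimshuffle_usage(CudaNdarray * self)
{
    const int pattern[5] = {0, 3, -1, -1, 1};
    // Returns 0 on success; on error a Python exception is set and -1 returned.
    return CudaNdarray_dimshuffle(self, 5, pattern);
}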
  4973. 4973
  4974. 4974
  4975. 4975
  4976. 4976 /**
  4977. 4977 *
  4978. 4978 * This is the function that binds to Python.
  4979. 4979 * See CudaNdarray_dimshuffle to call it from C.
  4980. 4980 * We use -1 to mean 'x' as in Tensor Dimshuffle.
  4981. 4981 */
  4982. 4982 PyObject *
  4983. 4983 CudaNdarray_Dimshuffle(PyObject* _unused, PyObject* args)
  4984. 4984 {
  4985. 4985 PyObject * self = NULL;
  4986. 4986 PyObject * pattern_object = NULL;
  4987. 4987 int * pattern = NULL;
  4988. 4988 PyObject * rval = NULL;
  4989. 4989 int success = -1;
  4990. 4990 //const int * dims = NULL;
  4991. 4991
  4992. 4992 //args should consist of two python objects ("OO")
  4993. 4993 if (! PyArg_ParseTuple(args, "OO", &self, &pattern_object))
  4994. 4994 return NULL;
  4995. 4995
  4996. 4996 if (!CudaNdarray_Check(self) )
  4997. 4997 {
  4998. 4998 PyErr_SetString(PyExc_TypeError, "First argument to cuda_ndarray.dimshuffle must be a CudaNdarray");
  4999. 4999 return NULL;
  5000. 5000 }
  5001. 5001
  5002. 5002 //parse pattern_object into int * pattern
  5003. 5003
  5004. 5004 Py_ssize_t pattern_dim = PyObject_Length(pattern_object);
  5005. 5005
  5006. 5006 if (pattern_dim < 0)
  5007. 5007 {
  5008. 5008 PyErr_SetString(PyExc_TypeError, "Couldn't get length of second argument to cuda_ndarray.dimshuffle");
  5009. 5009 return NULL;
  5010. 5010 }
  5011. 5011
  5012. 5012 pattern = (int *) malloc( pattern_dim * sizeof(int));
  5013. 5013
  5014. 5014 for (Py_ssize_t i = 0; i < pattern_dim; i++)
  5015. 5015 {
  5016. 5016 PyObject * idx = PyLong_FromLong(i);
  5017. 5017
  5018. 5018 if (idx == NULL)
  5019. 5019 {
  5020. 5020 PyErr_SetString(PyExc_Exception, "Couldn't make long object to loop over list/tuple");
  5021. 5021 goto CudaNdarray_dimshuffle_fail;
  5022. 5022 }
  5023. 5023
  5024. 5024 long elem_value = 0;
  5025. 5025
  5026. 5026 PyObject * elem = PyObject_GetItem(pattern_object, idx);
  5027. 5027
  5028. 5028 if (elem == NULL)
  5029. 5029 {
  5030. 5030 Py_XDECREF( elem);
  5031. 5031 PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
  5032. 5032 goto CudaNdarray_dimshuffle_fail;
  5033. 5033 }
  5034. 5034
  5035. 5035 elem_value = PyInt_AsLong(elem);
  5036. 5036
  5037. 5037 if (elem_value == -1 && PyErr_Occurred() )
  5038. 5038 {
  5039. 5039 Py_XDECREF(elem);
  5040. 5040 PyErr_SetString(PyExc_ValueError, "Second argument to dimshuffle must be a list or tuple of integers");
  5041. 5041 goto CudaNdarray_dimshuffle_fail;
  5042. 5042 }
  5043. 5043
  5044. 5044 pattern[i] = elem_value;
  5045. 5045
  5046. 5046 Py_XDECREF( elem );
  5047. 5047 Py_XDECREF( idx );
  5048. 5048 }
  5049. 5049
  5050. 5050 //allocate rval
  5051. 5051 rval = (PyObject *) CudaNdarray_View((CudaNdarray *) self);
  5052. 5052
  5053. 5053 if (rval == NULL)
  5054. 5054 {
  5055. 5055 //CudaNdarray_View should have set the exception string
  5056. 5056 goto CudaNdarray_dimshuffle_fail;
  5057. 5057 }
  5058. 5058
  5059. 5059
  5060. 5060 //printf("pattern_dim: %d\n",pattern_dim);
  5061. 5061 //printf("pattern: %d %d\n",pattern[0],pattern[1]);
  5062. 5062 //dims = CudaNdarray_HOST_DIMS( (CudaNdarray *) self);
  5063. 5063 //printf("dims before: %d %d\n",dims[0],dims[1]);
  5064. 5064
  5065. 5065 success = CudaNdarray_dimshuffle((CudaNdarray *) rval, pattern_dim, pattern);
  5066. 5066
  5067. 5067 if (success != 0)
  5068. 5068 {
  5069. 5069 //Exception string should already be set by CudaNdarray_dimshuffle
  5070. 5070 goto CudaNdarray_dimshuffle_fail;
  5071. 5071 }
  5072. 5072
  5073. 5073 free(pattern);
  5074. 5074
  5075. 5075 return rval;
  5076. 5076
  5077. 5077 CudaNdarray_dimshuffle_fail:
  5078. 5078
  5079. 5079 if (pattern != NULL)
  5080. 5080 free(pattern);
  5081. 5081
  5082. 5082 Py_XDECREF(rval);
  5083. 5083 return NULL;
  5084. 5084 }
  5085. 5085
  5086. 5086
  5087. 5087 int
  5088. 5088 cnda_structure_size(int nd)
  5089. 5089 {
  5090. 5090 // dim0, dim1, ...
  5091. 5091 // str0, str1, ...
  5092. 5092 // log2(dim0), log2(dim1), ...
  5093. 5093 return nd + nd + nd;
  5094. 5094 }
  5095. 5095
  5096. 5096 const int *
  5097. 5097 CudaNdarray_HOST_DIMS(const CudaNdarray * self)
  5098. 5098 {
  5099. 5099 return self->host_structure;
  5100. 5100 }
  5101. 5101
  5102. 5102 const int *
  5103. 5103 CudaNdarray_HOST_STRIDES(const CudaNdarray * self)
  5104. 5104 {
  5105. 5105 return self->host_structure + self->nd;
  5106. 5106 }
  5107. 5107 const int *
  5108. 5108 CudaNdarray_HOST_LOG2DIMS(const CudaNdarray * self)
  5109. 5109 {
  5110. 5110 return self->host_structure + 2*self->nd;
  5111. 5111 }
  5112. 5112
  5113. 5113 int
  5114. 5114 CudaNdarray_EqualAndIgnore(CudaNdarray *cnda1, CudaNdarray *cnda2, int ignoreSync, int ignoreBase)
  5115. 5115 {
  5116. 5116 int verbose = 0;
  5117. 5117
  5118. 5118 if (!ignoreSync && cnda1->dev_structure_fresh != cnda2->dev_structure_fresh)
  5119. 5119 {
  5120. 5120 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 1\n");
  5121. 5121 return 0;
  5122. 5122 }
  5123. 5123
  5124. 5124 if (cnda1->nd != cnda2->nd)
  5125. 5125 {
  5126. 5126 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 2\n");
  5127. 5127 return 0;
  5128. 5128 }
  5129. 5129
  5130. 5130 for (int i=0; i < 2*cnda1->nd; i++)
  5131. 5131 {
  5132. 5132 if (cnda1->host_structure[i] != cnda2->host_structure[i])
  5133. 5133 {
  5134. 5134 if(verbose)
  5135. 5135 fprintf(stdout, "CUDANDARRAY_EQUAL : host_structure : %d, %d, %d\n", i, cnda1->host_structure[i], cnda2->host_structure[i]);
  5136. 5136 return 0;
  5137. 5137 }
  5138. 5138 }
  5139. 5139
  5140. 5140 if (!ignoreBase && cnda1->base != cnda2->base)
  5141. 5141 {
  5142. 5142 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 4");
  5143. 5143 return 0;
  5144. 5144 }
  5145. 5145 else if (cnda1->data_allocated != cnda2->data_allocated)
  5146. 5146 {
  5147. 5147 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 5");
  5148. 5148 return 0;
  5149. 5149 }
  5150. 5150 else if (cnda1->data_allocated && cnda1->devdata != cnda2->devdata)
  5151. 5151 {
  5152. 5152 if(verbose) fprintf(stdout, "CUDANDARRAY_EQUAL FAILED : 6");
  5153. 5153 // no need to check devdata if data is not allocated
  5154. 5154 return 0;
  5155. 5155 }
  5156. 5156
  5157. 5157 return 1;
  5158. 5158 }
  5159. 5159
  5160. 5160
  5161. 5161 int
  5162. 5162 CudaNdarray_Equal(CudaNdarray *cnda1, CudaNdarray *cnda2)
  5163. 5163 {
  5164. 5164 return CudaNdarray_EqualAndIgnore(cnda1, cnda2, 0, 0);
  5165. 5165 }
  5166. 5166
  5167. 5167 int
  5168. 5168 cnda_copy_structure_to_device(const CudaNdarray * self)
  5169. 5169 {
  5170. 5170 //If the device structure does not exist, create it.
  5171. 5171 //We allocate it here because we do not need it often.
  5172. 5172 //In fact, we need it so infrequently that we expect
  5173. 5173 //that most objects won't need it. Not allocating it
  5174. 5174 //saves a significant amount of time when creating objects.
  5175. 5175 //This sped up a benchmark by 8% with the gc.
  5176. 5176 if (!self->dev_structure)
  5177. 5177 {
  5178. 5178 int struct_size = cnda_structure_size(self->nd);
  5179. 5179 if (struct_size)
  5180. 5180 {
  5181. 5181 self->dev_structure = (int*)device_malloc(struct_size* sizeof(int));
  5182. 5182 if (NULL == self->dev_structure)
  5183. 5183 {
  5184. 5184 return -1;
  5185. 5185 }
  5186. 5186 }
  5187. 5187 }
  5188. 5188 if (cublasSetVector(cnda_structure_size(self->nd),
  5189. 5189 sizeof(int),
  5190. 5190 self->host_structure,
  5191. 5191 1,
  5192. 5192 self->dev_structure,
  5193. 5193 1) != CUBLAS_STATUS_SUCCESS)
  5194. 5194 {
  5195. 5195 PyErr_SetString(PyExc_RuntimeError, "error copying structure to device memory");
  5196. 5196 return -1;
  5197. 5197 }
  5198. 5198 self->dev_structure_fresh = 1;
  5199. 5199 return 0;
  5200. 5200 }
  5201. 5201
  5202. 5202 const int *
  5203. 5203 CudaNdarray_DEV_DIMS(const CudaNdarray * self)
  5204. 5204 {
  5205. 5205 if (!self->dev_structure_fresh)
  5206. 5206 {
  5207. 5207 if (cnda_copy_structure_to_device(self))
  5208. 5208 return NULL;
  5209. 5209 }
  5210. 5210 return self->dev_structure;
  5211. 5211 }
  5212. 5212 const int *
  5213. 5213 CudaNdarray_DEV_STRIDES(const CudaNdarray * self)
  5214. 5214 {
  5215. 5215 if (!self->dev_structure_fresh)
  5216. 5216 {
  5217. 5217 if (cnda_copy_structure_to_device(self))
  5218. 5218 return NULL;
  5219. 5219 }
  5220. 5220 return self->dev_structure + self->nd;
  5221. 5221 }
  5222. 5222 const int *
  5223. 5223 CudaNdarray_DEV_LOG2DIMS(const CudaNdarray * self)
  5224. 5224 {
  5225. 5225 if (!self->dev_structure_fresh)
  5226. 5226 {
  5227. 5227 if (cnda_copy_structure_to_device(self))
  5228. 5228 return NULL;
  5229. 5229 }
  5230. 5230 return self->dev_structure + 2*self->nd;
  5231. 5231 }
  5232. 5232 float *
  5233. 5233 CudaNdarray_DEV_DATA(const CudaNdarray * self)
  5234. 5234 {
  5235. 5235 return self->devdata;
  5236. 5236 }
  5237. 5237
  5238. 5238 /**
  5239. 5239 * Return the number of elements in the ndarray (product of the dimensions)
  5240. 5240 */
  5241. 5241 size_t
  5242. 5242 CudaNdarray_SIZE(const CudaNdarray *self)
  5243. 5243 {
  5244. 5244 if (self->nd == -1) return 0;
  5245. 5245 size_t size = 1;
  5246. 5246 for (int i = 0; i < self->nd; ++i)
  5247. 5247 {
  5248. 5248 size *= CudaNdarray_HOST_DIMS(self)[i];
  5249. 5249 }
  5250. 5250 return size;
  5251. 5251 }
  5252. 5252
  5253. 5253 PyObject *
  5254. 5254 CudaNdarray_SIZE_Object(const CudaNdarray *self, void *closure)
  5255. 5255 {
  5256. 5256 return PyInt_FromLong(CudaNdarray_SIZE(self));
  5257. 5257 }
  5258. 5258
  5259. 5259 int CudaNdarray_set_device_data(CudaNdarray * self, float * data, const CudaNdarray * base)
  5260. 5260 {
  5261. 5261 return CudaNdarray_set_device_data(self, data, (PyObject *) base);
  5262. 5262 }
  5263. 5263
  5264. 5264 PyObject * CudaNdarray_IS_C_Contiguous(CudaNdarray * self)
  5265. 5265 {
  5266. 5266 return PyBool_FromLong(CudaNdarray_is_c_contiguous(self));
  5267. 5267 }
  5268. 5268
  5269. 5269 int fprint_CudaNdarray(FILE * fd, const CudaNdarray *self)
  5270. 5270 {
  5271. 5271 cudaError_t err = cudaGetLastError();
  5272. 5272 if( cudaSuccess != err)
  5273. 5273 {
  5274. 5274 PyErr_Format(PyExc_RuntimeError,
  5275. 5275 "Cuda error: %s: %s.",
  5276. 5276 "fprint_CudaNdarray was called with an uncleared error",
  5277. 5277 cudaGetErrorString(err));
  5278. 5278 return -1;
  5279. 5279 }
  5280. 5280 fprintf(fd, "CudaNdarray <%p, %p> nd=%i dev_structure_fresh=%d data_allocated=%d\n",
  5281. 5281 self, self->devdata, self->nd, self->dev_structure_fresh, self->data_allocated);
  5282. 5282 fprintf(fd, "\tHOST_DIMS: ");
  5283. 5283 for (int i = 0; i < self->nd; ++i)
  5284. 5284 {
  5285. 5285 fprintf(fd, "%i\t", CudaNdarray_HOST_DIMS(self)[i]);
  5286. 5286 }
  5287. 5287 fprintf(fd, "\n\tHOST_STRIDES: ");
  5288. 5288 for (int i = 0; i < self->nd; ++i)
  5289. 5289 {
  5290. 5290 fprintf(fd, "%i\t", CudaNdarray_HOST_STRIDES(self)[i]);
  5291. 5291 }
  5292. 5292
  5293. 5293 if (self->dev_structure)
  5294. 5294 {
  5295. 5295 int data=0;
  5296. 5296 fprintf(fd, "\n\tDEV_DIMS: ");
  5297. 5297 for (int i = 0; i < self->nd; ++i)
  5298. 5298 {
  5299. 5299 cublasGetVector(1, sizeof(int),
  5300. 5300 self->dev_structure+i, 1,
  5301. 5301 &data, 1);
  5302. 5302 fprintf(fd, "%i\t", data);
  5303. 5303 }
  5304. 5304 fprintf(fd, "\n\tDEV_STRIDES: ");
  5305. 5305 for (int i = 0; i < self->nd; ++i)
  5306. 5306 {
  5307. 5307 cublasGetVector(1, sizeof(int),
  5308. 5308 self->dev_structure + self->nd+i, 1,
  5309. 5309 &data, 1);
  5310. 5310 fprintf(fd, "%i \t", data);
  5311. 5311 }
  5312. 5312 fprintf(fd, "\n");
  5313. 5313 }
  5314. 5314 else
  5315. 5315 {
  5316. 5316 fprintf(fd, "\n\tdev_structure not allocated\n");
  5317. 5317 }
  5318. 5318
  5319. 5319 err = cudaGetLastError();
  5320. 5320 if( cudaSuccess != err)
  5321. 5321 {
  5322. 5322 PyErr_Format(PyExc_RuntimeError,
  5323. 5323 "Cuda error: %s: %s.",
  5324. 5324 "fprint_CudaNdarray",
  5325. 5325 cudaGetErrorString(err));
  5326. 5326 return -1;
  5327. 5327 }
  5328. 5328 return 0;
  5329. 5329 }
  5330. 5330
  5331. 5331
  5332. 5332 int CudaNdarray_prep_output(CudaNdarray ** arr, int nd,
  5333. 5333 const int * dims, int fortran)
  5334. 5334 {
  5335. 5335 bool allocated = false;
  5336. 5336 if (*arr == NULL)
  5337. 5337 {
  5338. 5338 // This allocates the metadata but not the data
  5339. 5339 *arr = (CudaNdarray *) CudaNdarray_new_nd(nd);
  5340. 5340 if (*arr == NULL)
  5341. 5341 return -1;
  5342. 5342 allocated = true;
  5343. 5343 }
  5344. 5344
  5345. 5345 if (CudaNdarray_alloc_contiguous(*arr, nd, dims, fortran))
  5346. 5346 {
  5347. 5347 if (allocated)
  5348. 5348 {
  5349. 5349 Py_DECREF(*arr);
  5350. 5350 *arr = NULL;
  5351. 5351 }
  5352. 5352 return -1;
  5353. 5353 }
  5354. 5354 return 0;
  5355. 5355 }
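// --- Illustrative usage sketch (added; not part of the original file) ---
// CudaNdarray_prep_output above either reuses *arr (resizing its buffer) or,
// when *arr is NULL, allocates a fresh contiguous CudaNdarray. A hypothetical
// caller preparing a C-contiguous 2-D output of shape (128, 64):
static int example_prep_output_usage(CudaNdarray ** out)
{
    const int dims[2] = {128, 64};
    // fortran == 0 requests C order; returns 0 on success. On failure an
    // array allocated inside the call is released and *out reset to NULL.
    return CudaNdarray_prep_output(out, 2, dims, 0);
}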
  5356. 5356
  5357. 5357
  5358. 5358 /*
  5359. 5359 Local Variables:
  5360. 5360 mode:c++
  5361. 5361 c-basic-offset:4
  5362. 5362 c-file-style:"stroustrup"
  5363. 5363 indent-tabs-mode:nil
  5364. 5364 fill-column:79
  5365. 5365 End:
  5366. 5366 */
  5367. 5367 // vim: filetype=cpp:expandtab:shiftwidth=4:tabstop=8:softtabstop=4:textwidth=79 :
  5368. 5368
  5369. ===============================
  5370. mod.cu(941): warning: pointless comparison of unsigned integer with zero
  5371. mod.cu(3001): warning: conversion from a string literal to "char *" is deprecated
  5372. mod.cu(3004): warning: conversion from a string literal to "char *" is deprecated
  5373. mod.cu(3006): warning: conversion from a string literal to "char *" is deprecated
  5374. mod.cu(3009): warning: conversion from a string literal to "char *" is deprecated
  5375. mod.cu(3011): warning: conversion from a string literal to "char *" is deprecated
  5376. mod.cu(3014): warning: conversion from a string literal to "char *" is deprecated
  5377. mod.cu(3017): warning: conversion from a string literal to "char *" is deprecated
  5378. mod.cu(3020): warning: conversion from a string literal to "char *" is deprecated
  5379. mod.cu(3022): warning: conversion from a string literal to "char *" is deprecated
  5380. mod.cu(3025): warning: conversion from a string literal to "char *" is deprecated
  5381. mod.cu(3027): warning: conversion from a string literal to "char *" is deprecated
  5382. mod.cu(3030): warning: conversion from a string literal to "char *" is deprecated
  5383. mod.cu(3032): warning: conversion from a string literal to "char *" is deprecated
  5384. mod.cu(3035): warning: conversion from a string literal to "char *" is deprecated
  5385. mod.cu(3038): warning: conversion from a string literal to "char *" is deprecated
  5386. mod.cu(3041): warning: conversion from a string literal to "char *" is deprecated
  5387. mod.cu(3043): warning: conversion from a string literal to "char *" is deprecated
  5388. mod.cu(3046): warning: conversion from a string literal to "char *" is deprecated
  5389. mod.cu(3048): warning: conversion from a string literal to "char *" is deprecated
  5390. mod.cu(3051): warning: conversion from a string literal to "char *" is deprecated
  5412. /usr/include/string.h: In function ‘void* __mempcpy_inline(void*, const void*, size_t)’:
  5413. /usr/include/string.h:652:42: error: ‘memcpy’ was not declared in this scope
  5414. return (char *) memcpy (__dest, __src, __n) + __n;
  5415. ^
  5416. mod.cu: In function ‘PyObject* CudaNdarray_Reshape(CudaNdarray*, PyObject*)’:
  5417. mod.cu:955:122: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 3 has type ‘size_t {aka long unsigned int}’ [-Wformat=]
  5418. PyErr_Format(PyExc_ValueError, "size must remain unchanged, changed from %lld to %lld", CudaNdarray_SIZE(self), rval_size);
  5419. ^
  5420. mod.cu:955:122: warning: format ‘%lld’ expects argument of type ‘long long int’, but argument 4 has type ‘size_t {aka long unsigned int}’ [-Wformat=]
  5421. ERROR (theano.sandbox.cuda): Failed to compile cuda_ndarray.cu: ('nvcc return status', 1, 'for cmd', 'nvcc -shared -O3 -m64 -Xcompiler -DCUDA_NDARRAY_CUH=c72d035fdf91890f3b36710688069b2e,-DNPY_NO_DEPRECATED_API=NPY_1_7_API_VERSION,-fPIC,-fvisibility=hidden -Xlinker -rpath,/home/moose/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.11+-64/cuda_ndarray -I/home/moose/.local/lib/python2.7/site-packages/theano/sandbox/cuda -I/usr/lib/python2.7/dist-packages/numpy/core/include -I/usr/include/python2.7 -I/home/moose/.local/lib/python2.7/site-packages/theano/gof -o /home/moose/.theano/compiledir_Linux-4.4--generic-x86_64-with-Ubuntu-16.04-xenial-x86_64-2.7.11+-64/cuda_ndarray/cuda_ndarray.so mod.cu -L/usr/lib -lcublas -lpython2.7 -lcudart')
  5422. WARNING (theano.sandbox.cuda): CUDA is installed, but device gpu is not available (error: cuda unavailable)
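(Added note, not part of the original build log.) The fatal error above ("'memcpy' was not declared in this scope", raised from /usr/include/string.h) is the well-known clash between older nvcc releases and the glibc 2.23 headers on Ubuntu 16.04, the platform visible in the compiledir path. A commonly reported workaround, offered here only as a sketch rather than a confirmed fix for this exact setup, is to pass -D_FORCE_INLINES to nvcc through Theano's configuration, e.g. in ~/.theanorc:

[nvcc]
flags = -D_FORCE_INLINES

or, for a single run, THEANO_FLAGS=nvcc.flags=-D_FORCE_INLINES. The final "device gpu is not available" warning is simply a consequence of the failed cuda_ndarray compilation.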