Material by: Kevin Stratford
/* Schematically */
/* Allocate and initialise host memory ... */
h_ptr = malloc(nbytes);
...
/* Allocate device memory and copy */
cudaMalloc(&d_ptr, nbytes);
cudaMemcpy(d_ptr, h_ptr, nbytes, cudaMemcpyHostToDevice);
...
/* Use device memory in kernel */
kernel<<<...>>> (d_ptr, ...)
/* Can we do both? */
cudaMallocManaged(&ptr, nbytes);
/* Initialise data on host */
for (i = 0; i < NDATA; i++) {
ptr[i] = value;
}
/* Use data in kernel */
kernel<<<...>>> (ptr, ...)
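/* ... and get results back: the kernel launch is asynchronous, so call
 * cudaDeviceSynchronize() before reading ptr on the host */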
__host__ cudaError_t cudaMallocManaged(void ** dptr, size_t sz);
cudaFree()
cudaMemcpy()
/* Prefetch to destination device */
cudaGetDevice(&device);
cudaMallocManaged(&ptr, nbytes);
...
cudaMemPrefetchAsync(ptr, nbytes, device, NULL);
...
/* Hints */
cudaMemAdvise(ptr, nbytes, advice, device);
/* advice: cudaMemAdviseSetReadMostly */
/* cudaMemAdviseSetPreferredLocation */
/* cudaMemAdviseSetAccessedBy */
/* Constant data declared at file scope with
* __constant__ memory space qualifier */
static __constant__ double coeffs[3];
/* Schematic host-side routine: prepares three coefficients on the host and
 * copies them into the __constant__ array coeffs before a kernel uses them. */
int someHostFunction(...) {
/* ... assign some values at run time ... */
double values[3];
/* ... and before the relevant kernel ... */
/* Copy 3 doubles from the host array values into the constant symbol coeffs.
 * NOTE(review): values must have been filled in by the elided code above. */
cudaMemcpyToSymbol(coeffs, values, 3*sizeof(double));
...
}
/* Still in the appropriate scope ... */
static __constant__ double coeffs[3];
/* Schematic kernel: reads the three constant coefficients and combines them
 * with x, y and z. The kernel only reads coeffs; it never writes to it. */
__global__ void someKernel(...) {
...
/* Reference constant data as usual ... */
/* NOTE(review): result, x, y and z are not declared in this fragment;
 * presumably they come from the elided code or arguments. */
result = coeffs[0]*x + coeffs[1]*y + coeffs[2]*z;
}
! Constant variable declared at e.g., module scope
! with constant attribute
real, constant :: coeffs(3)
contains
! Schematic host routine: sets the constant array coeffs before the
! kernel that reads it is called.
subroutine someHostRoutine(...)
! ...assign some values at runtime ...
! Assign elements 1..3 of values to the constant array coeffs.
! NOTE(review): values is not declared in this fragment - presumably a
! host array set up in the elided code; confirm.
coeffs(:) = values(1:3)
! ...and call relevant kernel ...
end subroutine someHostRoutine
! Still in the appropriate scope ...
real, constant :: coeffs(3)
contains
! Schematic kernel: reads the three constant coefficients and combines
! them with x, y and z. coeffs is only read here, never assigned.
attributes(global) subroutine someKernel(...)
! Reference constant data as usual ...
! NOTE(review): result, x, y and z are not declared in this fragment;
! presumably they come from the elided arguments.
result = coeffs(1)*x + coeffs(2)*y + coeffs(3)*z
end subroutine someKernel
__syncthreads()
__shared__
memory space qualifier
shared
attribute
/* Reverse the first THREADS_PER_BLOCK elements of myArray in place, so that
 * the order 0,1,2,3,... becomes ...,3,2,1,0.
 *
 * Launch requirements:
 *   - exactly one block (every block would operate on the same elements);
 *   - at least THREADS_PER_BLOCK threads in the x-dimension; any surplus
 *     threads take no part in the load or store;
 *   - myArray points to at least THREADS_PER_BLOCK ints of device memory.
 *
 * Uses THREADS_PER_BLOCK * sizeof(int) bytes of static shared memory. */
__global__ void reverseElements(int * myArray) {

  __shared__ int tmp[THREADS_PER_BLOCK];

  const int idx = threadIdx.x;

  /* Guard against a launch with more threads than staging slots: without
   * it, the extra threads would index past the end of tmp and myArray. */
  const bool inRange = (idx < THREADS_PER_BLOCK);

  /* Stage this thread's element in shared memory */
  if (inRange) {
    tmp[idx] = myArray[idx];
  }

  /* Every thread must finish reading its element before any thread
   * overwrites it below. The barrier is deliberately outside the
   * conditional so that all threads in the block reach it. */
  __syncthreads();

  /* Write the staged element to the mirrored position */
  if (inRange) {
    myArray[THREADS_PER_BLOCK - (idx + 1)] = tmp[idx];
  }
}