Kevin Stratford
kevin@epcc.ed.ac.uk
Material by: Alan Gray, Kevin Stratford
The dim3 structure:
/* Layout of the built-in dim3 type: one unsigned integer
 * component per dimension (x, y, z). */
struct {
unsigned int x;
unsigned int y;
unsigned int z;
};
! CUDA Fortran equivalent of dim3: a derived type with
! one integer component per dimension.
type :: dim3
integer :: x
integer :: y
integer :: z
end type dim3
/* Consider the one-dimensional loop: */
/* Serial host version: each iteration is independent, so the
 * loop can be mapped one-thread-per-iteration onto the GPU. */
for (int i = 0; i < LOOP_LENGTH; i++) {
result[i] = 2*i;
}
/* Kernel equivalent of the loop above: each thread writes one
 * element, using its index within the (single) block as the loop
 * counter. NOTE(review): no bounds check -- assumes exactly
 * blockDim.x == LOOP_LENGTH threads are launched. */
__global__ void myKernel(int * result) {
    int idx = threadIdx.x;
    result[idx] = 2*idx;
}
/* Kernel is launched by the host by specifying
* Number of blocks (sometimes "blocksPerGrid")
* Number of threads per block */
/* Execution configuration: grid size and block size are dim3
 * values passed inside <<< >>>. */
dim3 blocks;
dim3 threadsPerBlock;
blocks.x = 1;                    /* a single block ... */
threadsPerBlock.x = LOOP_LENGTH; /* ... holding one thread per element */
myKernel <<< blocks, threadsPerBlock >>> (result);
! In Fortran an analogous kernel is...
! CUDA Fortran kernel for the same loop. Thread indices are
! 1-based in CUDA Fortran, so threadIdx%x maps directly onto
! the 1-based array subscript.
attributes(global) subroutine myKernel(result)
  integer, dimension(:) :: result
  integer :: idx
  idx = threadIdx%x
  result(idx) = 2*idx
end subroutine myKernel
! ... with execution ...
! Single block of LOOP_LENGTH threads, as in the C version.
blocks%x = 1
threadsPerBlock%x = LOOP_LENGTH
call myKernel <<< blocks, threadsPerBlock >>> (result)
/* One block only uses one SM; use of resources is very poor.
* Usually want large arrays using many blocks. */
/* Multi-block version: the global index combines the block's
 * offset (blockIdx.x*blockDim.x) with the thread's offset inside
 * the block. NOTE(review): still no bounds check -- assumes the
 * grid exactly covers the array (gridDim.x*blockDim.x elements). */
__global__ void myKernel(int * result) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    result[idx] = 2*idx;
}
/* ... with execution ... */
/* Fix: the dim3 variable is named "blocks" (as declared earlier and
 * used in the launch below); "block.x" was a typo and would not
 * compile. */
blocks.x = NBLOCKS;
threadsPerBlock.x = LOOP_LENGTH/NBLOCKS; /* assumes NBLOCKS divides LOOP_LENGTH exactly */
myKernel <<< blocks, threadsPerBlock >>> (result);
! Multi-block CUDA Fortran kernel. blockIdx%x and threadIdx%x are
! 1-based, hence the (blockIdx%x - 1) term when forming the global
! index. No bounds check: assumes the grid exactly covers the array.
attributes(global) subroutine myKernel(result)
integer, dimension(:) :: result
integer :: i
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
result(i) = 2*i
end subroutine myKernel
! ... with execution ...
! NBLOCKS blocks of LOOP_LENGTH/NBLOCKS threads each
! (assumes NBLOCKS divides LOOP_LENGTH exactly).
blocks%x = NBLOCKS
threadsPerBlock%x = LOOP_LENGTH/NBLOCKS
call myKernel <<< blocks, threadsPerBlock >>> (result)
/* Pre-defined variables available inside device code
 * (set by the runtime from the launch configuration): */
dim3 gridDim; /* Number of blocks */
dim3 blockDim; /* Number of threads per block */
dim3 blockIdx; /* 0 <= blockIdx.x < gridDim.x etc */
dim3 threadIdx; /* 0 <= threadIdx.x < blockDim.x etc */
! The CUDA Fortran equivalents; note the indices are 1-based,
! unlike the 0-based C versions above.
type (dim3) :: gridDim ! Number of blocks
type (dim3) :: blockDim ! Number of threads per block
type (dim3) :: blockIdx ! 1 <= blockIdx%x <= gridDim%x etc
type (dim3) :: threadIdx ! 1 <= threadIdx%x <= blockDim%x etc
/* Element-wise 2-D matrix addition: c = a + b.
 * The x dimension maps to the column and y to the row, so adjacent
 * threads of a warp access adjacent elements of a row -- contiguous
 * addresses in C's row-major storage. Assumes the grid exactly
 * covers the NxN matrices. */
__global__ void matrix2d(float a[N][N], float b[N][N],
                         float c[N][N]) {
    int col = blockIdx.x*blockDim.x + threadIdx.x;
    int row = blockIdx.y*blockDim.y + threadIdx.y;
    c[row][col] = a[row][col] + b[row][col];
}
/* ... with execution, e.g., ... */
dim3 blocksPerGrid(N/16, N/16, 1); /* assumes 16 divides N exactly */
dim3 threadsPerBlock(16, 16, 1);   /* 16x16 = 256 threads per block */
matrix2d <<< blocksPerGrid, threadsPerBlock >>> (a, b, c);
/* Kernel launches return control to the host immediately: */
myKernel <<<blocksPerGrid, threadsPerBlock>>> (...)
/* ... could perform independent work here ... */
err = cudaDeviceSynchronize(); /* blocks until outstanding device work completes */
/* ... now safe to obtain results of kernel ... */
For asynchronous copies, see cudaMemcpyAsync(), ...
/* For example, provide an allocation of "nSize" floats
* in the device memory: */
float * data;                                 /* will hold a device address */
err = cudaMalloc(&data, nSize*sizeof(float)); /* size is in bytes; check err */
...
err = cudaFree(data);                         /* release the device allocation */
cudaMemcpy()
/* Copy host data values to device memory ...
 * (count is in bytes; destination pointer comes first,
 * as for standard memcpy) */
err = cudaMemcpy(dataDevice, dataHost, nSize*sizeof(float),
cudaMemcpyHostToDevice);
/* And back again ... */
err = cudaMemcpy(dataHost, dataDevice, nSize*sizeof(float),
cudaMemcpyDeviceToHost);
/* The general form: */
cudaError_t cudaMemcpy(void * dest, const void * src,
size_t count,
cudaMemcpyKind kind);
The "device" attribute:
! Make an allocation in device memory:
real, device, allocatable :: dataDevice(:) ! "device" places the array in GPU memory
allocate(dataDevice(nSize), stat = ...)    ! check stat for allocation failure
...
deallocate(dataDevice)
This replaces the explicit cudaMalloc() and cudaFree() calls.
! Copy from host to device: simple assignment between a host array
! and a "device" array generates the transfer automatically.
dataDevice(:) = dataHost(:)
! ... and back again ...
dataHost(:) = dataDevice(:)
! Or via the API. NOTE(review): the count here is nSize with no
! sizeof -- the CUDA Fortran cudaMemcpy counts elements, not bytes.
err = cudaMemcpy(dataDevice, dataHost, nSize,
cudaMemcpyHostToDevice)
CUDA C source files use the .cu extension by convention and are
compiled with nvcc:
$ nvcc -o example example.cu
CUDA Fortran source files use the .cuf extension by convention and are
compiled with pgf90, adding the -Mcuda option (required if the file
extension is not .cuf):
$ pgf90 -Mcuda -o example example.cuf