Material by: Alan Gray, Kevin Stratford
/* Bad: data copied to and from the device at every time step */
for (it = 0; it < nTimeSteps; it++) {
  myCheapHostOperation(hostData);
  cudaMemcpy(..., cudaMemcpyHostToDevice);
  myExpensiveKernel <<<...>>> (deviceData, ...);
  cudaMemcpy(..., cudaMemcpyDeviceToHost);
}
/* Good: copy once, keep the data resident on the device, and
 * replace the cheap host operation with a cheap kernel */
cudaMemcpy(..., cudaMemcpyHostToDevice);
for (it = 0; it < nTimeSteps; it++) {
  myCheapKernel <<< ... >>> (deviceData, ...);
  myExpensiveKernel <<< ... >>> (deviceData, ...);
}
cudaMemcpy(..., cudaMemcpyDeviceToHost);
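A fuller sketch of this second pattern, with the elided arguments filled in. Everything specific here (the size N, the 256-thread blocks, the function bodies) is illustrative, not part of the course code:

#include <cuda_runtime.h>

#define N (1024 * 1024)   /* illustrative problem size */

__global__ void myCheapKernel(double *data)     { /* ... cheap work ... */ }
__global__ void myExpensiveKernel(double *data) { /* ... expensive work ... */ }

void runTimeSteps(double *hostData, int nTimeSteps) {
  double *deviceData;
  size_t bytes = N * sizeof(double);

  cudaMalloc((void **)&deviceData, bytes);

  /* One transfer in ... */
  cudaMemcpy(deviceData, hostData, bytes, cudaMemcpyHostToDevice);

  /* ... all time steps stay on the device ... */
  for (int it = 0; it < nTimeSteps; it++) {
    myCheapKernel<<<N / 256, 256>>>(deviceData);
    myExpensiveKernel<<<N / 256, 256>>>(deviceData);
  }

  /* ... one transfer out */
  cudaMemcpy(hostData, deviceData, bytes, cudaMemcpyDeviceToHost);
  cudaFree(deviceData);
}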
/* Nested loops over a 512 x 512 grid */
for (i = 0; i < 512; i++) {
  for (j = 0; j < 512; j++) {
    /* ... work ... */
  }
}
/* C: recall right-most index runs fastest */
for (i = 0; i < NI; i++) {
  for (j = 0; j < NJ; j++) {
    output[i][j] = input[i][j];
  }
}
! Fortran: recall left-most index runs fastest
do j = 1, NJ
  do i = 1, NI
    output(i,j) = input(i,j)
  end do
end do
Memory coalescing: global memory accesses are most efficient when consecutive threads access consecutive memory locations
threadsPerBlock = (nThreads, 1, 1)
/* In C: */
idx = blockIdx.x*blockDim.x + threadIdx.x;
output[idx] = input[idx];
! In Fortran:
idx = (blockIdx%x - 1)*blockDim%x + threadIdx%x
output(idx) = input(idx)
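A complete version of the C fragment above, as a kernel plus its host-side launch. The names copy1d, dIn, dOut and the 256-thread blocks are assumptions of this sketch:

__global__ void copy1d(const float *input, float *output, int n) {
  int idx = blockIdx.x*blockDim.x + threadIdx.x;
  if (idx < n) {                /* guard: n need not be a multiple of blockDim.x */
    output[idx] = input[idx];   /* consecutive threads, consecutive addresses */
  }
}

/* Host side: enough one-dimensional blocks to cover all n elements */
void launchCopy1d(const float *dIn, float *dOut, int n) {
  int nThreads = 256;
  int nBlocks = (n + nThreads - 1)/nThreads;
  copy1d<<<nBlocks, nThreads>>>(dIn, dOut, n);
}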
/* Bad: consecutive threads have strided access */
i = blockIdx.x*blockDim.x + threadIdx.x;
for (j = 0; j < NJ; j++) {
  output[i][j] = input[i][j];
}
/* Good: consecutive threads have contiguous access */
j = blockIdx.x*blockDim.x + threadIdx.x;
for (i = 0; i < NI; i++) {
  output[i][j] = input[i][j];
}
! Bad: consecutive threads have strided access
j = (blockIdx%x - 1)*blockDim%x + threadIdx%x
do i = 1, NI
  output(i, j) = input(i, j)
end do
! Good: consecutive threads have contiguous access
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
do j = 1, NJ
  output(i, j) = input(i, j)
end do
blocksPerGrid = (nBlocksX, nBlocksY, 1)
threadsPerBlock = (nThreadsX, nThreadsY, 1)
/* C: note apparent transposition of i, j here... */
int j = blockIdx.x*blockDim.x + threadIdx.x;
int i = blockIdx.y*blockDim.y + threadIdx.y;
output[i][j] = input[i][j];
! Fortran: looks more natural
i = (blockIdx%x - 1)*blockDim%x + threadIdx%x
j = (blockIdx%y - 1)*blockDim%y + threadIdx%y
output(i, j) = input(i, j)
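In CUDA C the 2D decomposition above is expressed with dim3. A minimal sketch; NI, NJ and the 16x16 block shape are assumptions, not values from the course material:

#define NI 512
#define NJ 512

__global__ void copy2d(const float (*input)[NJ], float (*output)[NJ]) {
  int j = blockIdx.x*blockDim.x + threadIdx.x;  /* x maps to the fastest (right-most) C index */
  int i = blockIdx.y*blockDim.y + threadIdx.y;
  if (i < NI && j < NJ) {
    output[i][j] = input[i][j];
  }
}

/* Host side: 2D grid of 2D blocks covering the NI x NJ array */
void launchCopy2d(const float (*dIn)[NJ], float (*dOut)[NJ]) {
  dim3 threadsPerBlock(16, 16, 1);
  dim3 blocksPerGrid((NJ + 15)/16, (NI + 15)/16, 1);
  copy2d<<<blocksPerGrid, threadsPerBlock>>>(dIn, dOut);
}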
/* Bad: threads in same warp diverge... */
tid = blockIdx.x*blockDim.x + threadIdx.x;
if (tid % 2 == 0) {
  /* Threads 0, 2, 4, ... do one thing ... */
}
else {
  /* Threads 1, 3, 5, ... do something else */
}
/* Good: threads in same warp follow same path ...
 * Note use of the built-in variable "warpSize" */
tid = blockIdx.x*blockDim.x + threadIdx.x;
if ( (tid / warpSize) % 2 == 0) {
  /* Threads 0, 1, 2, 3, 4, ... do one thing ... */
}
else {
  /* Threads 32, 33, 34, 35, ... do something else */
}
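For context, the same warp-aligned branch inside a complete kernel; doEven and doOdd are placeholder device functions, not part of the original:

__device__ void doEven(float *data, int tid) { /* ... one path ... */ }
__device__ void doOdd(float *data, int tid)  { /* ... other path ... */ }

__global__ void warpAlignedBranch(float *data) {
  int tid = blockIdx.x*blockDim.x + threadIdx.x;

  /* Whole warps take the same branch: warps 0, 2, 4, ... take the first
   * path and warps 1, 3, 5, ... the second, so no single warp diverges.
   * warpSize is a built-in device variable (32 on NVIDIA GPUs to date). */
  if ((tid / warpSize) % 2 == 0) {
    doEven(data, tid);
  }
  else {
    doOdd(data, tid);
  }
}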
Compiling with "-Xptxas -v" makes the compiler report each kernel's register and shared-memory usage:
$ nvcc -Xptxas -v ...
nvprof, NVIDIA's command-line profiler, reports the time spent in each kernel and memory copy:
$ nvprof ./cudaExecutable