Material by: Mark Bull, Alan Gray, Alistair Hart (Cray), Kevin Stratford
#pragma acc parallel [clause ...]
{
/* ...structured block .. */
}
!$acc parallel [clause ...]
! ... structured block ...
!$acc end parallel
#pragma acc parallel loop
for (n = 0; n < NSIZE; n++) {
/* ...structured block .. */
}
!$acc parallel loop
do n = 1, NSIZE
! ...loop body...
end do
!$acc end parallel loop
#pragma acc parallel
{
#pragma acc loop
for (i = 0; i < NI; i++) {
/* ... loop one ... */
}
...
#pragma acc loop
for (i = 0; i < NI; i++) {
/* ... loop two ... */
}
}
!$acc parallel
...
!$acc loop
do i = 1, NI
! ... loop one ..
end do
!$acc end loop
...
!$acc loop
do i = 1, NI
! ... loop two ...
end do
!$acc end loop
!$acc end parallel
parallel
may do little in itself: loop
is needed to distribute work
#pragma acc loop gang vector
loop
or parallel loop
num_gangs(nblocks)
clause
vector_length(nthreads)
clause
nthreads
one of 1, 32, 64, 128 (the default), 256, 512, 1024
loop
or parallel loop
seq
: loop should be executed sequentially
if (condition)
: conditional execution on accelerator
reduction
: sum, minimum, maximum operations
collapse()
: merge loop nests
/* Consider... */
total = 0.0;
for (i = 0; i < NI; i++) {
total += data[i];
}
/* In OpenACC: */
total = 0.0;
#pragma acc loop reduction(+: total)
for (i = 0; i < NI; i++) {
total += data[i];
}
!$acc parallel loop
do i = 1, NI
output(i) = input(i)
end do
!$acc end parallel loop
input()
copied to device
output()
copied back to host
/* Declare shape of arrays [] */
#pragma acc parallel loop copyin(in[0:NI]) copyout(out[0:NI])
for (i = 0; i < NI; i++) {
out[i] = in[i];
}
/* copyin() - copy to GPU at start */
/* copyout() - copy to host at end */
data
directive
#pragma acc data [clause ...]
{
/* ... structured block ... */
}
!$acc data [clause ...]
! ... structured block ...
!$acc end data
!$acc data copyin(input(1:NI)) copyout(output(1:NI))
!$acc parallel loop
do i = 1, NI
output(i) = input(i)
end do
!$acc end parallel loop
write (*,*) "Copied input to output"
!$acc parallel loop
do i = 1, NI
output(i) = function_of(output(i))
end do
!$acc end parallel loop
!$acc end data
copyin
copyout
copy (copyin and copyout)
create
! Example: keep a() on the device across a procedure call by wrapping both
! the initialisation loop and the call to my_double in a single data region.
program example
  implicit none
  integer :: a(10000)

  ! my_double has an assumed-shape dummy argument, so the caller must see
  ! an explicit interface for the call to be standard-conforming.
  interface
    subroutine my_double(b)
      integer, intent(inout) :: b(:)
    end subroutine my_double
  end interface

  ! Data region: copy = copyin at the start plus copyout at the end.
  !$acc data copy(a)

  !$acc parallel loop
  ! ... initialise a()
  !$acc end parallel loop

  ! The accelerated loop inside my_double operates on the same array.
  call my_double(a)

  !$acc end data
end program example
! Double every element of an integer array in place using an accelerated loop.
!
!   b  (inout)  assumed-shape integer array; each element is replaced by
!               twice its value. Callers need an explicit interface.
subroutine my_double(b)
  implicit none
  integer, intent(inout) :: b(:)
  integer :: k, n

  ! Extent taken once, outside the accelerated loop.
  n = size(b)

  ! copy(b): data clause making b available on the device for this loop.
  !$acc parallel loop copy(b)
  do k = 1, n
    b(k) = b(k) * 2
  end do
  !$acc end parallel loop
end subroutine my_double
/* Consider */
/* Allocate the host buffer a->data (a->n doubles) and create device copies
 * of both the structure and the buffer with unstructured "enter data"
 * directives, so they stay on the device after this function returns. */
void my_init(my_t * a) {
  a->data = (double *) malloc(a->n*sizeof(double));

  /* The structure itself goes first ... */
#pragma acc enter data copyin(a)

  /* ... then the buffer it points to. An array section is written
   * [start:length]; a->data[a->n] would name the single element one past
   * the end of the allocation rather than the whole buffer. */
#pragma acc enter data copyin(a->data[0:a->n])
}
...
/* Release the device copies created for a, then free the host buffer.
 * The statement order matters: the member buffer is removed from the
 * device before the structure that holds its pointer, and the host
 * memory is freed only after both device copies are gone. */
void my_finalise(my_t * a) {
/* Remove the device copy of the data buffer first ... */
#pragma acc exit data delete(a->data)
/* ... then the device copy of the structure itself. */
#pragma acc exit data delete(a)
/* Host-side buffer allocated with malloc in my_init. */
free(a->data);
}