OpenMP Reduce should do some manual loop unrolling
Currently, the VTK-m OpenMP reduction algorithm uses the following approach:
...
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = numThreads * 2; i < numVals; i++)
{
accum = f(accum, data[i]);
}
The issue with this approach is that it makes compilers treat accum
as a dependency carried from each iteration to the next, so the loop can't be vectorized effectively. If we look at the assembly generated for skylake-avx512, we see:
.L152:
vpaddd (%rcx), %ymm0, %ymm0
addq $32, %rcx
cmpq %rax, %rcx
jne .L152
If we do our own manual unrolling (any leftover elements when the count is not a multiple of 4 would need separate handling), such as:
...
VTKM_OPENMP_DIRECTIVE(for schedule(static))
for (vtkm::Id i = numThreads * 2; i < numVals; i+=4)
{
auto a = f(data[i], data[i+1]);
auto b = f(data[i+2], data[i+3]);
accum = f(accum, f(a,b));
}
we see better vectorization in the assembly generated for skylake-avx512:
.L152:
vmovdqu32 (%rax), %ymm1
vmovdqu32 32(%rax), %ymm9
vmovdqu32 64(%rax), %ymm4
vmovdqu32 (%rdi), %ymm0
vmovdqu32 64(%rdi), %ymm5
vmovdqu32 96(%rax), %ymm8
vmovdqa64 %ymm1, %ymm3
vpermt2d 96(%rdi), %ymm2, %ymm5
vpermt2d %ymm9, %ymm2, %ymm3
vmovdqa64 %ymm4, %ymm7
vpermt2d 32(%rdi), %ymm2, %ymm0
vpermt2d %ymm8, %ymm2, %ymm7
vpermt2d %ymm5, %ymm2, %ymm0
vpermt2d %ymm9, %ymm6, %ymm1
vmovdqa64 %ymm3, %ymm5
vpermt2d %ymm8, %ymm6, %ymm4
vpermt2d %ymm7, %ymm2, %ymm5
vpermt2d %ymm4, %ymm2, %ymm1
vpermt2d %ymm7, %ymm6, %ymm3
vpaddd %ymm5, %ymm0, %ymm0
vpaddd %ymm3, %ymm1, %ymm1
vpaddd %ymm1, %ymm0, %ymm0
subq $-128, %rax
vpaddd %ymm0, %ymm10, %ymm10
subq $-128, %rdi
cmpq %rdx, %rax
jne .L152