NightStrike
2014-10-20 23:18:26 UTC
I have been studying the asm generated for a typical clamping function,
and I am confused about the results. The code was compiled on an Opteron
6k-series machine with -fverbose-asm, -O3 and -march=native.
/*
 * clamp: limit x to the range bounded by min and max.
 *
 * Exactly one of four implementations is compiled, selected by the
 * preprocessor: BRANCH, BRANCH2, CALL, or (none of them defined) the
 * #else form. BRANCH and BRANCH2 test the upper bound on x first; CALL
 * and the #else form apply the lower bound first and then apply the
 * upper bound to that intermediate result.
 */
float clamp(float const x, float const min, float const max) {
#if defined (BRANCH)
/* if/else chain: upper bound checked on x first, then lower bound. */
if ( x > max )
return max;
else if ( x < min )
return min;
else
return x;
#elif defined (BRANCH2)
/* Same two tests in the same order as BRANCH, as a nested conditional. */
return x > max ? max : ( x < min ? min : x );
#elif defined (CALL)
/* Inner call applies the lower bound, outer call applies the upper bound. */
return __builtin_fminf(__builtin_fmaxf(x, min), max);
#else
/* Step 1: raise x to min if it is below it. */
float const t = x < min ? min : x;
/*
 * Step 2: cap the intermediate value t at max.
 * NOTE(review): this test sees t, whereas BRANCH/BRANCH2 test x itself.
 * The two orders return different values when min > max, e.g. with
 * x=2, min=5, max=3 the BRANCH form yields 5 and this form yields 3.
 */
return t> max ? max : t;
#endif
}
-DBRANCH / -DBRANCH2:
The first two approaches are obviously identical, and produce:
# Listing for -DBRANCH / -DBRANCH2.
# Per the -fverbose-asm annotations: %xmm0 = x, %xmm1 = min, %xmm2 = max.
# Both exit paths leave the result in %xmm0.
clamp:
.LFB0:
.cfi_startproc
# The source's upper-bound test (x > max) is kept as a real compare + branch.
vucomiss %xmm2, %xmm0 # max, x
ja .L3 #,
# Branch not taken: the lower-bound test has become a single vmaxss of x and min.
vmaxss %xmm0, %xmm1, %xmm0 # x, min, D.2214
ret
.p2align 4,,7
.p2align 3
# Branch taken (x above max): return max.
.L3:
vmovaps %xmm2, %xmm0 # max, D.2214
ret
.cfi_endproc
-DCALL:
This one I figured would be great, given the use of builtins:
# Listing for -DCALL: neither builtin was expanded inline here; fmaxf is
# reached with a call and fminf with a tail jump.
clamp:
.LFB0:
.cfi_startproc
subq $24, %rsp #,
.cfi_def_cfa_offset 32
# max (%xmm2) is spilled to the stack so it survives the call to fmaxf.
vmovss %xmm2, 12(%rsp) # max, %sfp
# No instruction touches %xmm0/%xmm1 before the call, so fmaxf receives
# clamp's first two incoming arguments unchanged.
call fmaxf #
# Reload max and release the stack frame before leaving.
vmovss 12(%rsp), %xmm2 # %sfp, max
addq $24, %rsp #,
.cfi_def_cfa_offset 8
# max becomes fminf's second argument; the first is presumably fmaxf's
# result still in %xmm0 (SysV AMD64 convention assumed - confirm for target).
vmovaps %xmm2, %xmm1 # max,
# Tail jump with the stack already restored: fminf returns to clamp's caller.
jmp fminf #
.cfi_endproc
But then, with none of the macros defined (the #else branch), we have what
appears to be the best of them all: just a couple of instructions, no
branches, no calls, nothing:
# Listing for the remaining variant - presumably the #else form, built with
# none of BRANCH, BRANCH2 or CALL defined.
# NOTE(review): the `clamp:` label line that opens the other listings is
# absent from this paste.
.LFB0:
.cfi_startproc
# Lower bound: vmaxss of x and min, result in %xmm0 (annotated D.2219).
vmaxss %xmm0, %xmm1, %xmm0 # x, min, D.2219
# Upper bound: vminss of that intermediate value and max, again into %xmm0.
vminss %xmm0, %xmm2, %xmm0 # D.2219, max, D.2219
ret
.cfi_endproc
So I'm curious: why is the last approach optimized better than the
naive approach of nested if statements?
I have been studying the asm generated for a typical clamping function,
and I am confused about the results. The code was compiled on an Opteron
6k-series machine with -fverbose-asm, -O3 and -march=native.
/*
 * clamp: limit x to the range bounded by min and max.
 *
 * Exactly one of four implementations is compiled, selected by the
 * preprocessor: BRANCH, BRANCH2, CALL, or (none of them defined) the
 * #else form. BRANCH and BRANCH2 test the upper bound on x first; CALL
 * and the #else form apply the lower bound first and then apply the
 * upper bound to that intermediate result.
 */
float clamp(float const x, float const min, float const max) {
#if defined (BRANCH)
/* if/else chain: upper bound checked on x first, then lower bound. */
if ( x > max )
return max;
else if ( x < min )
return min;
else
return x;
#elif defined (BRANCH2)
/* Same two tests in the same order as BRANCH, as a nested conditional. */
return x > max ? max : ( x < min ? min : x );
#elif defined (CALL)
/* Inner call applies the lower bound, outer call applies the upper bound. */
return __builtin_fminf(__builtin_fmaxf(x, min), max);
#else
/* Step 1: raise x to min if it is below it. */
float const t = x < min ? min : x;
/*
 * Step 2: cap the intermediate value t at max.
 * NOTE(review): this test sees t, whereas BRANCH/BRANCH2 test x itself.
 * The two orders return different values when min > max, e.g. with
 * x=2, min=5, max=3 the BRANCH form yields 5 and this form yields 3.
 */
return t> max ? max : t;
#endif
}
-DBRANCH / -DBRANCH2:
The first two approaches are obviously identical, and produce:
# Listing for -DBRANCH / -DBRANCH2.
# Per the -fverbose-asm annotations: %xmm0 = x, %xmm1 = min, %xmm2 = max.
# Both exit paths leave the result in %xmm0.
clamp:
.LFB0:
.cfi_startproc
# The source's upper-bound test (x > max) is kept as a real compare + branch.
vucomiss %xmm2, %xmm0 # max, x
ja .L3 #,
# Branch not taken: the lower-bound test has become a single vmaxss of x and min.
vmaxss %xmm0, %xmm1, %xmm0 # x, min, D.2214
ret
.p2align 4,,7
.p2align 3
# Branch taken (x above max): return max.
.L3:
vmovaps %xmm2, %xmm0 # max, D.2214
ret
.cfi_endproc
-DCALL:
This one I figured would be great, given the use of builtins:
# Listing for -DCALL: neither builtin was expanded inline here; fmaxf is
# reached with a call and fminf with a tail jump.
clamp:
.LFB0:
.cfi_startproc
subq $24, %rsp #,
.cfi_def_cfa_offset 32
# max (%xmm2) is spilled to the stack so it survives the call to fmaxf.
vmovss %xmm2, 12(%rsp) # max, %sfp
# No instruction touches %xmm0/%xmm1 before the call, so fmaxf receives
# clamp's first two incoming arguments unchanged.
call fmaxf #
# Reload max and release the stack frame before leaving.
vmovss 12(%rsp), %xmm2 # %sfp, max
addq $24, %rsp #,
.cfi_def_cfa_offset 8
# max becomes fminf's second argument; the first is presumably fmaxf's
# result still in %xmm0 (SysV AMD64 convention assumed - confirm for target).
vmovaps %xmm2, %xmm1 # max,
# Tail jump with the stack already restored: fminf returns to clamp's caller.
jmp fminf #
.cfi_endproc
But then, with none of the macros defined (the #else branch), we have what
appears to be the best of them all: just a couple of instructions, no
branches, no calls, nothing:
# Listing for the remaining variant - presumably the #else form, built with
# none of BRANCH, BRANCH2 or CALL defined.
# NOTE(review): the `clamp:` label line that opens the other listings is
# absent from this paste.
.LFB0:
.cfi_startproc
# Lower bound: vmaxss of x and min, result in %xmm0 (annotated D.2219).
vmaxss %xmm0, %xmm1, %xmm0 # x, min, D.2219
# Upper bound: vminss of that intermediate value and max, again into %xmm0.
vminss %xmm0, %xmm2, %xmm0 # D.2219, max, D.2219
ret
.cfi_endproc
So I'm curious: why is the last approach optimized better than the
naive approach of nested if statements?