/* double round( double )
*
* Reimplemented by Steve Canon, based on Ian Ollmann's implementations
* tuned for increased performance on in-order machines (but faster on
* out-of-order machines as well).
*
* Copyright 2009, Apple Inc.
*/
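/* For reference, a rough C model of the strategy used by both code paths
 * below (an illustrative sketch only, not part of the build; the function
 * name is invented for exposition):
 *
 *     #include <math.h>
 *
 *     double round_model(double x) {
 *         if (!(fabs(x) < 0x1.0p52))   // 2^52 and up, Inf, NaN: already integral
 *             return x;
 *         double t = trunc(x);         // chop toward zero
 *         double f = x - t;            // fractional part, same sign as x
 *         if (fabs(f) >= 0.5)          // halfway cases round away from zero
 *             t += copysign(1.0, x);
 *         return t;
 *     }
 */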
#ifdef __i386__
#ifdef __SSE3__
#define TRUNCATE \
	fisttpll	4(%esp);	\
	fildll		4(%esp)
#else
#define TRUNCATE \
	fnstcw		4(%esp);	\
	movw		4(%esp), %dx;	\
	orw		$0xc00, 4(%esp);	\
	fldcw		4(%esp);	\
	frndint;	\
	movw		%dx, 4(%esp);	\
	fldcw		4(%esp)
#endif
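// In either variant, TRUNCATE replaces st(0) with its value rounded toward
// zero and leaves the rest of the x87 stack untouched.  4(%esp) -- the
// argument's stack slot, safe to clobber once x is on the x87 stack -- is
// used as scratch; the no-SSE3 variant also clobbers %dx.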
.text
.align 4
.globl _round
_round:
movl 8(%esp), %ecx // high 32 bits of x
cmpl $0x43300000, %ecx
fldl 4(%esp) // { x }
jae 2f // negative, |x| >= 2^52, Inf, or NaN: handle at 2:
fld %st(0) // { x, x }
TRUNCATE // { trunc(x), x }
fxch // { x, trunc(x) }
fsub %st(1), %st(0) // { frac(x), trunc(x) }
fadd %st(0), %st(0) // { 2*frac(x), trunc(x) }
fld1 // { 1.0, 2*frac(x), trunc(x) }
fucomi %st(1), %st(0)
fstp %st(1) // { 1.0, trunc(x) }
ja 1f // 2*frac(x) < 1.0: keep trunc(x)
fadd %st(0), %st(1) // { 1.0, trunc(x) + 1.0 }
1: fstp %st(0) // { round(x) }
ret
2: andl $0x7fffffff, %ecx // high word of |x|
cmpl $0x43300000, %ecx
jge 4f // |x| >= 2^52 (already integral), Inf, or NaN: return x as-is
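// Otherwise x is negative and finite with |x| < 2^52: round |x| exactly as
// in the positive case above, then flip the sign of the result with fchs.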
fabs // { |x| }
fld %st(0) // { |x|, |x| }
TRUNCATE // { trunc(|x|), |x| }
fxch // { |x|, trunc(|x|) }
fsub %st(1), %st(0) // { frac(|x|), trunc(|x|) }
fadd %st(0), %st(0) // { 2*frac(|x|), trunc(|x|) }
fld1 // { 1.0, 2*frac(|x|), trunc(|x|) }
fucomi %st(1), %st(0)
fstp %st(1) // { 1.0, trunc(|x|) }
ja 3f // 2*frac(|x|) < 1.0: keep trunc(|x|)
fadd %st(0), %st(1) // { 1.0, trunc(|x|) + 1.0 }
3: fstp %st(0) // { round(|x|) }
fchs // { -round(|x|) } = { round(x) }
4: ret
#else //x86_64
.const
.align 4
one: .quad 0x3ff0000000000000 // 1.0
absmask:.quad 0x7fffffffffffffff // all bits except the sign bit
half: .quad 0x3fe0000000000000 // 0.5
thresh: .quad 0x4330000000000000 // 2^52; |x| at or above this is already integral
.text
.align 4
.globl _round
_round:
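// Branch-free except for the early out: truncate with cvttsd2si/cvtsi2sd,
// compare |frac(x)| against 0.5 to build an all-ones/all-zeros mask, use it
// to select an increment of 0.0 or 1.0, give both the increment and the
// truncated value the sign of x, and add.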
movd %xmm0, %rcx // raw bits of x
andq absmask(%rip), %rcx // bits of |x|
movsd absmask(%rip), %xmm2
cmpq thresh(%rip), %rcx
jae 1f // |x| >= 2^52 (already integral) or NaN: return x unchanged
cvttsd2si %xmm0, %rax
andnpd %xmm0, %xmm2 // signbit(x)
movsd half(%rip), %xmm3
cvtsi2sd %rax, %xmm1 // trunc(x)
subsd %xmm1, %xmm0 // frac(x)
orpd %xmm2, %xmm1 // give trunc(x) the sign of x (turns +0.0 into -0.0 for small negative x)
xorpd %xmm2, %xmm0 // |frac(x)|
cmpltpd %xmm3, %xmm0 // (|frac(x)| < 0.5) ?
andnpd one(%rip), %xmm0 // (|frac(x)| < 0.5) ? 0.0 : 1.0
orpd %xmm2, %xmm0 // give the 0.0/1.0 increment the sign of x
addsd %xmm1, %xmm0 // round(x)
1: ret
#endif