nearbyint.s [plain text]

//  float nearbyintf(float x)
//
//  returns the argument rounded to an integral value using the prevailing
//  rounding mode, and without raising the inexact flag (C99 7.12.9.4).
//
//  -- Stephen Canon, January 2010

#include <System/i386/cpu_capabilities.h>

.text
.align 4
.globl _nearbyint
_nearbyint:

// Fast path if SSE4.1 is available ------------------------------------------

#if defined __i386__
    movsd      4(%esp),         %xmm0
    mov         _COMM_PAGE_CPU_CAPABILITIES,    %eax
    test        $(kHasSSE4_1),  %eax
    jz          L_noSSE4_1
    roundsd     $0xc,  %xmm0,   %xmm0
    movsd       %xmm0,         4(%esp)
    fldl       4(%esp)
    ret
#elif defined __x86_64__
    movabs      _COMM_PAGE_CPU_CAPABILITIES,    %eax
    test        $(kHasSSE4_1),  %eax
    jz          L_noSSE4_1
    roundsd     $0xc,  %xmm0,   %xmm0
    ret
#else
#error "This implementations supports i386 and x86_64 only"
#endif

// Fallback path if SSE4.1 is not available ----------------------------------
//
// FPSCR rounding control bits 11:10
//
//  00  --  Round to nearest even
//  01  --  Round to -Inf
//  10  --  Round to +Inf
//  11  --  Round to zero
//  
// This path is adapted from Ian Ollmann's original implementation

L_noSSE4_1:
#if defined __i386__
    mov        8(%esp),         %ecx    // high word of input
    sub         $4,             %esp    // make space for the fp control word
    #define     fpcw            (%esp)
    #define     floatval       8(%esp)  // address of argument to nearbyintf
#else
    movd        %xmm0,          %rcx
    shr         $32,            %rcx    // high word of input
    #define     fpcw          -4(%rsp)  // space for fp control word in red zone
#endif
    fnstcw      fpcw
    
    mov         %ecx,           %edx
    pcmpeqb     %xmm1,          %xmm1
    and         $0x7fffffff,    %ecx    // |x|
    psllq       $63,            %xmm1   // -0.0
    xor         %ecx,           %edx    // signbit of x
    movapd      %xmm1,          %xmm2
    sub         $0x3ff00000,    %ecx    // high word of |x| - exponent bias
    andpd       %xmm0,          %xmm2   // signbit of x
    cmp         $0x03400000,    %ecx    // if |x| < 1.0 or |x| >= 0x1.0p52 or isnan(x)
    jae         L_edgeCases             //    branch
    
    shr         $20,            %ecx    // exponent of x, in [0, 51]
    sub         $52,            %ecx
    neg         %ecx                    // number of fractional bits in x
    pcmpeqb     %xmm1,          %xmm1
    movd        %ecx,           %xmm7
    psllq       %xmm7,          %xmm1   // mask covering integral bits of x
    
L_reentryForSmallX:
    andpd       %xmm0,          %xmm1   // trunc(x)
    ucomisd     %xmm1,          %xmm0   // if x == trunc(x)
    je          L_roundToZero           //    return trunc(x)
    
    mov         fpcw,           %ecx
    and         $0xc00,         %ecx    // current rounding mode encoding
    sub         $0x400,         %ecx
    cmp         $0x400,         %ecx
    jg          L_roundToZero           // ecx = 0x800
    je          L_roundUp               // ecx = 0x400
    jb          L_roundDown             // ecx = 0x000
    
L_roundToNearest:                       // fall into default rounding path
    subsd       %xmm1,          %xmm0   // fractional part of x
    pcmpeqb     %xmm3,          %xmm3
    psllq       $55,            %xmm3
    psrlq       $2,             %xmm3   // 0.5
    xorpd       %xmm2,          %xmm0   // |fract(x)|
    ucomisd     %xmm3,          %xmm0   // if |fractional part| == 0.5
    je          L_exactHalfway          //    branch to handle exact halfway case
    
    cmpltsd     %xmm3,          %xmm0   // |fract(x)| < 0.5 ? -1 : 0
    addsd       %xmm3,          %xmm3   // 1.0
    orpd        %xmm2,          %xmm3   // copysign(1.0,x)
    andnpd      %xmm3,          %xmm0   // |fract(x)| < 0.5 ? 0.0 : copysign(1.0,x)
    addsd       %xmm1,          %xmm0   // correctly rounded result
    orpd        %xmm2,          %xmm0   // restore sign of zero
#if defined( __i386__ )
    movsd       %xmm0,          floatval
    fldl        floatval
    add         $4,             %esp
#endif
    ret
    
L_exactHalfway:
    addsd       %xmm3,          %xmm3   // 1.0
    pcmpeqb     %xmm4,          %xmm4
    psubq       %xmm4,          %xmm7
    psllq       %xmm7,          %xmm4   // mask covering signbit and bits larger than 2^1
    orpd        %xmm2,          %xmm3   // copysign(1.0, x)
    pandn       %xmm1,          %xmm4   // trunc(x) odd ? nonzero : zero
    xorpd       %xmm5,          %xmm5
    pcmpeqd     %xmm5,          %xmm4   // trunc(x) odd ? at least one dword is 0 : -1
    pshufd      $0xe1,  %xmm4,  %xmm5
    pand        %xmm5,          %xmm4   // trunc(x) odd ? 0 : -1
    andnpd      %xmm3,          %xmm4   // trunc(x) odd ? copysign(1.0,x), 0
    addsd       %xmm4,          %xmm1
    orpd        %xmm2,          %xmm1   // reapply sign of zero
    
L_roundToZero:
#if defined( __i386__ )
    movsd       %xmm1,          floatval
    fldl        floatval
    add         $4,             %esp
#else
    movapd      %xmm1,          %xmm0
#endif
    ret
    
L_roundUp:                              // we already know that x is not integral
    xorpd       %xmm3,          %xmm3
    cmpltsd     %xmm0,          %xmm3   // x > 0 ? -1 : 0
    cvtdq2pd    %xmm3,          %xmm3   // x > 0 ? -1.0 : 0.0
    subsd       %xmm3,          %xmm1   // x > 0 ? trunc(x) + 1.0 : trunc(x)
#if defined( __i386__ )
    movsd       %xmm1,          floatval
    fldl        floatval
    add         $4,             %esp
#else
    movapd      %xmm1,          %xmm0
#endif
    ret

L_roundDown:                            // we already know that x is not integral
    psrlq       $63,            %xmm0   // x < 0 ? 1 : 0
    cvtdq2pd    %xmm0,          %xmm3   // x < 0 ? 1.0 : 0.0
    subsd       %xmm3,          %xmm1   // x < 0 ? trunc(x) - 1.0 : trunc(x)
    pcmpeqb     %xmm0,          %xmm0
    psrlq       $1,             %xmm0   // 0x7fffffffffffffff
    orpd        %xmm2,          %xmm0   // x < 0 ? 0xffffffffffffffff : 0x7fffffffffffffff
    andpd       %xmm1,          %xmm0
#if defined( __i386__ )
    movsd       %xmm0,          floatval
    fldl        floatval
    add         $4,             %esp
#endif
    ret

L_edgeCases:                            // if |x| < 1.0, jump back to mainline
    xorpd       %xmm7,          %xmm7   // initialize data in xmm7
    jl          L_reentryForSmallX      // otherwise, we just return x
#if defined( __i386__ )
    movsd       %xmm0,          floatval
    fldl        floatval
    add         $4,             %esp
#endif
    ret

/*
7:  //Round to even nearest, half way case
    addsd   %xmm6,                  %xmm6       // 1.0
    pcmpeqb %xmm3,                  %xmm3       // -1ULL
    psubq   %xmm3,                  %xmm7       // add one to the truncation mask shift constant
    psllq   %xmm7,                  %xmm3       // prepare a new truncation mask with left edge past unit bit. (Works for 1.0 too, since least significant bit of exponent has odd parity)
    orpd    %xmm2,                  %xmm6       // copysign( 1.0, x )
    pandn   %xmm1,                  %xmm3       // if trunc(x) is odd, this bit will be non-zero (in the +-0.5, case we have +-0.5 here instead of just a bit set )
    xorpd   %xmm4,                  %xmm4       // 0.0
    pcmpeqd %xmm4,                  %xmm3       // check if each 32-bit chunk is equal to 0. Unforunately there is no 64-bit integer compare. Wed hit denorms here if we use double precision compare.
    pshufd  $0xE1, %xmm3,           %xmm4       // swap both chunks
    pand    %xmm4,                  %xmm3       // trunc(x) is odd ? 0 : -1U    (make sure the other chunk is also equal to 0)
    andnpd  %xmm6,                  %xmm3       // trunc(x) is odd ? copysign( 1.0, x ) : 0.0f
    addsd   %xmm3,                  %xmm1       // round

#if defined( __i386__ )
    movsd   %xmm1,                  (STACKP)
    fldl    (STACKP)
#else
    movsd   %xmm1,                  %xmm0
#endif    
    ADDP    $(16-FRAME_SIZE),       STACKP
    ret
    
    */