nearbyintf.s   [plain text]


//  float nearbyintf(float x)
//
//  returns the argument rounded to an integral value using the prevailing
//  rounding mode, and without raising the inexact flag (C99 7.12.9.4).
//
//  -- Stephen Canon, January 2010

#include <System/i386/cpu_capabilities.h>

.text
.align 4
.globl _nearbyintf
_nearbyintf:

// Fast path if SSE4.1 is available ------------------------------------------

#if defined __i386__
    movss      4(%esp),         %xmm0
    mov         _COMM_PAGE_CPU_CAPABILITIES,    %eax
    test        $(kHasSSE4_1),  %eax
    jz          L_noSSE4_1
    roundss     $0xc,  %xmm0,   %xmm0
    movss       %xmm0,         4(%esp)
    flds       4(%esp)
    ret
#elif defined __x86_64__
    movabs      _COMM_PAGE_CPU_CAPABILITIES,    %eax
    test        $(kHasSSE4_1),  %eax
    jz          L_noSSE4_1
    roundss     $0xc,  %xmm0,   %xmm0
    ret
#else
#error "This implementations supports i386 and x86_64 only"
#endif

// Fallback path if SSE4.1 is not available ----------------------------------
//
// FPSCR rounding control bits 11:10
//
//  00  --  Round to nearest even
//  01  --  Round to -Inf
//  10  --  Round to +Inf
//  11  --  Round to zero
//  
// This path is adapted from Ian Ollmann's original implementation

L_noSSE4_1:
    movd        %xmm0,          %ecx
#if defined __i386__
    sub         $4,             %esp    // make space for the fp control word
    #define     fpcw            (%esp)
    #define     floatval       8(%esp)  // address of argument to nearbyintf
#else
    #define     fpcw          -4(%rsp)  // space for fp control word in red zone
#endif
    fnstcw      fpcw
      
// Detect special cases and branch
    mov         %ecx,           %edx
    and         $0x7fffffff,    %ecx    // |x|
    xor         %ecx,           %edx    // signbit of x
    movd        %edx,           %xmm2
    sub         $0x3f800000,    %ecx    // |x| - exponent bias
    mov         $0x80000000,    %eax
    cmp         $0x0b800000,    %ecx    // if |x| < 1.0 or |x| >= 0x1.0p23f or isnan(x)
    jae         L_edgeCases             //    branch
    
    shr         $23,            %ecx    // exponent of x, in [0,22]
    add         $8,             %ecx
    sar         %cl,            %eax    // mask covering integral bits of x
    
L_reentryForSmallX:
    movd        %eax,           %xmm1
    andps       %xmm0,          %xmm1   // trunc(x)
    ucomiss     %xmm1,          %xmm0   // if x == trunc(x)
    je          L_roundToZero           //    return trunc(x)

    mov         fpcw,           %ecx
    and         $0xc00,         %ecx    // current rounding mode encoding
    sub         $0x400,         %ecx
    cmp         $0x400,         %ecx
    jg          L_roundToZero           // ecx = 0x800
    je          L_roundUp               // ecx = 0x400
    jb          L_roundDown             // ecx = 0x000

L_roundToNearest:                       // fall into default rounding path
    subss       %xmm1,          %xmm0   // fractional part of x
    movd        %xmm0,          %ecx
    or          $0x3f000000,    %edx    // copysignf(0.5f,x)
    xor         %edx,           %ecx    // frac(x) ^ copysignf(0.5f,x)
    jz          L_exactHalfway          // handle exact halfway cases separately
    
    sub         $0x00800000,    %ecx    // |frac(x)| > 0.5 ? negative : positive
    or          $0x00800000,    %edx
    sar         $31,            %ecx    // |frac(x)| > 0.5 ? -1 : 0
    and         %ecx,           %edx    // |frac(x)| > 0.5 ? copysignf(1.0f,x) : 0
    movd        %edx,           %xmm0
    addss       %xmm1,          %xmm0   // |frac(x)| > 0.5 ? trunc(x) + copysign(1.0f,x) : trunc(x)
    orps        %xmm2,          %xmm0   // apply signbit to get the right zero
#if defined( __i386__ )
    movss       %xmm0,          floatval
    flds        floatval
    add         $4,             %esp
#endif
    ret

L_exactHalfway:
    shl         $1,             %eax
    addss       %xmm0,          %xmm0   // copysignf(1.0f,x)
    or          $0x80000000,    %eax    // mask covering signbit and bits larger than 2^1
    movd        %eax,           %xmm3
    pandn       %xmm1,          %xmm3   // trunc(x) odd ? non-zero : zero
    pxor        %xmm4,          %xmm4
    pcmpeqd     %xmm4,          %xmm3   // trunc(x) odd ? 0 : -1
    andnps      %xmm0,          %xmm3   // trunc(x) odd ? copysignf(1.0f,x) : 0.0f
    addss       %xmm3,          %xmm1
    orps        %xmm2,          %xmm1   // apply signbit to result
L_roundToZero:
#if defined( __i386__ )
    movss       %xmm1,          floatval
    flds        floatval
    add         $4,             %esp
#else
    movaps      %xmm1,          %xmm0
#endif
    ret
    
L_roundUp:                              // we know that x is not integral
    psrad       $31,            %xmm0   // x < 0 ? -1 : 0
    pcmpeqb     %xmm3,          %xmm3   // -1
    psubd       %xmm3,          %xmm0   // x < 0 ? 0 : 1
    cvtdq2ps    %xmm0,          %xmm0   // x < 0 ? 0.0 : 1.0
    addss       %xmm1,          %xmm0   // x < 0 ? trunc(x) : trunc(x) + 1
    orps        %xmm2,          %xmm0   // apply signbit to get the right zero
#if defined( __i386__ )
    movss       %xmm0,          floatval
    flds        floatval
    add         $4,             %esp
#endif
    ret
    
L_roundDown:                            // we know that x is not integral
    psrad       $31,            %xmm0   // x < 0 ? -1 : 0
    cvtdq2ps    %xmm0,          %xmm0   // x < 0 ? -1 : 0
    addss       %xmm1,          %xmm0   // x < 0 ? trunc(x) - 1 : trunc(x)
#if defined( __i386__ )
    movss       %xmm0,          floatval
    flds        floatval
    add         $4,             %esp
#endif
    ret

L_edgeCases:                            // if |x| < 1.0, jump back to mainline
    jl          L_reentryForSmallX      // otherwise, we just return x
#if defined( __i386__ )
    movss       %xmm0,          floatval
    flds        floatval
    add         $4,             %esp
#endif
    ret