expm1f.s   [plain text]


/*
 *  expm1f.s
 *
 *      by Ian Ollmann
 *
 *  Copyright (c) 2007, Apple Inc. All Rights Reserved.
 *
 *  Implementation for C99 expm1f function for i386 and x86_64  ABIs.
 */
 
#include <machine/asm.h>
#include "abi.h"

.const

//minimax polynomial for exp2(x)-1
.align 4
// 8th order minimax fit of exp2 on [-1.0,1.0].  |error| < 0.402865722354948566583852e-9:
expm1f_c:       .quad 0x40bc03f30399c376,   0x3dbea2a63403aaa8          // c4/c8 = 0.961813690023115610862381719985771e-2 / 0.134107709538786543922336536865157e-5, c0 = 0.278626872016317130037181614004e-10
                .quad 0x408f10e7f73e6d8f,   0x3fe62e42fd0933ee	        // c5/c8 = 0.133318252930790403741964203236548e-2 / 0.134107709538786543922336536865157e-5, c1 = .693147176943623740308984004029708   
                .quad 0x405cb616a9384e69,   0x3fcebfbdfd0f0afa	        // c6/c8 = 0.154016177542147239746127455226575e-3 / 0.134107709538786543922336536865157e-5, c2 = .240226505817268621584559118975830   
                .quad 0x4027173ebd288ba1,   0x3fac6b0a74f15403	        // c7/c8 = 0.154832722143258821052933667742417e-4 / 0.134107709538786543922336536865157e-5, c3 = 0.555041568519883074165425891257052e-1 
                .quad 0x3eb67fe1dc3105ba,   0x3ff0000000000000          // c8 = 0.134107709538786543922336536865157e-5, 1.0


//expm1f_table         
                .quad	0xbfef800000000000,	0xbfe40adf8d149383	//-63/64, expm1(-63/64)
                .quad	0xbfef000000000000,	0xbfe3daaae2395759	//-62/64, expm1(-62/64)
                .quad	0xbfee800000000000,	0xbfe3a9b3e10921cd	//-61/64, expm1(-61/64)
                .quad	0xbfee000000000000,	0xbfe377f77a0fcb45	//-60/64, expm1(-60/64)
                .quad	0xbfed800000000000,	0xbfe345729182bf1f	//-59/64, expm1(-59/64)
                .quad	0xbfed000000000000,	0xbfe31221ff0f3ecc	//-58/64, expm1(-58/64)
                .quad	0xbfec800000000000,	0xbfe2de028da7dc59	//-57/64, expm1(-57/64)
                .quad	0xbfec000000000000,	0xbfe2a910fb51295a	//-56/64, expm1(-56/64)
                .quad	0xbfeb800000000000,	0xbfe27349f8ed96eb	//-55/64, expm1(-55/64)
                .quad	0xbfeb000000000000,	0xbfe23caa2a088391	//-54/64, expm1(-54/64)
                .quad	0xbfea800000000000,	0xbfe2052e24a073a5	//-53/64, expm1(-53/64)
                .quad	0xbfea000000000000,	0xbfe1ccd270f070f9	//-52/64, expm1(-52/64)
                .quad	0xbfe9800000000000,	0xbfe1939389388e3d	//-51/64, expm1(-51/64)
                .quad	0xbfe9000000000000,	0xbfe1596dd9858ab1	//-50/64, expm1(-50/64)
                .quad	0xbfe8800000000000,	0xbfe11e5dbf7792a9	//-49/64, expm1(-49/64)
                .quad	0xbfe8000000000000,	0xbfe0e25f8a081941	//-48/64, expm1(-48/64)
                .quad	0xbfe7800000000000,	0xbfe0a56f794ec7a4	//-47/64, expm1(-47/64)
                .quad	0xbfe7000000000000,	0xbfe06789be457e3b	//-46/64, expm1(-46/64)
                .quad	0xbfe6800000000000,	0xbfe028aa7a8b63f2	//-45/64, expm1(-45/64)
                .quad	0xbfe6000000000000,	0xbfdfd19b804dffbf	//-44/64, expm1(-44/64)
                .quad	0xbfe5800000000000,	0xbfdf4fdf228eb2ad	//-43/64, expm1(-43/64)
                .quad	0xbfe5000000000000,	0xbfdecc17c0083500	//-42/64, expm1(-42/64)
                .quad	0xbfe4800000000000,	0xbfde463d1c396301	//-41/64, expm1(-41/64)
                .quad	0xbfe4000000000000,	0xbfddbe46d96cd831	//-40/64, expm1(-40/64)
                .quad	0xbfe3800000000000,	0xbfdd342c7833133a	//-39/64, expm1(-39/64)
                .quad	0xbfe3000000000000,	0xbfdca7e556da7e48	//-38/64, expm1(-38/64)
                .quad	0xbfe2800000000000,	0xbfdc1968b0e55333	//-37/64, expm1(-37/64)
                .quad	0xbfe2000000000000,	0xbfdb88ad9e7d52ea	//-36/64, expm1(-36/64)
                .quad	0xbfe1800000000000,	0xbfdaf5ab13e5474f	//-35/64, expm1(-35/64)
                .quad	0xbfe1000000000000,	0xbfda6057e0e846a4	//-34/64, expm1(-34/64)
                .quad	0xbfe0800000000000,	0xbfd9c8aab046af7a	//-33/64, expm1(-33/64)
                .quad	0xbfe0000000000000,	0xbfd92e9a0720d3ec	//-32/64, expm1(-32/64)
                .quad	0xbfdf000000000000,	0xbfd8921c445f4add	//-31/64, expm1(-31/64)
                .quad	0xbfde000000000000,	0xbfd7f327a018ddb2	//-30/64, expm1(-30/64)
                .quad	0xbfdd000000000000,	0xbfd751b22af608f0	//-29/64, expm1(-29/64)
                .quad	0xbfdc000000000000,	0xbfd6adb1cd9205ee	//-28/64, expm1(-28/64)
                .quad	0xbfdb000000000000,	0xbfd6071c47d953b2	//-27/64, expm1(-27/64)
                .quad	0xbfda000000000000,	0xbfd55de73065b4df	//-26/64, expm1(-26/64)
                .quad	0xbfd9000000000000,	0xbfd4b207f3d79870	//-25/64, expm1(-25/64)
                .quad	0xbfd8000000000000,	0xbfd40373d42ce2e3	//-24/64, expm1(-24/64)
                .quad	0xbfd7000000000000,	0xbfd3521fe8150d2b	//-23/64, expm1(-23/64)
                .quad	0xbfd6000000000000,	0xbfd29e011a428ec6	//-22/64, expm1(-22/64)
                .quad	0xbfd5000000000000,	0xbfd1e70c28b987f3	//-21/64, expm1(-21/64)
                .quad	0xbfd4000000000000,	0xbfd12d35a41ba104	//-20/64, expm1(-20/64)
                .quad	0xbfd3000000000000,	0xbfd07071eef11388	//-19/64, expm1(-19/64)
                .quad	0xbfd2000000000000,	0xbfcf616a79dda3a8	//-18/64, expm1(-18/64)
                .quad	0xbfd1000000000000,	0xbfcddbe7247382af	//-17/64, expm1(-17/64)
                .quad	0xbfd0000000000000,	0xbfcc5041854df7d4	//-16/64, expm1(-16/64)
                .quad	0xbfce000000000000,	0xbfcabe60e1f21836	//-15/64, expm1(-15/64)
                .quad	0xbfcc000000000000,	0xbfc9262c1c3430a1	//-14/64, expm1(-14/64)
                .quad	0xbfca000000000000,	0xbfc78789b0a5e0c0	//-13/64, expm1(-13/64)
                .quad	0xbfc8000000000000,	0xbfc5e25fb4fde211	//-12/64, expm1(-12/64)
                .quad	0xbfc6000000000000,	0xbfc43693d679612d	//-11/64, expm1(-11/64)
                .quad	0xbfc4000000000000,	0xbfc2840b5836cf67	//-10/64, expm1(-10/64)
                .quad	0xbfc2000000000000,	0xbfc0caab118a1278	//-9/64, expm1(-9/64)
                .quad	0xbfc0000000000000,	0xbfbe14aed893eef4	//-8/64, expm1(-8/64)
                .quad	0xbfbc000000000000,	0xbfba85e8c62d9c13	//-7/64, expm1(-7/64)
                .quad	0xbfb8000000000000,	0xbfb6e8caff341fea	//-6/64, expm1(-6/64)
                .quad	0xbfb4000000000000,	0xbfb33d1bb17df2e7	//-5/64, expm1(-5/64)
                .quad	0xbfb0000000000000,	0xbfaf0540438fd5c3	//-4/64, expm1(-4/64)
                .quad	0xbfa8000000000000,	0xbfa7723950130405	//-3/64, expm1(-3/64)
                .quad	0xbfa0000000000000,	0xbf9f8152aee9450e	//-2/64, expm1(-2/64)
                .quad	0xbf90000000000000,	0xbf8fc055004416db	//-1/64, expm1(-1/64)
expm1f_table:   .quad	0x0000000000000000,	0x0000000000000000	//0/64, expm1(0/64)
                .quad	0x3f90000000000000,	0x3f90202ad5778e46	//1/64, expm1(1/64)
                .quad	0x3fa0000000000000,	0x3fa040ac0224fd93	//2/64, expm1(2/64)
                .quad	0x3fa8000000000000,	0x3fa89246d053d178	//3/64, expm1(3/64)
                .quad	0x3fb0000000000000,	0x3fb082b577d34ed8	//4/64, expm1(4/64)
                .quad	0x3fb4000000000000,	0x3fb4cd4fc989cd64	//5/64, expm1(5/64)
                .quad	0x3fb8000000000000,	0x3fb92937074e0cd7	//6/64, expm1(6/64)
                .quad	0x3fbc000000000000,	0x3fbd96b0eff0e794	//7/64, expm1(7/64)
                .quad	0x3fc0000000000000,	0x3fc10b022db7ae68	//8/64, expm1(8/64)
                .quad	0x3fc2000000000000,	0x3fc353bc9fb00b21	//9/64, expm1(9/64)
                .quad	0x3fc4000000000000,	0x3fc5a5ac59b963cb	//10/64, expm1(10/64)
                .quad	0x3fc6000000000000,	0x3fc800f67b00d7b8	//11/64, expm1(11/64)
                .quad	0x3fc8000000000000,	0x3fca65c0b85ac1a9	//12/64, expm1(12/64)
                .quad	0x3fca000000000000,	0x3fccd4315e9e0833	//13/64, expm1(13/64)
                .quad	0x3fcc000000000000,	0x3fcf4c6f5508ee5d	//14/64, expm1(14/64)
                .quad	0x3fce000000000000,	0x3fd0e7510fd7c564	//15/64, expm1(15/64)
                .quad	0x3fd0000000000000,	0x3fd22d78f0fa061a	//16/64, expm1(16/64)
                .quad	0x3fd1000000000000,	0x3fd378c3b0847980	//17/64, expm1(17/64)
                .quad	0x3fd2000000000000,	0x3fd4c946033eb3de	//18/64, expm1(18/64)
                .quad	0x3fd3000000000000,	0x3fd61f14f169ebc1	//19/64, expm1(19/64)
                .quad	0x3fd4000000000000,	0x3fd77a45d8117fd5	//20/64, expm1(20/64)
                .quad	0x3fd5000000000000,	0x3fd8daee6a60c961	//21/64, expm1(21/64)
                .quad	0x3fd6000000000000,	0x3fda4124b2fe50cb	//22/64, expm1(22/64)
                .quad	0x3fd7000000000000,	0x3fdbacff156c79d7	//23/64, expm1(23/64)
                .quad	0x3fd8000000000000,	0x3fdd1e944f6fbdaa	//24/64, expm1(24/64)
                .quad	0x3fd9000000000000,	0x3fde95fb7a7a88f8	//25/64, expm1(25/64)
                .quad	0x3fda000000000000,	0x3fe009a6068f6a8c	//26/64, expm1(26/64)
                .quad	0x3fdb000000000000,	0x3fe0cb4eee42c98b	//27/64, expm1(27/64)
                .quad	0x3fdc000000000000,	0x3fe190048ef60020	//28/64, expm1(28/64)
                .quad	0x3fdd000000000000,	0x3fe257d334137dff	//29/64, expm1(29/64)
                .quad	0x3fde000000000000,	0x3fe322c75a963b98	//30/64, expm1(30/64)
                .quad	0x3fdf000000000000,	0x3fe3f0edb1d18acd	//31/64, expm1(31/64)
                .quad	0x3fe0000000000000,	0x3fe4c2531c3c0d38	//32/64, expm1(32/64)
                .quad	0x3fe0800000000000,	0x3fe59704b03ddca9	//33/64, expm1(33/64)
                .quad	0x3fe1000000000000,	0x3fe66f0fb901f2bd	//34/64, expm1(34/64)
                .quad	0x3fe1800000000000,	0x3fe74a81b74adcac	//35/64, expm1(35/64)
                .quad	0x3fe2000000000000,	0x3fe82968624ac88d	//36/64, expm1(36/64)
                .quad	0x3fe2800000000000,	0x3fe90bd1a87ef9a1	//37/64, expm1(37/64)
                .quad	0x3fe3000000000000,	0x3fe9f1cbb08eb151	//38/64, expm1(38/64)
                .quad	0x3fe3800000000000,	0x3feadb64da2d9acf	//39/64, expm1(39/64)
                .quad	0x3fe4000000000000,	0x3febc8abbf01c781	//40/64, expm1(40/64)
                .quad	0x3fe4800000000000,	0x3fecb9af338d4a9c	//41/64, expm1(41/64)
                .quad	0x3fe5000000000000,	0x3fedae7e481b8284	//42/64, expm1(42/64)
                .quad	0x3fe5800000000000,	0x3feea72849b21ebd	//43/64, expm1(43/64)
                .quad	0x3fe6000000000000,	0x3fefa3bcc305f191	//44/64, expm1(44/64)
                .quad	0x3fe6800000000000,	0x3ff05225beb9ce55	//45/64, expm1(45/64)
                .quad	0x3fe7000000000000,	0x3ff0d47240fe1412	//46/64, expm1(46/64)
                .quad	0x3fe7800000000000,	0x3ff158cc0d22ca02	//47/64, expm1(47/64)
                .quad	0x3fe8000000000000,	0x3ff1df3b68cfb9ef	//48/64, expm1(48/64)
                .quad	0x3fe8800000000000,	0x3ff267c8bb05d2a3	//49/64, expm1(49/64)
                .quad	0x3fe9000000000000,	0x3ff2f27c8ca598a0	//50/64, expm1(50/64)
                .quad	0x3fe9800000000000,	0x3ff37f5f88f7b4e5	//51/64, expm1(51/64)
                .quad	0x3fea000000000000,	0x3ff40e7a7e37aa30	//52/64, expm1(52/64)
                .quad	0x3fea800000000000,	0x3ff49fd65e20b96f	//53/64, expm1(53/64)
                .quad	0x3feb000000000000,	0x3ff5337c3e7cfe38	//54/64, expm1(54/64)
                .quad	0x3feb800000000000,	0x3ff5c97559b6cc28	//55/64, expm1(55/64)
                .quad	0x3fec000000000000,	0x3ff661cb0f6c564f	//56/64, expm1(56/64)
                .quad	0x3fec800000000000,	0x3ff6fc86e505a9dd	//57/64, expm1(57/64)
                .quad	0x3fed000000000000,	0x3ff799b2864d0569	//58/64, expm1(58/64)
                .quad	0x3fed800000000000,	0x3ff83957c6099668	//59/64, expm1(59/64)
                .quad	0x3fee000000000000,	0x3ff8db809e9ca670	//60/64, expm1(60/64)
                .quad	0x3fee800000000000,	0x3ff9803732a14221	//61/64, expm1(61/64)
                .quad	0x3fef000000000000,	0x3ffa2785cd8e63ad	//62/64, expm1(62/64)
                .quad	0x3fef800000000000,	0x3ffad176e45bab25	//63/64, expm1(63/64)


expm1f_taylor_polynomial:       .double         0.5
                                .double         0.16666666666666666666666666666
                                .double         0.04166666666666666666666666666

.literal8
reciprocalLn2:  .quad   0x3ff71547652b82fe                          // 1.0 / ln(2)
two6:           .quad   0x4050000000000000                          // 0x1.0p6


.text

#if defined( __x86_64__ )
	#define RELATIVE_ADDR( _a)								(_a)( %rip )
	#define RELATIVE_ADDR_B( _a)							(_a)( %rip )
#elif defined( __i386__ )
	#define RELATIVE_ADDR( _a)								(_a)-expm1f_body( CX_P )
	#define RELATIVE_ADDR_B( _a)							(_a)-expm1f_no_fenv_body( CX_P )

//a short routine to get the local address
.align 4
expm1f_pic:      movl    (%esp),     %ecx                    //copy address of local_addr to %ecx
                ret
#else
	#error arch not supported
#endif

//0x1.0p-24df
#define LOW_CUTOFF  0x33800000

ENTRY( expm1f )
#if defined( __i386__ )
    movl        FRAME_SIZE(STACKP),     %eax
    movss       FRAME_SIZE(STACKP),     %xmm0
#else
    movd        %xmm0,                  %eax
#endif
    
    movl        %eax,                   %ecx
    andl        $0x7fffffff,            %eax
    cvtss2sd    %xmm0,                  %xmm2
    subl        $LOW_CUTOFF,            %eax        
    cmpl        $(0x42b17218-LOW_CUTOFF), %eax            // if( |x| >= 128.0f * ln(2) || |x| <= 0x1.0p-24f || isnan(f) )
    jae         3f
    
1:
//PIC
#if defined( __i386__ )
    calll       expm1f_pic                                    // set %ecx to point to local_addr
expm1f_body:   
#endif
  
    cmpl        $(0x3f800000-LOW_CUTOFF),    %eax
    jl          2f
    
    // |x| >= 1.0f
    movsd       RELATIVE_ADDR( reciprocalLn2 ), %xmm1       // 1 / ln(2)
    mulsd       %xmm2,                      %xmm1           // x / ln(2)
    cvttsd2si   %xmm1,                      %eax            // trunc( x / ln(2) ) 
    lea         RELATIVE_ADDR( expm1f_c),   CX_P
    cvtsi2sd    %eax,                       %xmm3           // trunc( x / ln(2) )
    addl        $1023,                      %eax            // add bias for exponent
    subsd       %xmm3,                      %xmm1           // f = x / ln(2)  - trunc( x / ln(2) ) 
    movd        %eax,                       %xmm7           // 2**i >> 32
    psllq       $52,                        %xmm7           // 2**i

    // c0 + c1x1 + c2x2 + c3x3 + c4x4 + c5x5 + c6x6 + c7x7 + c8x8 
#if defined( __SSE3__ )
    movddup     %xmm1,                  %xmm2       // { x, x }
#else
    movapd      %xmm1,                  %xmm2       // x
    unpcklpd    %xmm2,                  %xmm2       // { x, x }
#endif    
    mulsd       %xmm1,                  %xmm1       // x*x
    movapd      %xmm2,                  %xmm3
    mulpd       48(CX_P),               %xmm2       // { c3x, (c7/c8)x }
    mulpd       16(CX_P),               %xmm3       // { c1x, (c5/c8)x }
#if defined( __SSE3__ )
    movddup     %xmm1,                  %xmm4       // { xx, xx }
#else
    movapd      %xmm1,                  %xmm4       // xx
    unpcklpd    %xmm4,                  %xmm4       // { xx, xx }
#endif
    mulsd       %xmm1,                  %xmm1       // xx*xx
    addpd       32(CX_P),               %xmm2       // { c2 + c3x, (c6/c8) + (c7/c8)x }
    addpd       (CX_P),                 %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x }
    mulpd       %xmm4,                  %xmm2       // { c2xx + c3xxx, (c6/c8)xx + (c7/c8)xxx }
    addsd       %xmm1,                  %xmm3       // { c0 + c1x, (c4/c8) + (c5/c8)x + xxxx }
    mulsd       64(CX_P),               %xmm1       // c8 * xxxx
    addpd       %xmm2,                  %xmm3       // { c0 + c1x + c2xx + c3xxx, (c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx }
    movhlps     %xmm3,                  %xmm6       // { ?, c0 + c1x + c2xx + c3xxx }
    mulsd       %xmm1,                  %xmm3       // { ..., c8xxxx* ((c4/c8) + (c5/c8)x + (c6/c8)xx + (c7/c8)xxx + xxxx) }
    addsd       %xmm6,                  %xmm3       // c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx
    
    //result = 2**i * ((2**d-1) + 1) - 1
    //       = 2**i * (2**d-1) + 2**i - 1
    mulsd       %xmm7,                  %xmm3       // 2**i * {c0 + c1x + c2xx + c3xxx + c4xxxx + c5xxxxx + c6xxxxxx + c7xxxxxxx + c8xxxxxxxxx}
    subsd       72(CX_P),               %xmm7       // 2**i - 1.0
    addsd       %xmm7,                  %xmm3       

//  convert to single precision and return
    cvtsd2ss    %xmm3,                  %xmm0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE(STACKP)
    flds        FRAME_SIZE(STACKP)
#endif
    ret
    
2:  // |x| < 1.0f
    movsd       RELATIVE_ADDR( two6 ),  %xmm1       // 0x1.0p6
    mulsd       %xmm2,                  %xmm1       //  x * 0x1.0p6
    cvttsd2si   %xmm1,                  %eax        // i = trunc( x * 0x1.0p6 )
#if defined( __x86_64__ )
    cdqe                                            // sign extend eax
#endif
    shl         $4,                     AX_P      // sizeof( double[2] ) * trunc( x * 0x1.0p6 )
    lea         RELATIVE_ADDR( expm1f_table ), DX_P
    subsd       (DX_P, AX_P, 1),        %xmm2       // x -= i/64
    movsd       8(DX_P, AX_P, 1),       %xmm7       // e**i - 1
    
    //calculate e**d-1 using minimax polynomial
    lea         RELATIVE_ADDR( expm1f_taylor_polynomial), CX_P
    
    movsd       (2*8)(CX_P),            %xmm3       // 1/24
    mulsd       %xmm2,                  %xmm3       // d/24
    movapd      %xmm2,                  %xmm4       // d
    addsd       (1*8)(CX_P),            %xmm3       // 1/6 + d/24
    mulsd       %xmm4,                  %xmm4       // d * d
    mulsd       %xmm2,                  %xmm3       // d/6 + dd/24
    addsd       (CX_P),                 %xmm3       // 1/2 + d/6 + dd/24
    mulsd       %xmm4,                  %xmm3       // dd/2 + ddd/6 + dddd/24
    addsd       %xmm2,                  %xmm3       // d + dd/2 + ddd/6 + dddd/24
    
    //we have to do a little reduction here, otherwise we end up doing a 6th order Taylor series and it still is not very good:
    // e**x-1   = e**(i+d) -1         c is a table entry chosen so |b| < 1/32
    //          = [(e**(i)-1)+1][(e**(d)-1)+1] -1  = (y+1)(z+1)-1               y = e**(i)-1,  z = e**(d)-1
    //          = yz + y + z + 1 -1 = y + z + yz    
    movapd      %xmm7,                  %xmm4       // y
    mulsd       %xmm3,                  %xmm4       // y*z
    addsd       %xmm3,                  %xmm4       // yz + z
    addsd       %xmm7,                  %xmm4       // (yz + z) + y
    
//  convert to single precision and return
    cvtsd2ss    %xmm4,                  %xmm0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE(STACKP)
    flds        FRAME_SIZE(STACKP)
#endif
    ret
    
3: // |x| >= 128.0f * ln(2) || |x| <= LOW_CUTOFF || isnan(f)
    jl          4f
    
    // |x| >= 128.0f * ln(2) || isnan(f)

    //handle inf and NaN arguments
    cmpl    $(0x7f800000-LOW_CUTOFF),   %eax
    je          5f          // |x| == inf
    jg          6f          // isnan( x )
    
    //bounce denormal results back to the main code path
    // if( x > -150.0f * ln(2)
    cmpl    $0xc2cff1b4,                %ecx        //if( -128.0f*ln(2) >= x > -150.0f*ln(2) )
    jle          1b                                  //     go back to 1
    
    // |x| >= 128.0f * ln(2)
    xorps       %xmm1,                  %xmm1       // 0.0f
    movl        $0x7f7fffff,            %ecx
    movl        $0x21800000,            %eax        // 0x1.0p-60
    cmpltss     %xmm1,                  %xmm0       // x < 0 
    movd        %ecx,                   %xmm3
    movd        %eax,                   %xmm4
    movl        $0x3f800000,            %eax
    andps       %xmm0,                  %xmm4       // x < 0 ? 0x1.0p-60f : 0
    andnps      %xmm3,                  %xmm0       // x < 0 ? 0 : MAX_FLOAT
    orps        %xmm4,                  %xmm0       // x < 0 ? 0x1.0p-60f : MAX_FLT
    mulss       %xmm0,                  %xmm0       // x < 0 ? 0x1.0p-120f : Inf, overflow 
    movd        %eax,                   %xmm1
    subss       %xmm1,                  %xmm0       // x < 0 ? 0x1.0p-120f - 1.0f (inexact, round to correct result) : Inf, overflow
    
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret
    
4: // |x| <= LOW_CUTOFF
    movl        $0x3cb00000,            %eax        // 0x1.0p-52 >> 32
    movl        $0x3ff00000,            %ecx        // 1.0 >> 32
    movd        %eax,                   %xmm1
    movd        %ecx,                   %xmm3
    psllq       $32,                    %xmm1       // 0x1.0p-52
    psllq       $32,                    %xmm3       // 1.0
    addsd       %xmm3,                  %xmm1       // 1.0 + DBL_EPSILON
    mulsd       %xmm1,                  %xmm2       // nudge towards correctly rounded
    cvtsd2ss    %xmm2,                  %xmm0       // round correctly, set inexact
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret

5:  // |x| == inf, return -1 or Inf
    movl        $0x3f800000,            %eax
    movl        $0x7f800000,            %ecx
    movd        %eax,                   %xmm1
    movd        %ecx,                   %xmm3
    cmpeqss     %xmm0,                  %xmm3       // x == inf
    andps       %xmm3,                  %xmm0       // x == inf ? inf : 0
    subss       %xmm1,                  %xmm0       // x == inf ? inf : -1.0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret

6:  // |x|| == NaN
    addss       %xmm0,                  %xmm0
#if defined( __i386__ )
    movss       %xmm0,                  FRAME_SIZE( STACKP )
    flds        FRAME_SIZE(STACKP)
#endif
    ret