#if defined( __i386__ )
#include "xmmLibm_prefix.h"
static const double T1 = 0x1.F800000000000p-1; static const double T2 = 0x1.0400000000000p+0; static const double T52 = 0x1.0000000000000p+52; static const double T53 = 0x1.0000000000000p+53; static const double Emax = 0x1.62E42FEFA39EFp+9; static const double Emin = -0x1.74385446D71C3p+9; static const double A1 = 0x1.5555555555555p-4; static const double A2 = 0x1.99999999FE736p-7; static const double A3 = 0x1.24919E6600596p-9; static const double B1 = 0x1.555555554F9EDp-4; static const double B2 = 0x1.999A118F10BD9p-7; static const double C1 = 0x1.5555555555555p-4; static const double C2 = 0x1.99999999AF7C8p-7; static const double C3 = 0x1.24923B2F1315Cp-9; static const double C4 = 0x1.CE20EE795DBA1p-12; static const double C5 = 0x1.CE20EE795DBA1p-12; static const double Inv_L = 0x1.71547652B82FEp+5; static const double L1 = 0x1.62E42FEF00000p-6; static const double L2 = 0x1.473DE6AF278EDp-39; static const double P1 = 0x1.0000000000000p-1; static const double P2 = 0x1.5555555547334p-3; static const double P3 = 0x1.555555554A9D2p-5; static const double P4 = 0x1.1111609FBE568p-7; static const double P5 = 0x1.6C172098CB78Bp-10; static const double plusinf = 1e500;
static const double logtable[129*2] = {
0.0, 0.0, 0x1.FE02A6B200000p-8, -0x1.F30EE07912DF9p-41, 0x1.FC0A8B1000000p-7, -0x1.FE0E183092C59p-42, 0x1.7B91B07D80000p-6, -0x1.2772AB6C0559Cp-41, 0x1.F829B0E780000p-6, 0x1.980267C7E09E4p-45, 0x1.39E87BA000000p-5, -0x1.42A056FEA4DFDp-41, 0x1.77458F6340000p-5, -0x1.2303B9CB0D5E1p-41, 0x1.B42DD71180000p-5, 0x1.71BEC28D14C7Ep-41, 0x1.F0A30C0100000p-5, 0x1.62A6617CC9717p-41, 0x1.16536EEA40000p-4, -0x1.0A3E2F3B47D18p-41, 0x1.341D7961C0000p-4, -0x1.717B6B33E44F8p-43, 0x1.51B073F060000p-4, 0x1.83F69278E686Ap-44, 0x1.6F0D28AE60000p-4, -0x1.2968C836CC8C2p-41, 0x1.8C345D6320000p-4, -0x1.937C294D2F567p-42, 0x1.A926D3A4A0000p-4, 0x1.AAC6CA17A4554p-41, 0x1.C5E548F5C0000p-4, -0x1.C5E7514F4083Fp-43, 0x1.E27076E2A0000p-4, 0x1.E5CBD3D50FFFCp-41, 0x1.FEC9131DC0000p-4, -0x1.54555D1AE6607p-44, 0x1.0D77E7CD10000p-3, -0x1.C69A65A23A170p-41, 0x1.1B72AD52F0000p-3, 0x1.9E80A41811A39p-41, 0x1.29552F8200000p-3, -0x1.5B967F4471DFCp-44, 0x1.371FC201F0000p-3, -0x1.C22F10C9A4EA8p-41, 0x1.44D2B6CCB0000p-3, 0x1.F4799F4F6543Ep-41, 0x1.526E5E3A20000p-3, -0x1.2F21746FF8A47p-41, 0x1.5FF3070A80000p-3, -0x1.B0B0DE3077D7Ep-41, 0x1.6D60FE71A0000p-3, -0x1.6F1B955C4D1DAp-42, 0x1.7AB8902110000p-3, -0x1.37B720E4A694Bp-42, 0x1.87FA065210000p-3, -0x1.B77B7EFFB7F41p-42, 0x1.9525A9CF40000p-3, 0x1.5AD1D904C1D4Ep-41, 0x1.A23BC1FE30000p-3, -0x1.2A739B23B93E1p-41, 0x1.AF3C94E810000p-3, -0x1.00349CC67F9B2p-41, 0x1.BC286742E0000p-3, -0x1.CCA75818C5DBCp-41, 0x1.C8FF7C79B0000p-3, -0x1.97794F689F843p-41, 0x1.D5C216B500000p-3, -0x1.11BA91BBCA682p-41, 0x1.E27076E2B0000p-3, -0x1.A342C2AF0003Cp-44, 0x1.EF0ADCBDC0000p-3, 0x1.64D948637950Ep-41, 0x1.FB9186D5E0000p-3, 0x1.F1546AAA3361Cp-42, 0x1.0402594B50000p-2, -0x1.7DF928EC217A5p-41, 0x1.0A324E2738000p-2, 0x1.0E35F73F7A018p-42, 0x1.1058BF9AE8000p-2, -0x1.A9573B02FAA5Ap-41, 0x1.1675CABAB8000p-2, 0x1.30701CE63EAB9p-41, 0x1.1C898C1698000p-2, 0x1.9FAFBC68E7540p-42, 0x1.22941FBCF8000p-2, -0x1.A6976F5EB0963p-44, 0x1.2895A13DE8000p-2, 0x1.A8D7AD24C13F0p-44, 0x1.2E8E2BAE10000p-2, 0x1.D309C2CC91A85p-42, 0x1.347DD9A988000p-2, -0x1.5594DD4C58092p-45, 0x1.3A64C55698000p-2, -0x1.D0B1C68651946p-41, 0x1.4043086868000p-2, 0x1.3F1DE86093EFAp-41, 0x1.4618BC21C8000p-2, -0x1.09EC17A426426p-41, 0x1.4BE5F95778000p-2, -0x1.D7C92CD9AD824p-44, 0x1.51AAD872E0000p-2, -0x1.F4BD8DB0A7CC1p-44, 0x1.5767717458000p-2, -0x1.2C9D5B2A49AF9p-41, 0x1.5D1BDBF580000p-2, 0x1.394A11B1C1EE4p-43, 0x1.62C82F2BA0000p-2, -0x1.C356848506EADp-41, 0x1.686C81E9B0000p-2, 0x1.4AEC442BE1015p-42, 0x1.6E08EAA2B8000p-2, 0x1.0F1C609C98C6Cp-41, 0x1.739D7F6BC0000p-2, -0x1.7FCB18ED9D603p-41, 0x1.792A55FDD8000p-2, -0x1.C2EC1F512DC03p-41, 0x1.7EAF83B828000p-2, 0x1.7E1B259D2F3DAp-41, 0x1.842D1DA1E8000p-2, 0x1.62E927628CBC2p-43, 0x1.89A3386C18000p-2, -0x1.ED2A52C73BF78p-41, 0x1.8F11E87368000p-2, -0x1.D3881E8962A96p-42, 0x1.947941C210000p-2, 0x1.6FABA4CDD147Dp-42, 0x1.99D9581180000p-2, -0x1.F753456D113B8p-42, 0x1.9F323ECBF8000p-2, 0x1.84BF2B68D766Fp-42, 0x1.A484090E58000p-2, 0x1.D8515FE535B87p-41, 0x1.A9CEC9A9A0000p-2, 0x1.0931A909FEA5Ep-43, 0x1.AF12932478000p-2, -0x1.E53BB31EED7A9p-44, 0x1.B44F77BCC8000p-2, 0x1.EC5197DDB55D3p-43, 0x1.B985896930000p-2, 0x1.0FB598FB14F89p-42, 0x1.BEB4D9DA70000p-2, 0x1.B7BF7861D37ACp-42, 0x1.C3DD7A7CD8000p-2, 0x1.6A6B9D9E0A5BDp-41, 0x1.C8FF7C79A8000p-2, 0x1.A21AC25D81EF3p-42, 0x1.CE1AF0B860000p-2, -0x1.8290905A86AA6p-43, 0x1.D32FE7E010000p-2, -0x1.42A9E21373414p-42, 0x1.D83E7258A0000p-2, 0x1.79F2828ADD176p-41, 0x1.DD46A04C20000p-2, -0x1.DAFA08CECADB1p-41, 0x1.E24881A7C8000p-2, -0x1.3D9E34270BA6Bp-42, 0x1.E744261D68000p-2, 0x1.E1F8DF68DBCF3p-44, 0x1.EC399D2468000p-2, 0x1.9802EB9DCA7E7p-43, 0x1.F128F5FAF0000p-2, 0x1.BB2CD720EC44Cp-44, 0x1.F6123FA700000p-2, 0x1.45630A2B61E5Bp-41, 0x1.FAF588F790000p-2, -0x1.9C24CA098362Bp-43, 0x1.FFD2E08580000p-2, -0x1.6CF54D05F9367p-43, 0x1.02552A5A5C000p-1, 0x1.0FEC69C695D7Fp-41, 0x1.04BDF9DA94000p-1, -0x1.92D9A033EFF75p-41, 0x1.0723E5C1CC000p-1, 0x1.F404E57963891p-41, 0x1.0986F4F574000p-1, -0x1.5BE8DC04AD601p-42, 0x1.0BE72E4254000p-1, -0x1.57D49676844CCp-41, 0x1.0E44985D1C000p-1, 0x1.917EDD5CBBD2Dp-42, 0x1.109F39E2D4000p-1, 0x1.92DFBC7D93617p-42, 0x1.12F7195940000p-1, -0x1.043ACFEDCE638p-41, 0x1.154C3D2F4C000p-1, 0x1.5E9A98F33A396p-41, 0x1.179EABBD88000p-1, 0x1.9A0BFC60E6FA0p-41, 0x1.19EE6B467C000p-1, 0x1.2DD98B97BAEF0p-42, 0x1.1C3B81F714000p-1, -0x1.EDA1B58389902p-44, 0x1.1E85F5E704000p-1, 0x1.A07BD8B34BE7Cp-46, 0x1.20CDCD192C000p-1, -0x1.4926CAFC2F08Ap-41, 0x1.23130D7BEC000p-1, -0x1.7AFA4392F1BA7p-46, 0x1.2555BCE990000p-1, -0x1.06987F78A4A5Ep-42, 0x1.2795E1289C000p-1, -0x1.DCA290F81848Dp-42, 0x1.29D37FEC2C000p-1, -0x1.EEA6F465268B4p-42, 0x1.2C0E9ED448000p-1, 0x1.D1772F5386374p-42, 0x1.2E47436E40000p-1, 0x1.34202A10C3491p-44, 0x1.307D7334F0000p-1, 0x1.0BE1FB590A1F5p-41, 0x1.32B1339120000p-1, 0x1.D71320556B67Bp-41, 0x1.34E289D9D0000p-1, -0x1.E2CE9146D277Ap-41, 0x1.37117B5474000p-1, 0x1.ED71774092113p-43, 0x1.393E0D3564000p-1, -0x1.5E6563BBD9FC9p-41, 0x1.3B6844A000000p-1, -0x1.EEA838909F3D3p-44, 0x1.3D9026A714000p-1, 0x1.6FAA404263D0Bp-41, 0x1.3FB5B84D18000p-1, -0x1.0BDA4B162AFA3p-41, 0x1.41D8FE8468000p-1, -0x1.AA33736867A17p-42, 0x1.43F9FE2F9C000p-1, 0x1.CCEF4E4F736C2p-42, 0x1.4618BC21C4000p-1, 0x1.EC27D0B7B37B3p-41, 0x1.48353D1EA8000p-1, 0x1.1BEE7ABD17660p-42, 0x1.4A4F85DB04000p-1, -0x1.44FDD840B8591p-45, 0x1.4C679AFCD0000p-1, -0x1.1C64E971322CEp-41, 0x1.4E7D811B74000p-1, 0x1.BB09CB0985646p-41, 0x1.50913CC018000p-1, -0x1.794B434C5A4F5p-41, 0x1.52A2D265BC000p-1, 0x1.6ABB9DF22BC57p-43, 0x1.54B2467998000p-1, 0x1.497A915428B44p-41, 0x1.56BF9D5B40000p-1, -0x1.8CD7DC73BD194p-42, 0x1.58CADB5CD8000p-1, -0x1.9DB3DB43689B4p-43, 0x1.5AD404C358000p-1, 0x1.F2CFB29AAA5F0p-41, 0x1.5CDB1DC6C0000p-1, 0x1.7648CF6E3C5D7p-41, 0x1.5EE02A9240000p-1, 0x1.67570D6095FD2p-41, 0x1.60E32F4478000p-1, 0x1.1B194F912B417p-42, 0x1.62E42FEFA4000p-1, -0x1.8432A1B0E2634p-43 };
static const double exptable[32*2] = {
0x1.0000000000000p+0, 0.0, 0x1.059B0D3158540p+0, 0x1.A1D73E2A475B4p-47, 0x1.0B5586CF98900p+0, 0x1.EC5317256E308p-49, 0x1.11301D0125B40p+0, 0x1.0A4EBBF1AED93p-48, 0x1.172B83C7D5140p+0, 0x1.D6E6FBE462876p-47, 0x1.1D4873168B980p+0, 0x1.53C02DC0144C8p-47, 0x1.2387A6E756200p+0, 0x1.C3360FD6D8E0Bp-47, 0x1.29E9DF51FDEC0p+0, 0x1.09612E8AFAD12p-47, 0x1.306FE0A31B700p+0, 0x1.52DE8D5A46306p-48, 0x1.371A7373AA9C0p+0, 0x1.54E28AA05E8A9p-49, 0x1.3DEA64C123400p+0, 0x1.11ADA0911F09Fp-47, 0x1.44E0860618900p+0, 0x1.68189B7A04EF8p-47, 0x1.4BFDAD5362A00p+0, 0x1.38EA1CBD7F621p-47, 0x1.5342B569D4F80p+0, 0x1.DF0A83C49D86Ap-52, 0x1.5AB07DD485400p+0, 0x1.4AC64980A8C8Fp-47, 0x1.6247EB03A5580p+0, 0x1.2C7C3E81BF4B7p-50, 0x1.6A09E667F3BC0p+0, 0x1.921165F626CDDp-49, 0x1.71F75E8EC5F40p+0, 0x1.9EE91B8797785p-47, 0x1.7A11473EB0180p+0, 0x1.B5F54408FDB37p-50, 0x1.82589994CCE00p+0, 0x1.28ACF88AFAB35p-48, 0x1.8ACE5422AA0C0p+0, 0x1.B5BA7C55A192Dp-48, 0x1.93737B0CDC5C0p+0, 0x1.27A280E1F92A0p-47, 0x1.9C49182A3F080p+0, 0x1.01C7C46B071F3p-48, 0x1.A5503B23E2540p+0, 0x1.C8B424491CAF8p-48, 0x1.AE89F995AD380p+0, 0x1.6AF439A68BB99p-47, 0x1.B7F76F2FB5E40p+0, 0x1.BAA9EC206AD4Fp-50, 0x1.C199BDD855280p+0, 0x1.C2220CB12A092p-48, 0x1.CB720DCEF9040p+0, 0x1.48A81E5E8F4A5p-47, 0x1.D5818DCFBA480p+0, 0x1.C976816BAD9B8p-50, 0x1.DFC97337B9B40p+0, 0x1.EB968CAC39ED3p-48, 0x1.EA4AFA2A490C0p+0, 0x1.9858F73A18F5Ep-48, 0x1.F50765B6E4540p+0, 0x1.9D3E12DD8A18Bp-54 };
static const double one = 1.0;
static const double mOne = -1.0;
static const double two = 2.0;
static const xSInt64 bias[2] = { {2045, 0}, {1023, 0} };
static const xUInt64 POWER_NAN = { 0x7FF8000000000025ULL, 0 };
static inline xDouble ipow( xDouble x, uint32_t u ) ALWAYS_INLINE;
static inline xDouble _pow( xDouble X, xDouble Y ) ALWAYS_INLINE;
static inline xDouble xscalb( xDouble x, int M )
{
static const double scale[2] = { 0x1.0p1022, 0x1.0p-1022 };
static const int step[2] = { 1022, -1022 };
int index = M >> 31;
int count = abs(M);
xSInt64 xm = _mm_cvtsi32_si128( M + 1023 );
xSInt64 xstep = _mm_cvtsi32_si128( step[-index] );
xDouble xscale = _mm_load_sd( scale - index );
if( count > 1022 )
{
x = _mm_mul_sd( x, xscale );
xm = _mm_sub_epi64( xm, xstep );
count -= 1022;
if( count > 1022 )
{
x = _mm_mul_sd( x, xscale );
xm = _mm_sub_epi64( xm, xstep );
count -= 1022;
if( count > 1022 )
return _mm_mul_sd( x, xscale );
}
}
xm = _mm_slli_epi64( xm, 52 );
return _mm_mul_sd( x, (xDouble) xm );
}
static inline xDouble ipow( xDouble x, uint32_t u )
{
xDouble result;
uint32_t mask = 1;
if( 0 == u )
return _mm_load_sd( &one );
while( u && 0 == (u & mask) )
{
x = _mm_mul_sd( x,x );
mask += mask;
}
result = x;
u &= ~mask;
while( u )
{
x = _mm_mul_sd( x,x );
mask += mask;
if( u & mask )
{
result = _mm_mul_sd( result, x );
u &= ~mask;
}
}
return result;
}
static inline xDouble _pow( xDouble X, xDouble Y )
{
int oldMXCSR = _mm_getcsr();
int newMXCSR = (oldMXCSR | DEFAULT_MXCSR) & DEFAULT_MASK; if( newMXCSR != oldMXCSR )
_mm_setcsr( newMXCSR );
static const double tenM20 = 1e-20;
static const double ten20 = 1e20;
static const double d16 = 16.0;
static const double d128 = 128.0;
static const double r128 = 1.0/128.0;
static const double r8 = 1.0/8.0;
static const double smallestNormal = 0x1.0p-1022;
static const xUInt64 iOne = { 1, 0 };
if( _mm_istrue_sd( _mm_or_pd( _mm_cmpeq_sdm( X, &one ), _mm_cmpeq_sdm( Y, (double*) &minusZeroD ) )) )
{
X = _mm_load_sd( &one );
if( newMXCSR != oldMXCSR )
_mm_setcsr( oldMXCSR );
return X;
}
if( _mm_istrue_sd( _mm_cmpunord_sd( X, Y ) ) )
{
if( newMXCSR != oldMXCSR )
_mm_setcsr( oldMXCSR );
return _mm_add_sd( X, Y );
}
xDouble fabsY = _mm_andnot_pd( minusZeroD, Y );
xDouble fabsX = _mm_andnot_pd( minusZeroD, X );
xDouble fabsyIsInf = _mm_cmpeq_sdm( fabsY, &plusinf );
xDouble signPatch = _mm_setzero_pd();
xDouble floorbias = _mm_load_sd( &T52 );
floorbias = _mm_and_pd( floorbias, _mm_cmplt_sd( fabsY, floorbias ) );
xDouble yShifted = _mm_add_sd( fabsY, floorbias);
xDouble roundFabsY = _mm_sub_sd( yShifted, floorbias );
xDouble yIsEven = (xDouble) _mm_sub_epi64( _mm_and_si128( (xUInt64) yShifted, iOne ), iOne );
xDouble yIsInt = _mm_cmpeq_sd( roundFabsY, fabsY);
int intY = _mm_cvtsd_si32( Y );
_mm_setcsr( newMXCSR ); yIsEven = _mm_or_pd( yIsEven, _mm_cmpge_sdm( fabsY, &T53 ) ); yIsInt = _mm_andnot_pd( fabsyIsInf, yIsInt );
xDouble fabsxLTInf = _mm_cmplt_sdm( fabsX, &plusinf );
if( _mm_istrue_sd( _mm_and_pd( _mm_cmplt_sdm( X, (double*) &minusZeroD ), fabsxLTInf ))) {
if( _mm_istrue_sd( yIsInt ) )
{
X = fabsX; signPatch = _mm_andnot_pd( yIsEven, minusZeroD );
if( _mm_istrue_sd( _mm_cmpeq_sdm( X, &one )) )
{
X = _mm_load_sd( &one );
goto exit;
}
}
else
{
if( _mm_istrue_sd( _mm_cmplt_sdm( fabsY, &plusinf ) ) )
{
X = (xDouble) POWER_NAN;
oldMXCSR |= INVALID_FLAG;
goto exit;
}
}
}
xDouble xGTzero = _mm_cmpgt_sdm( X, (double*) &minusZeroD );
xDouble yLTzero = _mm_cmplt_sdm( Y, (double*) &minusZeroD );
xDouble xEQzero = _mm_cmpeq_sdm( X, (double*) &minusZeroD );
if( _mm_istrue_sd( _mm_andnot_pd( xEQzero, yIsInt)) && intY != 0x80000000 )
{
if( intY < 0 )
{
_mm_setcsr( DEFAULT_MXCSR );
xDouble recip = _mm_div_sd( _mm_load_sd( &one ), X );
if( 0 == (_mm_getcsr() & INEXACT_FLAG ) )
{
X = ipow( recip, -intY );
oldMXCSR |= _mm_getcsr() & ALL_FLAGS;
_mm_setcsr( oldMXCSR );
X = _mm_xor_pd( X, signPatch );
return X;
}
_mm_setcsr( newMXCSR );
}
else
{
X = ipow( X, intY );
oldMXCSR |= _mm_getcsr() & ALL_FLAGS;
_mm_setcsr( oldMXCSR );
X = _mm_xor_pd( X, signPatch );
return X;
}
}
xDouble fabsyEQhalf = _mm_cmpeq_sdm( fabsY, &P1 );
if( _mm_istrue_sd( _mm_and_pd( fabsyEQhalf, xGTzero ) ) )
{
_mm_setcsr( oldMXCSR );
X = _MM_SQRT_SD( X );
if( _mm_istrue_sd( yLTzero ) )
X = _mm_div_sd( _mm_load_sd( &one ), X );
return X;
}
xDouble xLTinf = _mm_cmplt_sdm( X, &plusinf );
xDouble fabsyGT1em20 = _mm_cmpgt_sdm( fabsY, &tenM20 );
xDouble fabsyLT1e20 = _mm_cmplt_sdm( fabsY, &ten20 );
if( _mm_istrue_sd( _mm_and_pd( _mm_and_pd( xGTzero, xLTinf ), _mm_and_pd( fabsyGT1em20, fabsyLT1e20 ) ) ) ) {
xDouble z1, z2, w1, w2;
xDouble t1LTx = _mm_cmpgt_sdm( X, &T1 );
xDouble xLTt2 = _mm_cmplt_sdm( X, &T2 );
if( _mm_istrue_sd( _mm_and_pd( t1LTx, xLTt2 ) ) )
{
xDouble xEQone = _mm_cmpeq_sdm( X, &one );
if( _mm_istrue_sd( xEQone ) )
{
X = _mm_load_sd( &one );
goto exit;
}
xDouble f = _mm_sub_sdm( X, &one );
xDouble f1 = _mm_cvtss_sd( f, _mm_cvtsd_ss( (xFloat)f, f ));
xDouble f2 = _mm_sub_sd( f, f1 );
xDouble g = _mm_add_sdm( f, &two );
g = _mm_div_sd( _mm_load_sd( &one ), g );
xDouble u = _mm_mul_sd( f, g );
u = _mm_add_sd( u, u );
xDouble v = _mm_mul_sd( u, u );
xDouble q = _mm_add_sdm( _mm_mul_sdm( v, &C5 ), &C4 );
q = _mm_add_sdm( _mm_mul_sd( q, v ), &C3 );
q = _mm_add_sdm( _mm_mul_sd( q, v ), &C2 );
q = _mm_add_sdm( _mm_mul_sd( q, v ), &C1 );
v = _mm_mul_sd( v, u );
q = _mm_mul_sd( q, v );
xDouble u1 = _mm_cvtss_sd( u, _mm_cvtsd_ss( (xFloat)u, u ));
xDouble u2 = _mm_sub_sd( f, u1 );
u2 = _mm_add_sd( u2, u2 );
u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f1 ) );
u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f2 ) );
u2 = _mm_mul_sd( u2, g );
z2 = _mm_add_sd( q, u2 );
z1 = _mm_add_sd( u1, z2 );
z1 = _mm_cvtss_sd( z1, _mm_cvtsd_ss( (xFloat)z1, z1 ));
z2 = _mm_add_sd( z2, _mm_sub_sd( u1, z1 ));
}
else
{
xDouble xOne = _mm_load_sd( &one );
xDouble isDenormal = _mm_cmplt_sdm( fabsX, &smallestNormal );
xDouble denormBias = _mm_and_pd( xOne, isDenormal );
int isdenorm = _mm_cvtsi128_si32( (xSInt64) isDenormal );
X = _mm_or_pd( X, denormBias );
X = _mm_sub_sd( X, denormBias );
xDouble inf = _mm_load_sd( &plusinf );
xDouble exponent = _mm_and_pd( X, inf);
xDouble x = _mm_andnot_pd( inf, X );
xSInt64 xi = _mm_srli_epi64( (xUInt64) exponent, 52 );
x = _mm_or_pd( x, xOne );
xi = _mm_sub_epi32( xi, bias[1 + isdenorm] );
exponent = _mm_cvtepi32_pd( (xSInt32) xi );
int j = _mm_cvttsd_si32( _mm_add_sdm( _mm_mul_sdm( x, &d128) , &P1 ));
xDouble F = _mm_mul_sdm( _mm_cvtsi32_sd( X, j ), &r128 );
xDouble g = _mm_div_sd( xOne, _mm_add_sd( F, x ) );
xDouble f = _mm_sub_sd( x, F );
xDouble a1 = _mm_add_sdm( _mm_mul_sdm( exponent, &logtable[128*2]), &logtable[(j-128)*2] );
xDouble a2 = _mm_add_sdm( _mm_mul_sdm( exponent, &logtable[128*2]+1), &logtable[(j-128)*2+1] );
xDouble u = _mm_mul_sd( f, g );
u = _mm_add_sd( u, u );
xDouble u1 = _mm_cvtss_sd( u, _mm_cvtsd_ss( (xFloat)u, u ) );
xDouble v = _mm_mul_sd( u, u );
xDouble c = _mm_mul_sd( Y, a1 );
c = _mm_andnot_pd( minusZeroD, c );
xDouble cLTsixteen = _mm_cmplt_sdm( c, &d16 );
xDouble cLToneeighth = _mm_cmplt_sdm( c, &r8 );
xDouble q, u2;
if( _mm_istrue_sd( cLToneeighth ) )
{
u2 = _mm_sub_sd( f, _mm_mul_sd( u1, F ) );
u2 = _mm_add_sd( u2, u2 );
u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f ) );
u2 = _mm_mul_sd( u2, g );
q = _mm_add_sdm( _mm_mul_sdm( v, &B2 ), &B1 );
q = _mm_mul_sd( q, v );
q = _mm_mul_sd( q, u );
}
else
{
if( _mm_istrue_sd( cLTsixteen ) )
{
u2 = _mm_sub_sd( f, _mm_mul_sd( u1, F ) );
u2 = _mm_add_sd( u2, u2 );
u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f ) );
u2 = _mm_mul_sd( u2, g );
q = _mm_add_sdm( _mm_mul_sdm( v, &A3 ), &A2 );
q = _mm_add_sdm( _mm_mul_sd( q, v ), &A1 );
q = _mm_mul_sd( _mm_mul_sd( q, v ), u );
}
else
{
xDouble f1 = _mm_cvtss_sd( f, _mm_cvtsd_ss( (xFloat) f, f ));
xDouble f2 = _mm_sub_sd( f, f1 );
u2 = _mm_sub_sd( f, _mm_mul_sd( u1, F ) ); u2 = _mm_add_sd( u2, u2 ); u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f1 ) ); u2 = _mm_sub_sd( u2, _mm_mul_sd( u1, f2 ) ); u2 = _mm_mul_sd( u2, g );
q = _mm_add_sdm( _mm_mul_sdm( v, &A3 ), &A2 );
q = _mm_add_sdm( _mm_mul_sd( q, v ), &A1 );
q = _mm_mul_sd( _mm_mul_sd( q, v ), u );
}
}
xDouble p = _mm_add_sd( u2, q );
xDouble t = _mm_add_sd( a1, u1 );
a2 = _mm_add_sd( a2, _mm_add_sd( _mm_sub_sd( a1, t), u1 ) );
z2 = _mm_add_sd( p, a2 );
z1 = _mm_add_sd( t, z2 );
z1 = _mm_cvtss_sd( z1, _mm_cvtsd_ss( (xFloat) z1, z1 ));
z2 = _mm_add_sd( z2, _mm_sub_sd( t, z1) );
}
{
xDouble y1 = _mm_cvtss_sd( Y, _mm_cvtsd_ss( (xFloat) Y, Y ));
xDouble y2 = _mm_sub_sd( Y, y1 );
w2 = _mm_add_sd( _mm_mul_sd( y2, z1 ), _mm_mul_sd( y2, z2 ));
w2 = _mm_add_sd( w2, _mm_mul_sd( y1, z2 ) );
w1 = _mm_mul_sd( y1, z1 );
}
xDouble overflow = _mm_add_sd( w1, w2);
xDouble underflow = _mm_cmplt_sdm( overflow, &Emin );
overflow = _mm_cmpgt_sdm( overflow, &Emax );
if( _mm_istrue_sd( _mm_or_pd( overflow, underflow ) ) )
{
int overflowFlag = _mm_cvtsi128_si32( (xUInt64) overflow ) & OVERFLOW_FLAG;
int underflowFlag = _mm_cvtsi128_si32( (xUInt64) underflow ) & UNDERFLOW_FLAG;
oldMXCSR |= overflowFlag | underflowFlag | INEXACT_FLAG;
X = _mm_and_pd( _mm_load_sd( &plusinf ), overflow );
goto exit;
}
{
int N = _mm_cvtsd_si32( _mm_mul_sdm( w1, &Inv_L) );
int j = N & 31;
int M = N >> 5;
xDouble dN = _mm_cvtsi32_sd( w1, N );
xDouble r = _mm_sub_sd( w1, _mm_mul_sdm( dN, &L1 ) );
r = _mm_add_sd( r, w2 );
r = _mm_sub_sd( r, _mm_mul_sdm( dN, &L2 ) );
xDouble p = _mm_add_sdm( _mm_mul_sdm( r, &P5 ), &P4 );
p = _mm_add_sdm( _mm_mul_sd( p, r ), &P3 );
p = _mm_add_sdm( _mm_mul_sd( p, r ), &P2 );
p = _mm_add_sdm( _mm_mul_sd( p, r ), &P1 );
p = _mm_add_sd( _mm_mul_sd( _mm_mul_sd( p, r ), r), r );
xDouble S = _mm_add_sdm( _mm_load_sd( &exptable[ 2 * j ] ), &exptable[ 2 * j +1] );
xDouble t = _mm_add_sdm( _mm_mul_sd( S, p ), &exptable[ 2 * j +1] );
t = _mm_add_sdm( t, &exptable[ 2 * j ] );
X = xscalb(t, M);
oldMXCSR |= _mm_getcsr() & ALL_FLAGS;
goto exit;
}
}
xDouble yIsOddInt = _mm_andnot_pd( yIsEven, yIsInt );
xDouble resultSign = _mm_and_pd( X, minusZeroD );
if( _mm_istrue_sd( xEQzero ) )
{
int div_zero_flag = _mm_cvtsi128_si32( (xSInt64) yLTzero ) & DIVIDE_0_FLAG;
X = _mm_and_pd( _mm_load_sd( &plusinf), yLTzero );
resultSign = _mm_and_pd( yIsOddInt, resultSign );
X = _mm_or_pd( X, resultSign );
oldMXCSR |= div_zero_flag;
goto exit;
}
if( _mm_istrue_sd( _mm_cmpeq_sdm( fabsX, &plusinf ) ) )
{
X = _mm_andnot_pd( yLTzero, _mm_load_sd( &plusinf) );
resultSign = _mm_and_pd( yIsOddInt, resultSign );
X = _mm_or_pd( X, resultSign );
goto exit;
}
if( _mm_istrue_sd( fabsyIsInf ) )
{
xDouble fabsxGTone = _mm_cmpgt_sdm( fabsX, &one );
xDouble isInf = _mm_xor_pd( yLTzero, fabsxGTone );
xDouble isOne = _mm_cmpeq_sdm( X, &mOne ); X = _mm_and_pd( isInf, _mm_load_sd( &plusinf ) );
X = _mm_sel_pd( X, _mm_load_sd( &one ), isOne );
X = _mm_xor_pd( X, signPatch );
goto exit;
}
xDouble fabsxLTone = _mm_cmplt_sdm( fabsX, &one );
xDouble underflow = _mm_xor_pd( fabsxLTone, yLTzero );
xDouble overflow = _mm_andnot_pd( underflow, xLTinf ); xDouble fabsyLTone = _mm_cmplt_sdm( fabsY, &one );
xDouble fabsxEQone = _mm_cmpeq_sdm( fabsX, &one );
underflow = _mm_andnot_pd( fabsyLTone, underflow );
overflow = _mm_andnot_pd( fabsyLTone, overflow );
int overflowMask = _mm_cvtsi128_si32( (xUInt32) overflow ) & OVERFLOW_FLAG;
int underflowMask = _mm_cvtsi128_si32( (xUInt32) underflow ) & UNDERFLOW_FLAG;
int inexactMask = (~ _mm_cvtsi128_si32( (xUInt32) fabsxEQone )) & INEXACT_FLAG;
X = _mm_and_pd( overflow, _mm_load_sd( &plusinf ) );
X = _mm_sel_pd( X, _mm_load_sd( &one), fabsyLTone );
oldMXCSR |= overflowMask | underflowMask | inexactMask;
_mm_setcsr( oldMXCSR );
return X;
exit:
_mm_setcsr( oldMXCSR );
return _mm_xor_pd( X, signPatch );
}
double pow( double x, double y )
{
xDouble xd = DOUBLE_2_XDOUBLE( x );
xDouble yd = DOUBLE_2_XDOUBLE( y );
xd = _pow( xd, yd );
return XDOUBLE_2_DOUBLE( xd );
}
float powf( float x, float y )
{
xDouble xd = FLOAT_2_XDOUBLE( x );
xDouble yd = FLOAT_2_XDOUBLE( y );
xd = _pow( xd, yd );
return XDOUBLE_2_FLOAT( xd );
}
#include <math.h>
#include <float.h>
long double powl( long double x, long double y )
{
static const double neg_epsilon = 0x1.0p63;
if( x == 1.0 )
return x;
if( y == 0.0 )
return 1.0L;
if( x != x || y != y )
return x + y;
long double fabsy = __builtin_fabsl( y );
long double fabsx = __builtin_fabsl( x );
long double infinity = __builtin_infl();
long double iy = nearbyintl( fabsy ); if( iy > fabsy ) iy -= 1.0L;
int isOddInt = 0;
if( fabsy == iy && fabsy != infinity && iy < neg_epsilon )
isOddInt = iy - 2.0L * nearbyintl( 0.5L * iy );
if( x == 0.0 )
{
if( ! isOddInt )
x = 0.0L;
if( y < 0 )
x = 1.0L/ x;
return x;
}
if( fabsx == infinity )
{
if( x < 0 )
{
if( isOddInt )
{
if( y < 0 )
return -0.0;
else
return -infinity;
}
else
{
if( y < 0 )
return 0.0;
else
return infinity;
}
}
if( y < 0 )
return 0;
return infinity;
}
if( fabsy == infinity )
{
if( x == -1 )
return 1;
if( y < 0 )
{
if( fabsx < 1 )
return infinity;
return 0;
}
if( fabsx < 1 )
return 0;
return infinity;
}
if( x < 0 && iy != fabsy )
return nanl("37") + sqrtl( -1 );
if( fabsy == 0.5 )
{
x = sqrtl( x );
if( y < 0 )
x = 1.0L/ x;
return x;
}
long double fy = fabsy - iy;
long double fx = 1.0;
long double ix = 1.0;
if( fy != 0 ) {
fx =log2l(x);
long double fabsfx = __builtin_fabsl( fx );
long double min = fminl( fy, fabsfx );
long double max = fmaxl( fy, fabsfx );
if( y < 0 )
fy = -fy;
if( min < 0x1.0p-8191L && max < neg_epsilon ) {
fx = 1; }
else
{ fx *= fy;
fx = exp2l( fx );
}
}
if( y < 0 && iy != 0 )
x = 1.0L/ x;
while( iy != 0.0L )
{
long double ylo;
if( x == 0.0 || x == infinity )
{
ix *= x; break;
}
if( iy > 0x1.0p30 )
{
long double scaled = iy * 0x1.0p-30L;
long double yhi = nearbyintl( scaled );
if( yhi > scaled )
yhi -= 1.0;
ylo = iy - 0x1.0p30L * yhi;
iy = yhi;
}
else
{ ylo = iy;
iy = 0;
}
int j;
int i = ylo;
int mask = 1;
if( i & 1 )
{
ix *= x;
i -= mask;
}
for( j = 0; j < 30 && i != 0; j++ )
{
mask += mask;
x *= x;
if( i & mask )
{
ix *= x;
i -= mask;
}
}
if( 0.0 != iy )
for( ; j < 30; j++ )
x *= x;
}
x = fx * ix;
return x;
}
#pragma mark -
#pragma mark cbrt
static inline double _cbrt( double _x ) ALWAYS_INLINE;
double _cbrt( double _x )
{
static const double infinity = __builtin_inf();
static const double smallestNormal = 0x1.0p-1022;
static const double twom968 = 0x1.0p-968;
static const xUInt64 oneThird = { 0x55555556ULL, 0 };
static const xUInt32 denormBias = { 0, 696219795U, 0, 0 };
static const xUInt32 normalBias = { 0, 0x1200000U, 0, 0 };
static const double C = 5.42857142857142815906e-01;
static const double D = -7.05306122448979611050e-01;
static const double E = 1.41428571428571436819e+00;
static const double F = 1.60714285714285720630e+00;
static const double G = 3.57142857142857150787e-01;
static const xDouble expMask = { __builtin_inf(), 0 };
static const xUInt64 topMask = { 0xFFFFFFFF00000000ULL, 0 };
static const double twom20 = 0x1.0p-20;
xDouble x = DOUBLE_2_XDOUBLE( _x );
if( EXPECT_FALSE( _mm_istrue_sd( _mm_cmpunord_sd( x, x ) ) ) )
return _x + _x;
xDouble sign = _mm_and_pd( x, minusZeroD ); x = _mm_andnot_pd( minusZeroD, x );
if( EXPECT_FALSE( _mm_istrue_sd( _mm_cmpeq_sdm( x, &infinity)) ) )
return _x;
if( EXPECT_TRUE( _x != 0.0 ) )
{
xDouble isDenorm = _mm_cmplt_sdm( x, &smallestNormal );
xDouble t = _mm_and_pd( _mm_load_sd( &twom968 ), isDenorm ); t = _mm_sub_sd( _mm_or_pd( t, x ), t );
xSInt32 addend = _mm_add_epi32( _mm_andnot_si128( (xSInt32) isDenorm, normalBias), denormBias );
t = (xDouble) _mm_mul_epu32( _mm_srli_epi64( (xSInt64) t, 32 ), oneThird );
t = (xDouble) _mm_add_epi32( (xSInt32) t, addend );
xDouble r = _mm_div_sd( _mm_mul_sd( t, t ), x ); xDouble s = _mm_add_sdm( _mm_mul_sd( r, t ), &C ); xDouble y = _mm_add_sdm( s, &E );
xDouble z = _mm_div_sd( _mm_load_sd( &D ), s );
y = _mm_add_sd( y, z );
y = _mm_div_sd( _mm_load_sd( &F ), y );
y = _mm_add_sdm( y, &G );
t = _mm_mul_sd( t, y );
xDouble add = _mm_mul_sdm( t, &twom20 );
add = _mm_and_pd( add, expMask );
t = _mm_add_sd( t, add );
t = _mm_and_pd( t, (xDouble) topMask );
s = _mm_mul_sd( t, t ); r = _mm_div_sd( x, s );
xDouble w = _mm_add_sd( t, t );
w = _mm_add_sd( w, r );
r = _mm_sub_sd( r, t );
r = _mm_div_sd( r, w );
t = _mm_add_sd( t, _mm_mul_sd( t, r ) );
t = _mm_or_pd( t, sign );
return XDOUBLE_2_DOUBLE( t );
}
return _x; }
double cbrt( double x )
{
return _cbrt( x );
}
float cbrtf( float x )
{
return _cbrt(x);
}
#endif