#include "xmmLibm_prefix.h"
#include <math.h>
static inline long double extended_accum( long double *A, long double B ) ALWAYS_INLINE;
static inline long double extended_accum( long double *A, long double B )
{
long double r = *A + B;
long double carry = B - ( r - *A );
*A = r;
return carry;
}
double fma( double a, double b, double c )
{
static const xUInt64 mask26 = { 0xFFFFFFFFFC000000ULL, 0 };
double ahi, bhi;
xDouble xa = DOUBLE_2_XDOUBLE(a);
xDouble xb = DOUBLE_2_XDOUBLE(b);
xa = _mm_and_pd( xa, (xDouble) mask26 );
xb = _mm_and_pd( xb, (xDouble) mask26 );
ahi = XDOUBLE_2_DOUBLE( xa );
bhi = XDOUBLE_2_DOUBLE( xb );
long double Ahi = ahi;
long double Bhi = bhi;
long double Alo = (long double) a - Ahi;
long double Blo = (long double) b - Bhi;
long double C = c;
long double AhiBhi = Ahi * Bhi; long double AloBhi = Alo * Bhi; long double AhiBlo = Ahi * Blo; long double AloBlo = Alo * Blo;
long double a0 = AloBlo;
long double a1 = extended_accum( &a0, AhiBlo );
a1 += extended_accum( &a0, AloBhi );
a1 += extended_accum( &a0, AhiBhi );
if( fabsl( C ) > fabsl( a0 ) )
{
long double temp = C;
C = a0;
a0 = temp;
}
a1 += extended_accum( &a0, C );
a1 = extended_accum( &a0, a1 );
return a0;
}
float fmaf( float a, float b, float c )
{
xDouble xa = FLOAT_2_XDOUBLE( a );
xDouble xb = FLOAT_2_XDOUBLE( b );
xDouble xc = FLOAT_2_XDOUBLE( c );
xa = _mm_mul_sd( xa, xb ); xa = _mm_add_sd( xa, xc );
return XDOUBLE_2_FLOAT( xa ); }
#if 0 //doesn't work yet
long double fmal( long double a, long double b, long double c )
{
union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; }ua = {a};
union{ long double ld; struct{ uint64_t mantissa; int16_t sexp; }parts; }ub = {b};
int16_t sign = (ua.parts.sexp ^ ub.parts.sexp) & 0x8000;
int32_t aexp = (ua.parts.sexp & 0x7fff);
int32_t bexp = (ub.parts.sexp & 0x7fff);
int32_t exp = aexp + (ub.parts.sexp & 0x7fff) - 16383;
if( a != a ) return a;
if( b != b ) return b;
if( c != c ) return c;
if( exp > 16384 ) {
if( __builtin_fabs(c) == __builtin_infl() )
{
if( sign & 0x8000 )
{
if( c > 0 )
return __builtin_nan();
return c;
}
if( c < 0 )
return __builtin_nan();
return c;
}
if( sign & 0x8000 )
return -__builtin_infl();
return __builtin_infl();
}
if( exp < -16382 - 63 ) return c;
uint32_t ahi = ua.parts.mantissa >> 32;
uint32_t bhi = ub.parts.mantissa >> 32;
uint32_t alo = ua.parts.mantissa & 0xFFFFFFFFULL;
uint32_t blo = ub.parts.mantissa & 0xFFFFFFFFULL;
uint64_t templl, temphl, templh, temphh;
xUInt64 l, h, a;
templl = (uint64_t) alo * (uint64_t) blo;
temphl = (uint64_t) ahi * (uint64_t) blo;
templh = (uint64_t) alo * (uint64_t) bhi;
temphh = (uint64_t) ahi * (uint64_t) bhi;
l = _mm_cvtsi32_si128( (int32_t) templl ); templl >>= 32;
temphl += templl;
temphl += templh & 0xFFFFFFFFULL;
h = _mm_cvtsi32_si128( (int32_t) temphl); temphl >>= 32;
a = _mm_unpacklo_epi32( l, h );
temphh += templh >> 32;
temphh += temphl;
a = _mm_cvtsi32_si128( (int32_t) temphh ); temphh >>= 32;
h = _mm_cvtsi32_si128( (int32_t) temphh);
a = _mm_unpacklo_epi32( a, h );
l = _mm_unpacklo_epi64( l, a );
union
{
xUInt64 v;
uint64_t u[2];
}z = { l };
long double lo = (long double) temphh.
}
#endif