xmm_arcsincostan.c [plain text]

/*
 *  xmm_arcsincos.c
 *  xmmLibm
 *
 *  Created by Ian Ollmann, Ph.D. on 8/2/05.
 *  Copyright © 2005 Apple Computer, Inc. All rights reserved.
 *
 *  Based on code by Jim Thomas for 68881/68882 Macintoshes
 *  underlying arc tangent follows algorithm by Ali Sazegari, Jan 1991
 *          which in turn was based on SANE pack 5 for 68881/68882
 *          by Ali Sazegari and pascal by K Hanson and C McMaster
 */


#include "xmmLibm_prefix.h"
#include <math.h>

static inline xDouble _fatn( xDouble ) ALWAYS_INLINE;
static inline xDouble _xatan( xDouble ) ALWAYS_INLINE;
static inline xDouble _xasin( xDouble ) ALWAYS_INLINE;
static inline xDouble _xacos( xDouble ) ALWAYS_INLINE;

static const union
{
    uint64_t    u;
    double      d;
}inverseTrigNaN = { 0x7ff8000000000022ULL };


#pragma mark -

static const double q0 =  0x1.880B38AA99E19p+5; //49.0054791763506685243
static const double q1 =  0x1.9B497D4358B3Ep+6; //102.821766903199176113
static const double q2 =  0x1.1C6FDB0AE1716p+6; //71.1092340183352860095
static const double q3 =  0x1.1EF299B1679BEp+4; //17.9342286043045007204
static const double q4 =  0x1.31A63D5C55F4Cp+0; //1.19394286636630209131

static const double p0 =                   0.0; //                0
static const double p1 =  0x1.055CD071BBEBBp+4; //16.3351597254502216572
static const double p2 =  0x1.8790B27BF4749p+4; //24.47282646579632015
static const double p3 =  0x1.4476D8263F242p+3; //10.1395073649410996097
static const double p4 =  0x1.0000000000000p+0; //                1

static inline xDouble _fatn( xDouble x )
{
    x = _mm_mul_sd( x, x );         //x = x*x
    
    //num   = ( x * ( p[1].dbl + x * ( p[2].dbl + x * ( p[3].dbl + x ) ) ) );
    //denum = ( q[0].dbl + x * ( q[1].dbl + x * ( q[2].dbl + x * ( q[3].dbl + q[4].dbl * x ) ) ) );
    xDouble denom = _mm_mul_sdm( x, &q4);       xDouble num = _mm_add_sdm( x, &p3 );
    denom = _mm_add_sdm( denom, &q3 );          num = _mm_mul_sd( num, x );
    denom = _mm_mul_sd( denom, x );             num = _mm_add_sdm( num, &p2 );
    denom = _mm_add_sdm( denom, &q2 );          num = _mm_mul_sd( num, x );
    denom = _mm_mul_sd( denom, x );             num = _mm_add_sdm( num, &p1 );
    denom = _mm_add_sdm( denom, &q1 );          num = _mm_mul_sd( num, x );
    denom = _mm_mul_sd( denom, x );             
    denom = _mm_add_sdm( denom, &q0 );             

    return _mm_div_sd( num, denom );
}


static const double FPKX2       =  0x1.279A74590331Cp-1; //0.577350269189625731059
static const double FPKX2FX2    =  0x1.B8550D62BFB6Dp-5; //0.0537514935913268945833
static const double FPKATNCONS  =  0x1.126145E9ECD56p-2; //0.267949192431122695801
static const double FPKPI2      =  0x1.921FB54442D18p+0; //1.570796326794896558
static const double one = 1.0;


static inline xDouble _xatan( xDouble x )
{
    //get input NaNs out of the way
    if( _mm_istrue_sd( _mm_cmpunord_sd( x, x ) ) )
        return _mm_add_sd( x, x );

//      About the argument reduction:  arctg(x) is evaluated for 0 <= x <= 1    //
//      with either a Short or a Long formula depending on the value of x.      //
//                                                                              //
//      Recall the identities:                                                  //
//                                                                              //
//      arctg(-x)  = -arctg(x)                                                  //
//      arctg(1/x) =  pi/2 - arctg(x)                                           //
//                                                                              //
//      If x < 0 then arctg(-x) is computed, and the result negated.            //
//      If |x| > 1 then arctg(1/|x|) is computed and the result subtracted      //
//      from π/2.                                                               //
//                                                                              //
//      To compute atan of reduced x use Short or Long formula:                 //
//                                                                              //
//      if ( x <= ATnCons =~ 0.267 ) then:                                      //
//            arctg(x) = x - x * P( x * x ) / Q( x * x ),                       //
//      else  arctg(x) = x - ( A + ( B * P( B * B ) / Q( B * B ) + x2fx2 ) ).   //
//                                                                              //
//      where      x2 and x2fx2 are constants, about 0.5 and 0.05 and           //
//            A = ( x - x2 ) / ( 1 + ( 1 / ( x * x2 ) ) ),                      //
//            B = ( x - x2 ) / ( 1 + ( x * x2 ) ).                              //

    
    //Keep track of the original sign, so we can set it back later
    xDouble sign = _mm_and_pd( minusZeroD, x );
    xDouble y = _mm_andnot_pd( minusZeroD, x );    //set x = fabs(x)
    int isLarge = _mm_isfalse_sd( _mm_cmple_sdm( y, &one ) ); 
    
    if( isLarge )    // ! y <= 1.0
        y = _mm_div_sd( _mm_load_sd( &one ), y );


    //we can now be sure that  0 <= y <= 1
    if( _mm_istrue_sd( _mm_cmple_sdm( y, &FPKATNCONS ) ) ) // y <= 0.26794919243
    {
		static const double small = 0x1.0p-63;
		static const double scale = 0x1.0p55;
		static const double mscale = 0x1.0p-55;
		static const double tiny = 0x1.0000000000000p-1022;
        if( _mm_istrue_sd( _mm_cmplt_sdm( y, (double*) &small ) ) )
		{
			if( _mm_istrue_sd( _mm_cmpne_sdm( y, (double*) &minusZeroD ) ) )
				y = _mm_mul_sdm( _mm_add_sdm( _mm_mul_sdm( y, &scale ), &tiny ), &mscale );	//set inexact
		}
		else
            y = _mm_sub_sd( y, _mm_mul_sd( y, _fatn(y) ) );
    }
    else
    {
        xDouble t = _mm_mul_sdm( y, &FPKX2 );               //y * FPKX2
        xDouble a = _mm_sub_sdm( y, &FPKX2 );               //y - FPKX2
        xDouble rt = _mm_div_sd( _mm_load_sd( &one ), t );  //1/t
        xDouble b = _mm_add_sdm( t, &one );                 //t + 1
        xDouble z = _mm_div_sd( a, b );                     //(y - FPKX2) / (t + 1)
        xDouble c = _mm_add_sdm( rt, &one );                //1/t + 1
        c = _mm_div_sd( a, c );                             //(y - FPKX2) / (1/t + 1 )
        xDouble d = _fatn(z);
        xDouble e = _mm_add_sdm( _mm_mul_sd( z, d ), &FPKX2FX2 ); 
        c = _mm_add_sd( c, e );
        y = _mm_sub_sd( y, c );
    }
    
    //correct for large inputs
    if( isLarge )
        y = _mm_sub_sd( _mm_load_sd( &FPKPI2), y );

    //restore the sign
    y = _mm_xor_pd( y, sign );

    return y;
}


static const double half = 0.5;
static const double Emin = 0x1.0p-1022;
static const double plusinf = 1e500;


static inline xDouble _xasin( xDouble x )
{
	if( _mm_istrue_sd( _mm_cmpunord_sd( x, x ) ) )
		return _mm_add_sd( x, x );

    xDouble y = _mm_andnot_pd( minusZeroD, x );
    xDouble oneD = _mm_load_sd( &one );
	
	if( _mm_istrue_sd( _mm_cmpge_sd( y, oneD ) ) )
	{
        if( _mm_istrue_sd( _mm_cmpgt_sd( y, oneD ) ) )
        {
            //deal with overflow gracefully
            y = _mm_or_pd( y, minusZeroD );
            y = _MM_SQRT_SD( y );
        }
        else
        {
            static const double pi_over_2_hi = 0x1.921fb54442d18p+0;  //pi/2 to 53 bits precision
            static const double pi_over_2_lo = 0x8.d4p-57;            //some more bits of pi/2
            
            //Get the correctly rounded pi/2 result for the current rounding mode and set inexact
            y = REQUIRED_ADD_sdm( _mm_load_sd(&pi_over_2_hi), &pi_over_2_lo );

            //Copy in the sign of x
            y = _mm_or_pd( _mm_and_pd( minusZeroD, x ), y );
        }
	}
	else
	{
		if( _mm_istrue_sd( _mm_cmple_sdm( y, &half ) ) )
		{
			static const double small = 0x1.0p-126;
			y = _mm_max_sdm( y, &small );	//avoid spurious underflow
			y = _mm_sub_sd( oneD, _mm_mul_sd( y, y )  );
		}
		else
		{
			y = _mm_sub_sd( oneD, y );
			y = _mm_sub_sd( _mm_add_sd( y, y ), _mm_mul_sd( y, y ) );
		}

		y = _MM_SQRT_SD( y );
		y = _mm_div_sd( x, y );
		y = _xatan( y );
	}
	
    //if y is NaN but x is real
    if( _mm_istrue_sd( _mm_and_pd( _mm_cmpord_sd( x, x ), _mm_cmpunord_sd( y, y ) ) ) )
        y = _mm_load_sd( &inverseTrigNaN.d );
    
    return y;
}




static inline xDouble _xacos( xDouble x )
{
	static const double three = 3.0;
    //get old fp env, and set the default one
        
    xDouble y = x;
    xDouble xone = _mm_load_sd( &one );

	if( _mm_istrue_sd( _mm_cmpeq_sd( xone, _mm_andnot_pd( minusZeroD, y ) ) ) )
	{	//Handle 1 and -1
            
        if( _mm_istrue_sd( _mm_cmpeq_sd( xone, y ) ) )
            y = _mm_xor_pd( y, y );                                                             //y = 0
        else
        {
            static const double pi_hi = 0x1.921fb54442d18p+1;  //pi to 53 bits precision
            static const double pi_lo = 0x8.d4p-56;            //some more bits of pi
            y = REQUIRED_ADD_sdm( _mm_load_sd(&pi_hi), &pi_lo );      //set inexact and return correcty rounded pi
        }
	}
	else
	{	
		xDouble safeY = _mm_andnot_pd( _mm_cmpunord_sd( y, y ), y );
		safeY = _mm_andnot_pd( minusZeroD, safeY );
		y = _mm_sel_pd( y, _mm_load_sd( &three), _mm_cmpgt_sd( safeY, xone ) ); //slam invalid inputs to a value that will calculate exactly until we reach NaN
		y = _mm_div_sd( _mm_sub_sd( xone, y ), _mm_add_sd( xone, y ) );
		y = _MM_SQRT_SD( y );
		y = _xatan( y );
		y = _mm_add_sd( y, y ); 
	}

    //if y is NaN but x is real
    if( _mm_istrue_sd( _mm_and_pd( _mm_cmpord_sd( x, x ), _mm_cmpunord_sd( y, y ) ) ) )
    {
        y = _mm_load_sd( &inverseTrigNaN.d );
    }
    
    return y;
}

#pragma mark -

double asin(double x)
{
    xDouble xd = DOUBLE_2_XDOUBLE( x );
    xd = _xasin( xd );
    return XDOUBLE_2_DOUBLE( xd );
}

double acos(double x)
{
    xDouble xd = DOUBLE_2_XDOUBLE( x );
    xd = _xacos( xd );
    return XDOUBLE_2_DOUBLE( xd );
}

double atan(double x)
{
    xDouble xd = DOUBLE_2_XDOUBLE( x );
    xd = _xatan( xd );
    return XDOUBLE_2_DOUBLE( xd );
}

#warning ******* CHEESY HACK TO GET LIBM BUILDING *********
double atan2( double x, double y )
{
	return atan2l( (long double) x, (long double) y );
}