log_universal.h [plain text]

// Note that while this file has a .s extension it is not intended to be assmbled directly
// but rather included with appropriate #defines in place.
	
// This begins the base-universal logarithm code. 
#ifndef BASE2	
#define BASE2			0
#endif
#ifndef BASEE	
#define BASEE			0	
#endif
#ifndef BASE10
#define BASE10			0
#endif
#ifndef LOGUP
#define LOGUP			0
#endif

// Assume we need to respect the flags unless directed otherwise	
#ifndef FENVON
#define FENVON 			1
#endif
	
#if BASE2
// double log2asm( double x )
ENTRY( log2 )
#elif BASEE
#if LOGUP	
// private double logup( double x ) // log(x+ULP(x)/2)
ENTRY( _logup )
#else	
// double log( double x )
ENTRY( log )
#endif	
#elif BASE10
// double log10( double x )
ENTRY( log10 )
#else
#error "One of BASE2, BASEE, BASE10 needs to be set to 1"
#endif
	// Early out if x is NaN, Inf, 0, or <0
	// I.e., if the encoding of x is not in (0, 7ff0000000000000)
#if defined( __x86_64__ )
	movd		%xmm0,				%rax
	movd		%xmm0,				%rdx
	subq		REL_ADDR(small_cut),		%rax
	cmpq		REL_ADDR(large_cut),		%rax
#else
	movl		4+FRAME_SIZE(STACKP),		%eax
	movl		4+FRAME_SIZE(STACKP),		%edx
	movsd		FRAME_SIZE(STACKP),		%xmm0
	call		0f				// PIC code
0:	pop		%ecx				// PIC base
	sub		$0x00100000,			%eax
	cmp		$0x7fe00000,			%eax
#endif
	jae		3f	// Jump if NaN, Inf, Denorm, 0, or negative

	SUBP		$LOCAL_STACK_SIZE, 		STACKP

#if LOGUP	
	// Need to adjust h with a small correction for log1p(x) when 1+x is not representable
        //u = (x < 1.0) ? -0x1p-53 : 0x1p-54 = ...? 0xbca0000000000000 : 0x3c90000000000000; 
            // Add in the half ulp now that we know we have room.
        //h = (f - a) + u; // h = R(f-a)+ulp(x)
	movapd		%xmm0,			%xmm6		// x
	cmpltsd		REL_ADDR(one),		%xmm6		// (x < 1.0)
	movsd		REL_ADDR(logup_ulp_mask),	%xmm5	// xor(-0x1p-53, 0x1p-54) = 0x8030000000000000
	andpd		%xmm5,			%xmm6		// m = (x < 1.0) ? 0x8030000000000000 : 0
	movsd		REL_ADDR(_1pm54),	%xmm5		// 0x1p-54
	xorpd		%xmm5,			%xmm6		// m ^ 0x1p-54 = (x < 1.0) ? 0x8030000000000000 ^ 0x1p-54 : 0x1p-54
	movsd		%xmm6,			24(STACKP)	// (x < 1.0) ? -0x1p-53 : 0x1p-54
#endif	

	// Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f)
#if defined( __x86_64__ )
	movd		%xmm0,				%r8	// u.u = encoding of x
        shrq		$(52),				%rdx	// exponent >>= 52
	andq		REL_ADDR(frexp_mant_mask),	%r8	// mantissa = u.u & 0x800fffffffffffffULL;

	movd		%xmm0,				%rcx	// encoding of xadjust
	
	// or in -1 + bias to the new exponent for the mantissa

	addq		$-1023,				%rdx	// exponent + bias (= -1023 for normal data)
1:	//This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa
	//Note: for denormal xmm0 and rcx inputs are no longer the original x
	
	shrq		$51,				%rcx	// (encoding) >> 51
        andq		$0x1,				%rcx	// upperhalf = (encoding & 0x00080000ULL) >> 51 = (upperhalf != 0ULL);
	addq		%rcx,				%rdx	// exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL);
	movl		%edx,				8(STACKP) // Save exp for later. Free dx and cx.

	//Note: for denormal inputs this is no longer the original x
	movd		%xmm0,				%rax	// u.u
	// grab before adjusting for which half we are in
        andq		REL_ADDR(frexp_head_mask),	%rax	// head = u.u & 0x000ff00000000000ULL;
        shrq		$((52-8)-5),			%rax	// k<<5 = (int)(head>>(52-8))<<5;
	movd		%xmm0,				%rcx	// u.u = encoding of xadjust
        andq		REL_ADDR(frexp_half_mask),	%rcx	// upperhalf = u.u & 0x0008000000000000ULL;
	shlq		$1,				%rcx	// (upperhalf << 1)
	orq		REL_ADDR(one),			%r8	// (mantissa | 0x3ff0000000000000ULL)
	// Divide by two if significand >= 3/2
	xorq		%rcx,				%r8	// mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1);
	movd		%r8,				%xmm1	// fp1
	// Now:
	// k<<5 = %eax
	// exp = %rdx
	// fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1 - 1.0) : (fp1 - 1.0);
#else
	movapd		%xmm0,				%xmm1	// u.u = encoding of x
	andpd		REL_ADDR(frexp_mant_mask),	%xmm1	// mantissa = u.u & 0x800fffffffffffffULL;

	movapd		%xmm0,				%xmm2	// u.u = encoding of adjusted x
	psrlq		$32,				%xmm0	// top half of encoding of adjusted x

	// or in -1 + bias to the new exponent for the mantissa
        andpd		REL_ADDR(frexp_half_mask),		%xmm2	// upperhalf = u.u & 0x0008000000000000ULL;
        shrl		$(52-32),				%edx	// exponent >>= 52-32
	
	addl		$-1023,				%edx	// exponent + bias; bias = -1023 for normal data
	
1:	//This is where denormal inputs re-enter with bias compensated for denormal exponent and appropriate mantissa

	movapd		%xmm2,				%xmm7	// upperhalf
	psrlq		$51,				%xmm7	// (upperhalf != 0ULL) = upperhalf >> 51 == 1 or 0
	movd		%xmm7,				8(STACKP)	// (int) ( exponent >> 52 )
	add		8(STACKP),			%edx		// exp = bias + (int) ( exponent >> 52 ) + (upperhalf != 0ULL);
	movl		%edx,				8(STACKP)	// exp
	movd		%xmm0,				%eax	// Encoding of top half of x 
	// grab before adjusting for which half we are in
        andl		$0x000ff000,			%eax	// head = u.u & 0x000ff00000000000ULL;
        shrl		$(12-5),			%eax	// k<<5 = (int)(head>>(52-8-32))<<5;
	psllq		$1,				%xmm2	// (upperhalf << 1)
	orpd		REL_ADDR(xone),			%xmm1	// (mantissa | 0x3ff0000000000000ULL)
	// Divide by two if significand >= 3/2
	xorpd		%xmm2,				%xmm1	// mantissa = (mantissa | 0x3ff0000000000000ULL) ^ (upperhalf << 1);

	// Now:
	// k<<5 = %eax
	// exp = 8(STACKP)
	// fp1 = %xmm1 = (fp1 > 1.5) ? (0.5 * fp1) : (fp1);
#endif
	
// We look up a+1 rather than a, so we can work with fp1 - ap1 rather than f - a
	
#if BASE2	
	// if fp1 == 1.0 return nd // base 2 only
	movsd		REL_ADDR(one),			%xmm2	// 1.0
	ucomisd		%xmm2,				%xmm1	// f == 1.0 set cc
	je		7f
#endif 
	
	movsd		%xmm1,	(STACKP)

// The lookup table is in a funny format since it has 2 long double and a single.
	// {10-byte va ; 2-byte pad ; 4-byte single a ; 10-byte lg1pa ; 2-byte pad, 4-byte a+1}
#if defined( __x86_64__ )
	// in 64-bit need lea to get address
	lea		REL_ADDR(LOOKUP),	%rdx
	fldt		(%rdx,AX_P,1)	// [va]
	fldl		(STACKP)	// [va ; f+1]

	fsubs		28(%rdx,AX_P,1)	// [va ; h = f+1 - a+1]
#if LOGUP	
	//halfulp stuff goes here...
        //h = (f - a) + u; // h = R(f-a)+ulp(x)
	faddl		24(STACKP)			// [va ; h = (f - a) + u]
#endif //LOGUP	
	fmulp		%st,	%st(1)	// [c = h * va]
	fstl		(STACKP) 	// [c] -- with double version stored on the stack
	
	fldt		16(%rdx,AX_P,1)	// [c ; lg1pa ]
#if BASE2
	// logbxhi = nd+lg1pa; // base 2
	fiaddl		8(STACKP) 	// [c ; lghi = nd + lg1pa]
#elif BASEE
	// base e stuff goes here...
	fiaddl		8(STACKP) 	// [c ; lghi = nd + lg1pa]
	fldt		REL_ADDR(ln2l)	// [c ; lghi = nd + lg1pa ; ln2l]
        fmulp		%st, %st(1)	// [ c ; logbxhi = ln2l*(nd+lg1pa)]
#elif BASE10
	// base 10 stuff goes here...
	// free(%rax)
	// free(%rdx)
#if FENVON
	movd		%xmm0,					%rdx	// Original x encoding
	shrq		$(53-4),				%rdx	// 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x
        andq		$0x3f,					%rdx	// key = _mm_and_pd(vx,log10_key_mask);
	movd		%xmm0,					%rax	// Original x encoding
	lea		REL_ADDR(isPowerOf10),			%rcx	// Get base of table
	cmp		(%rcx,%rdx,1),				%rax	// if(isPowerOf10[k][0] == x) {return  isPowerOf10[k][1];}
	je		8f
#endif //FENVON base 10 64-bit
	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
	fldt		REL_ADDR(log102l)		// [c ; lghi = nd + lg1pa ; log102l]
        fmulp		%st, %st(1)			// [ c ; logbxhi = log102l*(nd+lg1pa)]
#endif
	
#else	// i386
	fldt		(LOOKUP-0b)(%ecx,%eax,1)	// [va]
	fldl		(STACKP)			// [va ; f+1]
	//halfulp stuff goes here...
	fsubs		(LOOKUP-0b+28)(%ecx,%eax,1)	// [va ; h = f+1 - a+1]
#if LOGUP	
        //h = (f - a) + u; // h = R(f-a)+ulp(x)
	faddl		24(STACKP)			// [va ; h = (f - a) + u]
#endif //LOGUP	
	fmulp		%st,	%st(1)			// [c = h * va]
	fstl		(STACKP) 			// [c] -- with double version stored on the stack
	fldt		(LOOKUP-0b+16)(%ecx,%eax,1)	// [c ; lg1pa ]
	
#if BASE2
	// logbxhi = nd+lg1pa; // base 2
	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
#elif BASEE
	// base e stuff goes here...
	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
	fldt		REL_ADDR(ln2l)			// [c ; lghi = nd + lg1pa ; ln2l]
        fmulp		%st, %st(1)			// [ c ; logbxhi = ln2l*(nd+lg1pa)]
#elif BASE10
	// base 10 stuff goes here...
	// free(%edx)
#if FENVON
	movl		(4+LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP),	%edx
        andl		$0x07e00000,				%edx	// key = _mm_and_pd(vx,log10_key_mask);
	shrl		$((53-32)-4),				%edx	// 16*k = k << 4. k = key >> 53 = the top 6 of bottom 7 bits of exponent of x
	movsd		(LOCAL_STACK_SIZE)+FRAME_SIZE(STACKP),	%xmm5
	comisd		(isPowerOf10-0b)(%ecx,%edx,1),		%xmm5	// if(isPowerOf10[k][0] == x) {return  isPowerOf10[k][1];}
	je		8f
#endif //FENVON base 10 i386
	fiaddl		8(STACKP) 			// [c ; lghi = nd + lg1pa]
	fldt		REL_ADDR(log102l)		// [c ; lghi = nd + lg1pa ; log102l]
        fmulp		%st, %st(1)			// [ c ; logbxhi = log102l*(nd+lg1pa)]
#endif //base 10 i386
	
#endif // i386

	
#ifdef __SSE3__	
	movddup		(STACKP),	%xmm0	// get cd, double version of c off stack in both registers
#else
	movsd		(STACKP),	%xmm0
	movhpd		(STACKP),	%xmm0	
#endif
	movapd		%xmm0,		%xmm1	// [cd,cd]
	addpd		REL_ADDR(a01),	%xmm0	// [a0,a1] + [cd,cd]
	mulpd		%xmm1,		%xmm0	// [a0+cd,a1+cd] * [cd,cd]
	addpd		REL_ADDR(b01),	%xmm0	// vpoly = [b0,b1] + [(a0+cd)*cd,(a1+cd)*cd]
	
//long double logbec is log_b(e) * c
//double c5b is log_b(e) * c5
	fld		%st(1)	// [c ; lghi ; c]
#if BASE2
	// in base 2 logbec = lgel * c; c5b = c5_2;
	fldt		REL_ADDR(lgel)	// [c ; lghi ; c ; lgel]
	fmulp		%st, %st(1)	// [c ; lghi ; logbec = lgel * c]
#define c5b c5_2	
#elif BASEE
	// in base e logbec = c; c5b = c5_e;
					// [c ; lghi ; logbec = lnel * c = c]
#define c5b c5_e
#elif BASE10
	// in base 10 logbec = log10el * c; c5b = c5_10;
	fldt		REL_ADDR(log10el)	// [c ; lghi ; log10el ; c]
	fmulp		%st, %st(1)	// [c ; lghi ; logbec = log10el * c]
#define c5b c5_10
#endif
	fxch		%st(2)		// [logbec ; lghi ; c]
	fldt		REL_ADDR(c0)	// [logbec ; lghi ; c ; c0]
	fmulp		%st, %st(1)	// [logbec ; lghi ; c*c0]
	fld1				// [logbec ; lghi ; c*c0 ; 1.0]
	faddp		%st, %st(1)	// [logbec ; lghi ; 1.0 + c*c0]
	fmulp		%st, %st(2)	// [linear = logbec*(1.0 + c*c0) ; lghi]
	fxch		%st(1)		// [lghi ; linear]
	
	movapd		%xmm1,		%xmm2	// [cd, cd]
	movhpd		REL_ADDR(c5b),	%xmm1	// [c5b, cd]
	mulpd		%xmm2,		%xmm1	// vccc5 = [c5b*cd, cd*cd]
	mulpd		%xmm1,		%xmm0	// vp = vccc5 * vpoly = [c5b*cd*(b0+(a0+cd)*cd), cd*cd*(b1+(a1+cd)*cd)]
	//Do a horizontal multiply.
	movapd		%xmm0,		%xmm1	// [ *, vplo]
	shufpd		$3,	%xmm0,	%xmm0	// [ *, vphi]
	mulsd		%xmm1,		%xmm0	// [ *, cccp = c5b*cd*(b0+(a0+cd)*cd) * cd*cd*(b1+(a1+cd)*cd)]
	movsd		%xmm0,		(STACKP)	// Store so we can load into x87
	faddl		(STACKP)		// [lghi ; logbxlo = linear + cccp]
	faddp		%st,		%st(1)	// [longResult = lghi + logbxlo]
						// Round to double
	fstpl		(STACKP)		// result [<empty stack>]
#if defined( __x86_64__ )
	movq		(STACKP),	%xmm0	// result
#else
	fldl 		(STACKP)
#endif
#undef c5b

2:	// Clean up stack and return
	ADDP		$LOCAL_STACK_SIZE,	STACKP
	ret

////////////////////////////////////////////////////////////////	
	
	
3:	// Special operand handling: NaN, Inf, Denorm, 0, or negative
	// When we get here AX has as encoding(x) - 0x00100000[00000000 in 64-bit]
	// +denormal -> denormal handling
	// +zero -> -inf #ZERO_DIVIDE
	// -stuff -> sqrt(x) (pass quiet NaNs or raise invalid)
	// -zero -> -inf #ZERO_DIVIDE
	// +NaN -> x+x
	// +inf -> x+x	
	xorpd		%xmm2,			%xmm2	// 0.0
	ucomisd		%xmm2,			%xmm0	// f == 0 set cc
	jp		9f	// NaN:  compares unequal and signals invalid only for signaling NaN
	je		4f	// Zero: x == 0.0 -> -1 / zero

        // We now need to distinguish between Denormal, Negative, and NaN/Inf.
        // Adding the small_cut back in sets the EFLAGS for conditional branches
#if defined( __x86_64__ )
	addq		REL_ADDR(small_cut),	%rax
#else
	addl		$0x00100000,		%eax
#endif
	jb		5f	// Denormal (i.e., Inf > x > 0 and we have ruled out other cases)
	js		6f	// Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal)
	
	// If we fall through to here, we have "+NaN" or +Inf
9:
#if defined( __i386__ ) 	// Light cleanup required
	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
	fldl 		FRAME_SIZE(STACKP)
#endif
	ret

4:	// Zero: x == 0.0 -> -1 / zero
	//We need to return -Inf for zero and set div/0 flag
	xorpd 		%xmm1,			%xmm1
	pcmpeqb 	%xmm0,                  %xmm0              // -1U
	cvtdq2pd 	%xmm0, 			%xmm0              // -1.0f
	divsd   	%xmm1,			%xmm0              // -1.0f / 0 = -Inf + div/0 flag
#if defined( __i386__ ) 	// Light cleanup required
	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
	fldl 		FRAME_SIZE(STACKP)
#endif
	ret
	
5:	// Denormal						     
//Denormal inputs are frexped with extra care and then we branch back into the mainline code.
	
	SUBP		$LOCAL_STACK_SIZE, 		STACKP
	// Break x down into exp, f with -0.25 <= f < 0.5 and x = 2^exp(1+f)
	// knowing that we are starting with denormal x
#if defined( __x86_64__ )
	movd		%xmm0,				%r8	// u.u = encoding of x
	andq		REL_ADDR(frexp_mant_mask),	%r8	// mantissa = u.u & 0x800fffffffffffffULL;
        // or in bias to the new exponent for the mantissa
        orq		REL_ADDR(one),			%r8	// mantissa |= 0x3ff0000000000000ULL;
	movd		%r8,				%xmm0	// mantissa
	subsd		REL_ADDR(one),			%xmm0
	movd		%xmm0,				%rdx	// encoding rescaled input
	movd		%xmm0,				%r8	// encoding rescaled input
        shrq		$(52),				%rdx	// exponent >>= 52
	andq		REL_ADDR(frexp_mant_mask),	%r8	// mantissa = u.u & 0x800fffffffffffffULL;	
	movd		%xmm0,				%rcx	// encoding of xadjust
	// or in -1 + bias to the new exponent for the mantissa
	addq		$(-1023-1022),			%rdx	// exponent + bias (= -1023-1022 for subnormal data)
#else
	movapd		%xmm0,				%xmm1	// u.u = encoding of x
        // or in bias to the new exponent for the mantissa
	movq		REL_ADDR(one),			%xmm2	// 1.0
        orpd		%xmm2,				%xmm0	// mantissa =  _mm_or_pd(x, xOne);
	subsd		%xmm2,				%xmm0	// u = mantissa - xOne;
	movapd		%xmm0,				%xmm1	// u.u = encoding of adjusted x
	movapd		%xmm0,				%xmm7	// u.u = encoding of adjusted x (but we only want the high half)
	psrlq		$32,				%xmm7	// top half of encoding of adjusted x
	movd		%xmm7,				%edx	// top half of encoding of adjusted x
	andpd		REL_ADDR(frexp_mant_mask),	%xmm1	// mantissa = u.u & 0x800fffffffffffffULL;

	movapd		%xmm0,				%xmm2	// u.u = encoding of adjusted x
	psrlq		$32,				%xmm0	// top half of encoding of adjusted x
	
	// or in -1 + bias to the new exponent for the mantissa
        andpd		REL_ADDR(frexp_half_mask),		%xmm2	// upperhalf = u.u & 0x0008000000000000ULL;
        shrl		$(52-32),				%edx	// exponent >>= 52-32
	
	addl		$(-1023-1022),				%edx	// exponent + bias; bias = -1023-1022 for subnormal data
	
#endif
	jmp		1b	// Re-enter main line with bias compensated for denormal exponent and appropriate mantissa


6:	// Negative sign (may be "-NaN", -Inf, -Normal, or -Denormal)	
	// In any case sqrt(x) will do the right thing.
	// -QNaN will pass through
	// -SNaN will raise invalid and be quieted	
	// -N will raise invalid and return a quiet NaN
	sqrtsd		%xmm0,				%xmm0
#if defined( __i386__ ) 	// Light cleanup required
	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
	fldl 		FRAME_SIZE(STACKP)
#endif
	ret
    
#if BASE2	
7:	 //early out for power of 2
	ADDP 		$LOCAL_STACK_SIZE,	STACKP
#if defined( __i386__ )
	movl 		%edx, 			FRAME_SIZE( STACKP )
	fildl 		FRAME_SIZE(STACKP)
#else
	cvtsi2sd 	%rdx, 			%xmm0
#endif
	ret
#endif
	
// This base10 branch only needed if FENVON
#if FENVON	
8: 	// If(isPowerOf10[k][0] == x) {return  isPowerOf10[k][1];}
	// Come in with two things on the x87 stack			// [c ; lg1pa ]
	fstp		%st						// [c ]
	fstp		%st						// [<empty>]
#if defined( __i386__ )
	movsd		(isPowerOf10-0b+8)(%ecx,%edx,1),	%xmm0	// isPowerOf10[k][1]
#else	
	movsd		8(%rcx,%rdx,1),				%xmm0	// isPowerOf10[k][1]
#endif
	// Clean up stack and return
	ADDP		$LOCAL_STACK_SIZE,	STACKP
	
#if defined( __i386__ ) 	// Light cleanup required
	movsd 		%xmm0,                  FRAME_SIZE(STACKP)
	fldl 		FRAME_SIZE(STACKP)
#endif
	ret
#endif
	
////////////////////////////////////////////////////////////////