/* atan2f.s -- atan2f for standard math library. Written by Eric Postpischil, July 2007. */ .literal8 // Define miscellaneous constants. Threshold: .double 2.384185791015625e-07 // 2**-22. nZero: .double -0 pPi1v4: .double +.7853981633974483096156608 // 1/4 pi. nPi1v4: .double -.7853981633974483096156608 pPi1v2: .double +1.570796326794896619231322 // 1/2 pi. nPi1v2: .double -1.570796326794896619231322 pPi3v4: .double +2.356194490192344928846982 // 3/4 pi. nPi3v4: .double -2.356194490192344928846982 /* Define values near +/- pi that yield +/- pi rounded toward zero when converted to single precision. This allows us to generate inexact and return the desired values for atan2f on and near the negative side of the x axis. */ AlmostpPi: .double +3.1415924 // Define a coefficient for center polynomial (used for x in [-1, +1]). C2: .double 0.0029352921857004596570518 .const .align 4 /* Define some coefficients for center polynomial (used for x in [-1, +1]). These are stored in pairs at aligned addresses for use in SIMD instructions. */ C01: .double 2.2971562298314633761676433, 0.0207432003007420961489920 C00: .double 2.4449692297316409651126041, 3.7888879287802702842997915 C11: .double -2.9466967515109826289085300, -4.9721072376211623038916292 C10: .double 5.4728447324456990092824269, 6.7197076223592378022736307 // This needs to be 16-byte aligned because it is used in an orpd instruction. .align 4 pPi: .double +3.141592653589793238462643 // pi. // Rename the general registers (just to make it easier to keep track of them). #if defined __i386__ #define r0 %eax #define r1 %ecx #define r2 %edx #define r3 %ebx #define r4 %esp #define r5 %ebp #define r6 %esi #define r7 %edi #elif defined __x86_64__ #define r0 %rax #define r1 %rcx #define r2 %rdx #define r3 %rbx #define r4 %rsp #define r5 %rbp #define r6 %rsi #define r7 %rdi #else #error "Unknown architecture." #endif .text // Define various symbols. #define BaseP r0 // Base address for position-independent addressing. #define y %xmm0 // Must be in %xmm0 for return on x86_64. #define x %xmm1 #define p0 %xmm2 #define x1 %xmm3 #define t0 %xmm4 #define Base %xmm5 #define p1 %xmm6 #if defined __i386__ // Define locations of arguments. #define Argy 4(%esp) #define Argx 8(%esp) // Define how to address data. BaseP must contain the address of label 0. #define Address(label) label-0b(BaseP) #elif defined __x86_64__ // Define locations of arguments. #define Argx %xmm1 #define Argy %xmm0 // Define how to address data. #define Address(label) label(%rip) #endif /* float atan2f(float x). Notes: This routine has not been proven to be correct. See the notes in the accompanying C version regarding potential proof. The polynomial it uses was proven to provide faithfully rounded results in atanf. atan2f introduces additional error performing the division and additional points used in the domain of the polynomial. Citations in parentheses below indicate the source of a requirement. "C" stands for ISO/IEC 9899:TC2. The Open Group specification (IEEE Std 1003.1, 2004 edition) adds no requirements since it defers to C and requires errno behavior only if we choose to support it by arranging for "math_errhandling & MATH_ERRNO" to be non-zero, which we do not. Return value for atan2f(y, x) (C F.9.1 12 and C F.9.1.4): y x atan2f(y, x) -infinity -infinity -3*pi/4. < 0 -2*pi/4. -0 -2*pi/4. +0 -2*pi/4. > 0 -2*pi/4. +infinity -1*pi/4. < 0 -infinity -4*pi/4. < 0 arctangent(y/x) in [-4*pi/4, -2*pi/4]. -0 -2*pi/4. +0 -2*pi/4. > 0 arctangent(y/x) in [-2*pi/4, -0*pi/4]. +infinity -0*pi/4. -0 -infinity -4*pi/4. < 0 -4*pi/4. -0 -4*pi/4. +0 -0*pi/4. > 0 -0*pi/4. +infinity -0*pi/4. +0 -infinity +4*pi/4. < 0 +4*pi/4. -0 +4*pi/4. +0 +0*pi/4. > 0 +0*pi/4. +infinity +0*pi/4. > 0 -infinity +4*pi/4. < 0 arctangent(y/x) in [+2*pi/4, +4*pi/4]. -0 +2*pi/4. +0 +2*pi/4. > 0 arctangent(y/x) in [+0*pi/4, +2*pi/4]. +infinity +0*pi/4. +infinity -infinity +3*pi/4. < 0 +2*pi/4. -0 +2*pi/4. +0 +2*pi/4. > 0 +2*pi/4. +infinity +1*pi/4. If either input is a NaN, return one of the NaNs in the input. (C F.9 11 and 13). (If the NaN is a signalling NaN, we return the "same" NaN quieted.) Otherwise: If the rounding mode is round-to-nearest, return arctangent(x) faithfully rounded. Returns a value in [-pi, +pi] (C 7.12.4.4 3). Note that this prohibits returning correctly rounded values for -pi and +pi, since pi rounded to a float lies outside that interval. Not implemented: In other rounding modes, return arctangent(x) possibly with slightly worse error, not necessarily honoring the rounding mode (Ali Sazegari narrowing C F.9 10). Exceptions: Raise underflow for a denormal result (C F.9 7 and Draft Standard for Floating-Point Arithmetic P754 Draft 1.2.5 9.5). If the input is the smallest normal, underflow may or may not be raised. This is stricter than the older 754 standard. May or may not raise inexact, even if the result is exact (C F.9 8). Raise invalid if the input is a signalling NaN (C 5.2.4.2.2 3, in spite of C 4.2.1) but not if the input is a quiet NaN (C F.9 11). May not raise exceptions otherwise (C F.9 9). Properties: We desire this routine to be monotonic, but that has not been proven. (For atan2f, monotonicity would mean, if (x0, y0) and (x1, y1) are in the same quadrant, then y0/x0 <= y1/x1 implies atan2f(y0, x0) <= atan2f(y1, x1).) */ .align 5 .globl _atan2f _atan2f: cvtss2sd Argy, y // Convert to double precision. cvtss2sd Argx, x #if defined __i386__ // Get address of 0 in BaseP. call 0f // Push program counter onto stack. 0: pop BaseP // Get program counter. #endif #define nx t0 movsd Address(nZero), nx xorpd x, nx // Negate x. ucomisd x, y jae yGEx // If we jump, y >= x. je Unordered // If we jump, an operand is a NaN. // If we fall through, y < x. ucomisd y, nx jae yLTx_and_nxGEy // If we jump, y < x and -x >= y. // If we fall through, -x < y < x. // Return atand(y/x). divsd x, y // Form y/x. movsd y, x // Move to register used by Polynomial. movsd Address(nZero), Base // Set Base to -0. // This makes the return value -0 if y is -0 and x > 0. jmp Polynomial // Return 0 + arctangent(y/x). yLTx_and_nxGEy: // Here y < x && y <= -x. je yLTx_and_nxEQy // If we jump, y < x && -x == y. // If we fall through, y < x < -y. // Return -pi/2 - atand(x/y). movsd Address(nZero), t0 // Get -0 for sign bit. divsd y, x // Form x/y. xorpd t0, x // Form -x/y. movsd Address(nPi1v2), Base // Set Base to -pi/2. jmp Polynomial // Return -pi/2 + arctangent(-x/y). yLTx_and_nxEQy: // Here -x == y < x. // Return -1/4*pi with inexact exception. cvtsd2ss Address(nPi1v4), y jmp ReturnSingle yGEx: // Here y >= x. je yEQx // If we jump, y == x. // If we fall through, y > x. ucomisd y, nx #undef nx jae yGTx_and_nxGEy // If we jump, y > x && -x >= y. // If we fall through, -y < x < y. // Return +pi/2 - atand(x/y). movsd Address(nZero), t0 // Get -0 for sign bit. divsd y, x // Form x/y. movsd Address(pPi1v2), Base // Set Base to +pi/2. xorpd t0, x // Form -x/y. jmp Polynomial // Return +pi/2 + arctangent(-x/y). yGTx_and_nxGEy: // Here y > x && -x >= y. je yGTx_and_nxEQy // If we jump, y > x && -x == y. // If we fall through, x < y < -x. movsd Address(nZero), Base // Get mask for sign bit. movapd Base, t0 // Copy mask. andpd y, Base // Extract sign bit of y. divsd x, y // Form y/x. andnpd y, t0 // Take absolute value of quotient. comisd Address(Threshold), t0 // Is quotient small? jbe NearNegativeXAxis orpd Address(pPi), Base // Set Base to pi with y's sign. movsd y, x // Move to register used by Polynomial. jmp Polynomial // Return +/- pi + arctangent(y/x). NearNegativeXAxis: // Return -pi or +pi, matching the sign of y, rounded toward zero. movsd Address(AlmostpPi), p0 xorpd Base, p0 jmp ReturnDouble yGTx_and_nxEQy: // Here x < y == -x. // Return +3/4*pi with inexact exception. cvtsd2ss Address(pPi3v4), y jmp ReturnSingle yEQx: // Here y == x. ucomisd Address(nZero), y jae yEQx_and_yGE0 // If we jump, y == x && y >= 0. // If we fall through, x == y < -x. // Return -3/4*pi with inexact exception. cvtsd2ss Address(nPi3v4), y jmp ReturnSingle yEQx_and_yGE0: // Here y == x && y >= 0. je yEQx_and_yEQ0 // If we jump, y == x && y == 0. // If we fall through, -x < y == x. // Return +1/4*pi with inexact exception. cvtsd2ss Address(pPi1v4), y jmp ReturnSingle yEQx_and_yEQ0: // Here y == x == 0. /* Return: x y atan2f(y, x) -0 -0 -pi, with inexact exception. -0 +0 +pi, with inexact exception. +0 -0 -0. +0 +0 +0. */ /* We want to know if x is -0 or +0, but there is no direct test for that that puts the results in a vector register. We do an arithmetic right shift to fill up the exponent bits with copies of the sign bit. This produces a NaN or +0. Then we test for "unordered", which yields all one bits if x was -0 and all zero bits if x was +0. */ psraw $12, x cmpunordsd x, x // Form (almost) pi if x was -0 and 0 if x was +0. movsd Address(AlmostpPi), p0 andpd x, p0 orpd y, p0 // Apply the sign bit from y. jmp ReturnDouble Unordered: // Here x or y is a NaN. addsd x, y // Return one of the NaNs, quieted. cvtsd2ss y, y jmp ReturnSingle /* This is the principal arctangent evaluation. Previous code has prepared the Base and y registers, and we need to calculate Base + arctangent(y). The result is then converted to a single-precision number and returned. -1 <= y <= +1. (Actually, -1 < y < +1. The equalities were sidetracked during all the branching above, and division of two different single-precision numbers converted to double-precision never rounds to one.) There are some slight inefficiencies here, in that special cases could omit a few instructions -- sometimes the base is zero or y had to be negated to fit this common code. So, if speed is all important, this routine might be speeded up a little by replicating this code. */ Polynomial: /* The polynomial that approximates arctangent has been arranged into the form: x * c2 * ((x**4 + c01 * x**2 + c00)) * ((x**4 + c11 * x**2 + c10)) * ((x**4 + c21 * x**2 + c20)) * ((x**4 + c31 * x**2 + c30)) The coefficients are stored in pairs, with c01 and c21 at C01, c00 and c20 at C00, c11 and c31 at C11, and c10 and c30 at C10. c2 is at C2. The quartic factors are evaluated in SIMD registers. For brevity, some comments below describe only one element of a register. The other is analagous. */ movsd x, x1 // Save a copy of x for later. movapd Address(C11), p1 mulsd x, x // Form x**2. #define x2 x // Define name describing current register contents. movlhps x2, x2 // Duplicate x**2. addpd x2, p1 // Form x**2 + c11. mulpd x2, p1 // Form x**4 + c11 * x**2. addpd Address(C10), p1 // Form x**4 + c11 * x**2 + c10. movapd Address(C01), p0 // Get first coefficients. addpd x2, p0 // Form x**2 + c01. mulpd x2, p0 // Form x**4 + c01 * x**2. movhpd Address(C2), x1 // Put c2 in one element, with x in other. addpd Address(C00), p0 // Form x**4 + c01 * x**2 + c00. mulpd p1, p0 // Combine factors. mulpd x1, p0 // Multiply by x and c2. movhlps p0, p1 // Get high element. mulsd p1, p0 // Finish combining factors. #undef x2 addsd Base, p0 // Return the double-precision number currently in p0. ReturnDouble: cvtsd2ss p0, y // Convert result to single precision. // Return the single-precision number currently in p0. ReturnSingle: #if defined __i386__ movss y, Argx // Shuttle result through memory. // This uses the argument area for scratch space, which is allowed. flds Argx // Return input on floating-point stack. #else // On x86_64, the return value is now in y, which is %xmm0. #endif ret