w_cabsf.c   [plain text]


/*
 * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */

/* Modified by Stephen Canon on Sept 18, 2006 to do computation in double
 * precision, without calling cabs(double complex). This works because we
 * can't get undue underflow or overflow, but we do need to check for ±°.
 *
 * Also tried out the following SSE3 code:
 *
 *     xDouble xy = _mm_cvtps_pd(_mm_load_ps(&z));
 *     xy = _mm_andnot_pd(minusZeroD,xy);
 *     if ( EXPECT_FALSE( _mm_movemask_pd ( _mm_cmpeq_pd( xy, infinityD ) ) ) )
 *       return __builtin_inf();
 *	   xy = _mm_mul_pd(xy,xy);
 *     xy = _mm_hadd_pd(xy,xy);
 *     xy = __builtin_ia32_sqrtsd(xy);
 *     return XDOUBLE_2_FLOAT(xy);
 *
 * but the x87 code is about twice as fast, and doesn't need SSE3.
 * (and doesn't depend on stack alignment magic)
 ************************************************************************
 *
 * the old code for this function:
 * float cabsf( float complex z ) { return (float)cabs((double complex)z); }
 *
 ************************************************************************/

float cabsf( float _Complex z ) {
	static const double inf = __builtin_inf();
	double x = __builtin_fabs(__real__ z);
	double y = __builtin_fabs(__imag__ z);
	if ( x == inf || y == inf )
		return (float)inf;
	return (float)__builtin_sqrt(x*x + y*y);
}