/* truncf.s */

/* Single-precision truncf
 *
 * Reimplemented for improved performance on in-order machines and
 * machines that support SSE 4.1
 *
 * Steve Canon, March 2009.
 */
 
#include <System/i386/cpu_capabilities.h>
.set cpubits, _COMM_PAGE_CPU_CAPABILITIES

#if defined __i386__

.text
.align 4
.globl _truncf
// float truncf(float x)    -- i386 entry point
//
// Rounds x toward zero to an integral value.
//   In:    x in the argument slot at 4(%esp)
//   Out:   result pushed on the x87 stack by flds
//   Uses:  eax, ecx, edx, xmm0, flags; the argument slot at 4(%esp) is
//          overwritten with the result
//
// The capability word at cpubits picks one of two implementations:
// roundss when SSE 4.1 is reported, otherwise integer masking of the
// fraction bits.
_truncf:
	movss		4(%esp), %xmm0			// xmm0 = x
	testl		$(kHasSSE4_1), cpubits	// SSE 4.1 reported?
	jz			Ltruncf_no_sse41

	roundss		$0x3, %xmm0, %xmm0		// SSE 4.1: round toward zero
	movss		%xmm0, 4(%esp)			// bounce through memory so the
	flds		4(%esp)					// result can be returned on x87
	ret

.align 4
Ltruncf_no_sse41:
	movl		4(%esp), %eax			// eax = bit pattern of x
	movl		$23, %ecx				// shift count: width of the fraction
	andl		$0x7f800000, %eax		// keep only the biased exponent field
	subl		$0x3f800000, %eax		// subtract the bias of 1.0f
	js			Ltruncf_below_one		// negative means |x| < 1.0f
	movl		$0xffffffff, %edx		// start from an all-ones mask
	sarl		%cl, %eax				// eax = exponent(x)
	subl		%eax, %ecx				// ecx = 23 - exponent(x)
	js			Ltruncf_return			// |x| >= 0x1.0p24: x is returned as is
	shll		%cl, %edx				// zero the low (23 - exponent) mask bits
	andl		%edx, 4(%esp)			// drop the non-integral bits of x
	cvttps2dq	%xmm0, %xmm0			// conversion of the original x raises inexact
Ltruncf_return:
	flds		4(%esp)
	ret

.align 4
Ltruncf_below_one:
	// |x| < 1.0f: the result is a zero carrying the sign of x.
	andl		$0x80000000, 4(%esp)	// copysign(0.0, x)
	cvttps2dq	%xmm0, %xmm0			// conversion of the original x raises inexact
	flds		4(%esp)
	ret

#elif defined __x86_64__

.const
.align 4
// Sign-bit mask for a single-precision value. andps with a memory operand
// loads a full 16 bytes, so the constant is padded out to 16 bytes to keep
// that load inside defined data; the three upper lanes are zero.
mzero:	.long	0x80000000, 0, 0, 0

.text
.align 4
.globl _truncf
// float truncf(float x)    -- x86_64 entry point
//
// Rounds x toward zero to an integral value.
//   In:    x in the low lane of xmm0
//   Out:   result in the low lane of xmm0
//   Uses:  eax, ecx, edx, xmm2, flags
//
// The inexact flag is raised with the scalar cvttss2si so that only the
// low lane of xmm0 is converted. The packed cvttps2dq would also convert
// whatever the caller left in the upper three lanes, which can set
// floating-point status flags that have nothing to do with x.
_truncf:
	movd		%xmm0,			%eax	// eax = bit pattern of x
	and			$0x7f800000,    %eax	// keep only the biased exponent field
	mov			$23,			%ecx
	sub			$0x3f800000,	%eax	// subtract the bias of 1.0f; negative iff |x| < 1.0f
	js			2f
	sar			%cl,			%eax	// exponent(x)
	mov			$0xffffffff,	%edx
	sub			%eax,			%ecx	// 23 - exponent(x) = number of fraction bits to clear
	js			1f						// return x if |x| >= 0x1.0p24 (also inf and NaN)
	shl			%cl,			%edx	// mask with the fraction bits cleared
	movd		%edx,			%xmm2
	cvttss2si	%xmm0,			%eax	// raise inexact if x is not integral (eax is dead here)
	andps		%xmm2,			%xmm0	// mask off non-integral bits
1:	ret
.align 4
2:	// Handle |x| < 1.0 here.
	cvttss2si	%xmm0,			%eax	// raise inexact if x is nonzero (eax is dead here)
	andps		mzero(%rip),	%xmm0	// copysign(0.0, x)
	ret
	
#else
	#error unknown arch
#endif