// IOAudioBlitterLibX86.cpp
#include <TargetConditionals.h>
#if __i386__ || __LP64__
#define _MM_MALLOC_H_INCLUDED 1 // we don't want this header
#include <xmmintrin.h>
#include "IOAudioBlitterLib.h"
#include <libkern/OSByteOrder.h>
#define kMaxFloat32 2147483520.0f
static inline __m128i byteswap16( __m128i x )
{
	// Reverse the two bytes inside each of the eight 16-bit lanes by
	// sliding each byte into its partner's slot and merging the halves.
	const __m128i lowToHigh = _mm_slli_epi16( x, 8 );
	const __m128i highToLow = _mm_srli_epi16( x, 8 );
	return _mm_or_si128( lowToHigh, highToLow );
}
static inline __m128i byteswap32( __m128i x )
{
	// Reverse the four bytes of each 32-bit lane: first exchange the two
	// 16-bit halves of every lane (shuffle code 0xB1 swaps adjacent words
	// in both the low and high quadwords), then swap the two bytes inside
	// each 16-bit word (the byteswap16 operation, inlined here).
	const __m128i halves = _mm_shufflehi_epi16( _mm_shufflelo_epi16( x, 0xB1 ), 0xB1 );
	return _mm_or_si128( _mm_slli_epi16( halves, 8 ), _mm_srli_epi16( halves, 8 ) );
}
#pragma mark -
#pragma mark Float -> Int
/*
 * Float32ToNativeInt16_X86
 *
 * Converts numToConvert Float32 samples at src (nominal range [-1.0, 1.0))
 * into native-endian SInt16 samples at dst: scale by 32768, round to
 * nearest (half-up), saturate to [-32768, 32767].  src and dst may have
 * any alignment; an SSE2 path handles 8+ samples, a scalar loop the rest.
 */
void Float32ToNativeInt16_X86( const Float32 *src, SInt16 *dst, unsigned int numToConvert )
{
	const float *src0 = src;	// start pointers, kept for the final
	int16_t *dst0 = dst;		// overlapped cleanup pass
	unsigned int count = numToConvert;

	if (count >= 8) {
		// Round toward -infinity so that, after the +0.5 bias added below,
		// _mm_cvtps_epi32 computes floor(x + 0.5) == round-half-up.
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
		const __m128 vmax = (const __m128) { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
		const __m128 vscale = (const __m128) { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;

		// 8 floats (vf0, vf1) -> 8 int16 (vpack0):
		// scale, bias, clamp, convert, then pack with signed saturation.
#define F32TOLE16 \
	vf0 = _mm_mul_ps(vf0, vscale); \
	vf1 = _mm_mul_ps(vf1, vscale); \
	vf0 = _mm_add_ps(vf0, vround); \
	vf1 = _mm_add_ps(vf1, vround); \
	vf0 = _mm_max_ps(vf0, vmin); \
	vf1 = _mm_max_ps(vf1, vmin); \
	vf0 = _mm_min_ps(vf0, vmax); \
	vf1 = _mm_min_ps(vf1, vmax); \
	vi0 = _mm_cvtps_epi32(vf0); \
	vi1 = _mm_cvtps_epi32(vf1); \
	vpack0 = _mm_packs_epi32(vi0, vi1);

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: convert one full group with unaligned
			// accesses, then advance only n elements so dst becomes
			// 16-byte aligned.  Samples n..7 are simply converted twice.
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOLE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
			unsigned int n = (16 - ialign) / 2;	// int16 elements to dst alignment
			src += n;
			dst += n;
			count -= n;
			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 8) {
					vf0 = _mm_loadu_ps(src);
					vf1 = _mm_loadu_ps(src+4);
					F32TOLE16
					_mm_store_si128((__m128i *)dst, vpack0);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
		// Both pointers 16-byte aligned: fully aligned main loop.
		while (count >= 8) {
			vf0 = _mm_load_ps(src);
			vf1 = _mm_load_ps(src+4);
			F32TOLE16
			_mm_store_si128((__m128i *)dst, vpack0);
			src += 8;
			dst += 8;
			count -= 8;
		}
VectorCleanup:
		if (count > 0) {
			// 1..7 samples left: reconvert the buffer's final full group
			// (overlapping already-finished samples) instead of falling
			// back to scalar.  Valid because numToConvert >= 8 here.
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOLE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
		}
		RESTORE_ROUNDMODE
		return;
	}

	// Scalar path for fewer than 8 samples.
	if (count > 0) {
		// Scale to the 32-bit range, bias by half a 16-bit LSB (32768),
		// saturate, then shift the result down to 16 bits.
		// NOTE(review): FloatToInt is declared in IOAudioBlitterLib.h;
		// min32 is passed as 0. -- confirm its role as the lower clamp there.
		double scale = 2147483648.0, round = 32768.0, max32 = 2147483648.0 - 1.0 - 32768.0, min32 = 0.;
		ROUNDMODE_NEG_INF
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			i0 >>= 16;
			*dst++ = i0;
		}
		RESTORE_ROUNDMODE
	}
}
/*
 * Float32ToSwapInt16_X86
 *
 * Converts numToConvert Float32 samples at src (nominal range [-1.0, 1.0))
 * into byte-swapped (big-endian, since this file compiles only on x86 --
 * see the guard at the top of the file) SInt16 samples at dst: scale by
 * 32768, round half-up, saturate, then swap each 16-bit result.
 * Pointers may have any alignment.
 */
void Float32ToSwapInt16_X86( const Float32 *src, SInt16 *dst, unsigned int numToConvert )
{
	const float *src0 = src;	// start pointers for the overlapped cleanup
	int16_t *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 8) {
		// Round toward -infinity; with the +0.5 bias this is round-half-up.
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -32768.0f, -32768.0f, -32768.0f, -32768.0f };
		const __m128 vmax = (const __m128) { 32767.0f, 32767.0f, 32767.0f, 32767.0f };
		const __m128 vscale = (const __m128) { 32768.0f, 32768.0f, 32768.0f, 32768.0f };
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;

		// 8 floats -> 8 int16, then byteswap the packed vector.
#define F32TOBE16 \
	vf0 = _mm_mul_ps(vf0, vscale); \
	vf1 = _mm_mul_ps(vf1, vscale); \
	vf0 = _mm_add_ps(vf0, vround); \
	vf1 = _mm_add_ps(vf1, vround); \
	vf0 = _mm_max_ps(vf0, vmin); \
	vf1 = _mm_max_ps(vf1, vmin); \
	vf0 = _mm_min_ps(vf0, vmax); \
	vf1 = _mm_min_ps(vf1, vmax); \
	vi0 = _mm_cvtps_epi32(vf0); \
	vi1 = _mm_cvtps_epi32(vf1); \
	vpack0 = _mm_packs_epi32(vi0, vi1); \
	vpack0 = byteswap16(vpack0);

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 8, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOBE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
			unsigned int n = (16 - ialign) / 2;
			src += n;
			dst += n;
			count -= n;
			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 8) {
					vf0 = _mm_loadu_ps(src);
					vf1 = _mm_loadu_ps(src+4);
					F32TOBE16
					_mm_store_si128((__m128i *)dst, vpack0);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 8) {
			vf0 = _mm_load_ps(src);
			vf1 = _mm_load_ps(src+4);
			F32TOBE16
			_mm_store_si128((__m128i *)dst, vpack0);
			src += 8;
			dst += 8;
			count -= 8;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final 8 samples (numToConvert >= 8).
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vf0 = _mm_loadu_ps(src);
			vf1 = _mm_loadu_ps(src+4);
			F32TOBE16
			_mm_storeu_si128((__m128i *)dst, vpack0);
		}
		RESTORE_ROUNDMODE
		return;
	}

	// Scalar path for fewer than 8 samples.
	if (count > 0) {
		double scale = 2147483648.0, round = 32768.0, max32 = 2147483648.0 - 1.0 - 32768.0, min32 = 0.;
		ROUNDMODE_NEG_INF
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			i0 >>= 16;
			// BUG FIX: the previous "#if __ppc__" guard never fired in this
			// x86-only translation unit, so buffers shorter than 8 samples
			// were stored un-swapped -- inconsistent with the vector path
			// above, which always byteswaps.  Swap unconditionally so both
			// paths emit the same big-endian format.
			*dst++ = OSSwapInt16(i0);
		}
		RESTORE_ROUNDMODE
	}
}
/*
 * Float32ToNativeInt32_X86
 *
 * Converts numToConvert Float32 samples at src (nominal range [-1.0, 1.0))
 * into native-endian SInt32 samples at dst: scale by 2^31, round half-up,
 * saturate.  SSE2 path handles groups of 4; a scalar loop covers the rest.
 */
void Float32ToNativeInt32_X86( const Float32 *src, SInt32 *dst, unsigned int numToConvert )
{
	const float *src0 = src;	// start pointers for the overlapped cleanup
	SInt32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// Round toward -infinity; with the +0.5 bias this is round-half-up.
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		// kMaxFloat32 is the largest float below 2^31, keeping the upper
		// clamp representable before the integer conversion.
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
		__m128 vf0;
		__m128i vi0;

		// 4 floats (vf<x>) -> 4 int32 (vi<x>): scale, bias, clamp, convert.
		// Also used by Float32ToNativeInt24_X86 below -- do not #undef.
#define F32TOLE32(x) \
	vf##x = _mm_mul_ps(vf##x, vscale); \
	vf##x = _mm_add_ps(vf##x, vround); \
	vf##x = _mm_max_ps(vf##x, vmin); \
	vf##x = _mm_min_ps(vf##x, vmax); \
	vi##x = _mm_cvtps_epi32(vf##x); \
	/* NOTE(review): this comment intentionally terminates F32TOLE32 -- the trailing backslash above would otherwise splice the following declaration into the macro body */

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 4, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
			unsigned int n = (16 - ialign) / 4;	// int32 elements to alignment
			src += n;
			dst += n;
			count -= n;
			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 4) {
					vf0 = _mm_loadu_ps(src);
					F32TOLE32(0)
					_mm_store_si128((__m128i *)dst, vi0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 4) {
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			_mm_store_si128((__m128i *)dst, vi0);
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final group of 4 (numToConvert >= 4).
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
		}
		RESTORE_ROUNDMODE
		return;
	}

	// Scalar path for fewer than 4 samples.
	if (count > 0) {
		// NOTE(review): FloatToInt is declared in IOAudioBlitterLib.h;
		// min32 is passed as 0. -- confirm its role as the lower clamp there.
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			*dst++ = i0;
		}
		RESTORE_ROUNDMODE
	}
}
/*
 * Float32ToSwapInt32_X86
 *
 * Converts numToConvert Float32 samples at src (nominal range [-1.0, 1.0))
 * into byte-swapped (big-endian on this x86-only build) SInt32 samples at
 * dst: scale by 2^31, round half-up, saturate, then swap each 32-bit
 * result.  Pointers may have any alignment.
 */
void Float32ToSwapInt32_X86( const Float32 *src, SInt32 *dst, unsigned int numToConvert )
{
	const float *src0 = src;	// start pointers for the overlapped cleanup
	SInt32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// Round toward -infinity; with the +0.5 bias this is round-half-up.
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
		__m128 vf0;
		__m128i vi0;

		// 4 floats -> 4 int32, then byteswap each 32-bit lane.
#define F32TOBE32(x) \
	vf##x = _mm_mul_ps(vf##x, vscale); \
	vf##x = _mm_add_ps(vf##x, vround); \
	vf##x = _mm_max_ps(vf##x, vmin); \
	vf##x = _mm_min_ps(vf##x, vmax); \
	vi##x = _mm_cvtps_epi32(vf##x); \
	vi##x = byteswap32(vi##x);

		int falign = (uintptr_t)src & 0xF;
		int ialign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 4, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vf0 = _mm_loadu_ps(src);
			F32TOBE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
			unsigned int n = (16 - ialign) / 4;
			src += n;
			dst += n;
			count -= n;
			falign = (uintptr_t)src & 0xF;
			if (falign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 4) {
					vf0 = _mm_loadu_ps(src);
					F32TOBE32(0)
					_mm_store_si128((__m128i *)dst, vi0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 4) {
			vf0 = _mm_load_ps(src);
			F32TOBE32(0)
			_mm_store_si128((__m128i *)dst, vi0);
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final group of 4 (numToConvert >= 4).
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vf0 = _mm_loadu_ps(src);
			F32TOBE32(0)
			_mm_storeu_si128((__m128i *)dst, vi0);
		}
		RESTORE_ROUNDMODE
		return;
	}

	// Scalar path for fewer than 4 samples.
	if (count > 0) {
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			SInt32 i0 = FloatToInt(f0, min32, max32);
			// BUG FIX: the previous "#if __ppc__" guard never fired in this
			// x86-only translation unit, so buffers shorter than 4 samples
			// were stored un-swapped while the vector path above always
			// byteswaps.  Swap unconditionally for a consistent format.
			*dst++ = OSSwapInt32(i0);
		}
		RESTORE_ROUNDMODE
	}
}
static inline __m128i Pack32ToLE24(__m128i val, __m128i mask)
{
	// Squeeze the top three bytes of each of the four 32-bit lanes of val
	// into 12 contiguous little-endian bytes (lane k lands at byte offset
	// 3*k; the least-significant byte of each lane is dropped).  mask must
	// be 0x00FFFFFF in lane 0 and zero elsewhere.  Field k comes from val
	// shifted right by k+1 bytes under mask shifted left by 3*k bytes, so
	// every _mm_s*li_si128 shift count remains a compile-time immediate.
	const __m128i field0 = _mm_and_si128(_mm_srli_si128(val, 1), mask);
	const __m128i field1 = _mm_and_si128(_mm_srli_si128(val, 2), _mm_slli_si128(mask, 3));
	const __m128i field2 = _mm_and_si128(_mm_srli_si128(val, 3), _mm_slli_si128(mask, 6));
	const __m128i field3 = _mm_and_si128(_mm_srli_si128(val, 4), _mm_slli_si128(mask, 9));
	return _mm_or_si128(_mm_or_si128(field0, field1), _mm_or_si128(field2, field3));
}
/*
 * Float32ToNativeInt24_X86
 *
 * Converts numToConvert Float32 samples at src into packed little-endian
 * 24-bit samples at dst (3 bytes each): scale by 2^31, round half-up,
 * saturate, then keep the top 3 bytes of each 32-bit result.
 * Relies on the F32TOLE32 macro defined in Float32ToNativeInt32_X86 above.
 */
void Float32ToNativeInt24_X86( const Float32 *src, UInt8 *dst, unsigned int numToConvert )
{
	const Float32 *src0 = src;	// start pointers for the overlapped cleanup
	UInt8 *dst0 = dst;
	unsigned int count = numToConvert;

	// Vector path requires >= 6 samples: each 4-sample step stores a full
	// 16 bytes while producing only 12, so 2 extra samples (6 bytes) of
	// headroom must remain in dst to keep the wide store in bounds.
	if (count >= 6) {
		ROUNDMODE_NEG_INF
		const __m128 vround = (const __m128) { 0.5f, 0.5f, 0.5f, 0.5f };
		const __m128 vmin = (const __m128) { -2147483648.0f, -2147483648.0f, -2147483648.0f, -2147483648.0f };
		const __m128 vmax = (const __m128) { kMaxFloat32, kMaxFloat32, kMaxFloat32, kMaxFloat32 };
		const __m128 vscale = (const __m128) { 2147483648.0f, 2147483648.0f, 2147483648.0f, 2147483648.0f };
		__m128i mask = _mm_setr_epi32(0x00FFFFFF, 0, 0, 0);	// lane-0 24-bit field for Pack32ToLE24
		__m128i store;
		union {
			UInt32 i[4];
			__m128i v;
		} u;
		__m128 vf0;
		__m128i vi0;

		int falign = (uintptr_t)src & 0xF;
		if (falign != 0) {
			// Align src only -- the 24-bit dst stream is written with
			// unaligned stores regardless.  Bytes written past 3*n here
			// are scratch and are rewritten by the later stores.
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			store = Pack32ToLE24(vi0, mask);
			_mm_storeu_si128((__m128i *)dst, store);
			unsigned int n = (16 - falign) / 4;	// float elements to src alignment
			src += n;
			dst += 3*n; count -= n;
		}
		// Main loop: 16-byte store, but only a 12-byte (4-sample) advance.
		while (count >= 6) {
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			store = Pack32ToLE24(vi0, mask);
			_mm_storeu_si128((__m128i *)dst, store);
			src += 4;
			dst += 12; count -= 4;
		}
		if (count >= 4) {
			// Last full group without headroom: store exactly 12 bytes via
			// three 32-bit writes (x86 tolerates the misalignment).
			vf0 = _mm_load_ps(src);
			F32TOLE32(0)
			u.v = Pack32ToLE24(vi0, mask);
			((UInt32 *)dst)[0] = u.i[0];
			((UInt32 *)dst)[1] = u.i[1];
			((UInt32 *)dst)[2] = u.i[2];
			src += 4;
			dst += 12; count -= 4;
		}
		if (count > 0) {
			// 1..3 samples left: reconvert the buffer's final 4 samples
			// (overlap is harmless; numToConvert >= 6 guarantees room).
			src = src0 + numToConvert - 4;
			dst = dst0 + 3*numToConvert - 12;
			vf0 = _mm_loadu_ps(src);
			F32TOLE32(0)
			u.v = Pack32ToLE24(vi0, mask);
			((UInt32 *)dst)[0] = u.i[0];
			((UInt32 *)dst)[1] = u.i[1];
			((UInt32 *)dst)[2] = u.i[2];
		}
		RESTORE_ROUNDMODE
		return;
	}

	// Scalar path for fewer than 6 samples.
	if (count > 0) {
		double scale = 2147483648.0, round = 0.5, max32 = 2147483648.0 - 1.0 - 0.5, min32 = 0.;
		ROUNDMODE_NEG_INF
		while (count-- > 0) {
			double f0 = *src++;
			f0 = f0 * scale + round;
			UInt32 i0 = FloatToInt(f0, min32, max32);
			// Emit the high 3 bytes (bits 8..31) in little-endian order.
			dst[0] = (UInt8)(i0 >> 8);
			dst[1] = (UInt8)(i0 >> 16);
			dst[2] = (UInt8)(i0 >> 24);
			dst += 3;
		}
		RESTORE_ROUNDMODE
	}
}
#pragma mark -
#pragma mark Int -> Float
/*
 * NativeInt16ToFloat32_X86
 *
 * Converts numToConvert native-endian SInt16 samples at src into Float32
 * samples at dst, scaled into [-1.0, 1.0).  SSE2 path handles groups of
 * 8; a scalar loop covers short buffers.  Any pointer alignment is fine.
 */
void NativeInt16ToFloat32_X86( const SInt16 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt16 *src0 = src;	// start pointers for the overlapped cleanup
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 8) {
		// 8 int16 (vpack<x>) -> 8 floats (vf<x>, vf<y>): interleaving zero
		// words below each int16 puts its value in the high half of a
		// 32-bit lane (value << 16), so multiplying by 1/2^31 divides the
		// original sample by 32768.
#define LEI16TOF32(x, y) \
	vi##x = _mm_unpacklo_epi16(zero, vpack##x); \
	vi##y = _mm_unpackhi_epi16(zero, vpack##x); \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##y = _mm_cvtepi32_ps(vi##y); \
	vf##x = _mm_mul_ps(vf##x, vscale); \
	vf##y = _mm_mul_ps(vf##y, vscale);

		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
		const __m128i zero = _mm_setzero_si128();
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 8, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			LEI16TOF32(0, 1)
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
			unsigned int n = (16 - falign) / 4;	// float elements to alignment
			src += n;
			dst += n;
			count -= n;
			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 8) {
					vpack0 = _mm_loadu_si128((__m128i const *)src);
					LEI16TOF32(0, 1)
					_mm_store_ps(dst, vf0);
					_mm_store_ps(dst+4, vf1);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 8) {
			vpack0 = _mm_load_si128((__m128i const *)src);
			LEI16TOF32(0, 1)
			_mm_store_ps(dst, vf0);
			_mm_store_ps(dst+4, vf1);
			src += 8;
			dst += 8;
			count -= 8;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final 8 samples (numToConvert >= 8).
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			LEI16TOF32(0, 1)
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
		}
		return;
	}

	// Scalar path for fewer than 8 samples: divide by 32768.
	if (count > 0) {
		double scale = 1./32768.f;
		while (count-- > 0) {
			SInt16 i = *src++;
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
/*
 * SwapInt16ToFloat32_X86
 *
 * Converts numToConvert byte-swapped (big-endian on this x86-only build)
 * SInt16 samples at src into Float32 samples at dst, scaled into
 * [-1.0, 1.0).  Any pointer alignment is fine.
 */
void SwapInt16ToFloat32_X86( const SInt16 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt16 *src0 = src;	// start pointers for the overlapped cleanup
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 8) {
		// Byteswap to native order, then unpack each int16 into the high
		// half of a 32-bit lane (value << 16) and scale by 1/2^31 so the
		// result is the sample divided by 32768.
#define BEI16TOF32 \
	vpack0 = byteswap16(vpack0); \
	vi0 = _mm_unpacklo_epi16(zero, vpack0); \
	vi1 = _mm_unpackhi_epi16(zero, vpack0); \
	vf0 = _mm_cvtepi32_ps(vi0); \
	vf1 = _mm_cvtepi32_ps(vi1); \
	vf0 = _mm_mul_ps(vf0, vscale); \
	vf1 = _mm_mul_ps(vf1, vscale);

		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
		const __m128i zero = _mm_setzero_si128();
		__m128 vf0, vf1;
		__m128i vi0, vi1, vpack0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 8, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			BEI16TOF32
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;
			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 8) {
					vpack0 = _mm_loadu_si128((__m128i const *)src);
					BEI16TOF32
					_mm_store_ps(dst, vf0);
					_mm_store_ps(dst+4, vf1);
					src += 8;
					dst += 8;
					count -= 8;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 8) {
			vpack0 = _mm_load_si128((__m128i const *)src);
			BEI16TOF32
			_mm_store_ps(dst, vf0);
			_mm_store_ps(dst+4, vf1);
			src += 8;
			dst += 8;
			count -= 8;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final 8 samples (numToConvert >= 8).
			src = src0 + numToConvert - 8;
			dst = dst0 + numToConvert - 8;
			vpack0 = _mm_loadu_si128((__m128i const *)src);
			BEI16TOF32
			_mm_storeu_ps(dst, vf0);
			_mm_storeu_ps(dst+4, vf1);
		}
		return;
	}

	// Scalar path for fewer than 8 samples.
	if (count > 0) {
		double scale = 1./32768.f;
		while (count-- > 0) {
			SInt16 i = *src++;
			// BUG FIX: the previous "#if __ppc__" guard never fired in this
			// x86-only translation unit, so short buffers skipped the byte
			// swap while the vector path above always swaps.  Swap
			// unconditionally so both paths agree.
			i = OSSwapInt16(i);
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
/*
 * NativeInt32ToFloat32_X86
 *
 * Converts numToConvert native-endian SInt32 samples at src into Float32
 * samples at dst, scaled into [-1.0, 1.0) by 1/2^31.  SSE2 path handles
 * groups of 4; a scalar loop covers short buffers.
 */
void NativeInt32ToFloat32_X86( const SInt32 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt32 *src0 = src;	// start pointers for the overlapped cleanup
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// 4 int32 (vi<x>) -> 4 floats (vf<x>), scaled by 1/2^31.
#define LEI32TOF32(x) \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##x = _mm_mul_ps(vf##x, vscale); \
	/* NOTE(review): this comment intentionally terminates LEI32TOF32 -- the trailing backslash above would otherwise splice the following declaration into the macro body */

		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
		__m128 vf0;
		__m128i vi0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 4, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vi0 = _mm_loadu_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
			unsigned int n = (16 - falign) / 4;	// float elements to alignment
			src += n;
			dst += n;
			count -= n;
			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 4) {
					vi0 = _mm_loadu_si128((__m128i const *)src);
					LEI32TOF32(0)
					_mm_store_ps(dst, vf0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 4) {
			vi0 = _mm_load_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_store_ps(dst, vf0);
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final group of 4 (numToConvert >= 4).
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vi0 = _mm_loadu_si128((__m128i const *)src);
			LEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
		}
		return;
	}

	// Scalar path for fewer than 4 samples: divide by 2^31.
	if (count > 0) {
		double scale = 1./2147483648.0f;
		while (count-- > 0) {
			SInt32 i = *src++;
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
/*
 * SwapInt32ToFloat32_X86
 *
 * Converts numToConvert byte-swapped (big-endian on this x86-only build)
 * SInt32 samples at src into Float32 samples at dst, scaled into
 * [-1.0, 1.0) by 1/2^31.  Any pointer alignment is fine.
 */
void SwapInt32ToFloat32_X86( const SInt32 *src, Float32 *dst, unsigned int numToConvert )
{
	const SInt32 *src0 = src;	// start pointers for the overlapped cleanup
	Float32 *dst0 = dst;
	unsigned int count = numToConvert;

	if (count >= 4) {
		// 4 big-endian int32 -> 4 floats: byteswap, convert, scale by 1/2^31.
		// (FIX: removed the stray trailing backslash after the macro's last
		// line -- it spliced the vscale declaration below into the macro
		// body, redeclaring vscale at every expansion.)
#define BEI32TOF32(x) \
	vi##x = byteswap32(vi##x); \
	vf##x = _mm_cvtepi32_ps(vi##x); \
	vf##x = _mm_mul_ps(vf##x, vscale);

		const __m128 vscale = (const __m128) { 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f, 1.0/2147483648.0f };
		__m128 vf0;
		__m128i vi0;

		int ialign = (uintptr_t)src & 0xF;
		int falign = (uintptr_t)dst & 0xF;
		if (falign != 0 || ialign != 0) {
			// Misaligned prologue: one unaligned group of 4, then advance n
			// elements so dst becomes 16-byte aligned (overlap is harmless).
			vi0 = _mm_loadu_si128((__m128i const *)src);
			BEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
			unsigned int n = (16 - falign) / 4;
			src += n;
			dst += n;
			count -= n;
			ialign = (uintptr_t)src & 0xF;
			if (ialign != 0) {
				// src stayed misaligned: unaligned loads, aligned stores.
				while (count >= 4) {
					vi0 = _mm_loadu_si128((__m128i const *)src);
					BEI32TOF32(0)
					_mm_store_ps(dst, vf0);
					src += 4;
					dst += 4;
					count -= 4;
				}
				goto VectorCleanup;
			}
		}
		// Fully aligned main loop.
		while (count >= 4) {
			vi0 = _mm_load_si128((__m128i const *)src);
			BEI32TOF32(0)
			_mm_store_ps(dst, vf0);
			src += 4;
			dst += 4;
			count -= 4;
		}
VectorCleanup:
		if (count > 0) {
			// Reconvert the buffer's final group of 4 (numToConvert >= 4).
			src = src0 + numToConvert - 4;
			dst = dst0 + numToConvert - 4;
			vi0 = _mm_loadu_si128((__m128i const *)src);
			BEI32TOF32(0)
			_mm_storeu_ps(dst, vf0);
		}
		return;
	}

	// Scalar path for fewer than 4 samples.
	if (count > 0) {
		double scale = 1./2147483648.0f;
		while (count-- > 0) {
			SInt32 i = *src++;
			// BUG FIX: the previous "#if __ppc__" guard never fired in this
			// x86-only translation unit, so short buffers skipped the byte
			// swap while the vector path above always swaps.  Swap
			// unconditionally so both paths agree.
			i = OSSwapInt32(i);
			double f = (double)i * scale;
			*dst++ = f;
		}
	}
}
#endif // __i386__ || __LP64__