/*
* Copyright (c) 2009 Apple Inc. All rights reserved.
*
* @APPLE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
* compliance with the License. Please obtain a copy of the License at
* http://www.opensource.apple.com/apsl/ and read it before using this
* file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
* Please see the License for the specific language governing rights and
* limitations under the License.
*
* @APPLE_LICENSE_HEADER_END@
*/
/*****************************************************************************
* Cortex-A8 implementation *
*****************************************************************************/
// Cortex-A8 implementations of memcpy( ), memmove( ) and bcopy( ).
//
// Our tests have shown that NEON is always a performance win for memcpy( ).
// However, for the specific case of copies from a warm source to a cold
// destination when the buffer size is between 1k and 32k, it is not enough
// of a performance win to offset the increased power footprint, resulting
// in an energy usage regression. Thus, we detect that particular case, and
// pass those copies through the ARM core registers. All other copies larger
// than 8 bytes are handled on NEON.
//
// Stephen Canon, August 2009
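//
// In C terms the resulting dispatch is roughly the following (a sketch of
// this file's policy, with the size test applied to the length remaining
// after alignment fixups -- not a contract):
//
//     if (n < 8)                        { /* byte-by-byte copy          */ }
//     else if (n >= 1024 && n < 32768)  { /* ldm/stm core-register loop */ }
//     else                              { /* NEON vector loop           */ }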
.text
.code 16
.syntax unified
// void bcopy(const void * source,
// void * destination,
//            size_t length);
//
// void *memmove(void * destination,
//               const void * source,
//               size_t n);
//
// void *memcpy(void * restrict destination,
//              const void * restrict source,
//              size_t n);
//
// All copy n successive bytes from source to destination. memmove and memcpy
// return destination, whereas bcopy has no return value. Copying takes place
// as if it were through a temporary buffer -- after return, destination
// contains exactly the bytes from source, even if the buffers overlap.
.thumb_func _bcopy
.globl _bcopy
.thumb_func _memmove
.globl _memmove
.thumb_func _memcpy
.globl _memcpy
.align 2
_bcopy:
mov r3, r0 // swap the first and second arguments
mov r0, r1 // and fall through into memmove
mov r1, r3 //
.align 2
_memmove:
_memcpy:
subs r3, r0, r1 // offset = destination addr - source addr
it eq
bxeq lr // if source == destination, early out
// Our preference is for using a (faster) front-to-back copy. However, if
// 0 < offset < length, it is necessary to copy back-to-front for correctness.
// We have already ruled out offset == 0, so we can use a single unsigned
// compare with length -- if offset compares higher or same, it is either at
// least as large as length or negative, and either way a front-to-back copy
// is safe.
cmp r3, r2
bhs L_copyFrontToBack
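// In C terms, the test above amounts to the following (a sketch; function
// names are illustrative, and the subtraction is unsigned machine
// arithmetic):
//
//     uintptr_t offset = (uintptr_t)destination - (uintptr_t)source;
//     if (offset >= n)
//         copyFrontToBack();  // disjoint buffers, or destination below source
//     else
//         copyBackToFront();  // 0 < offset < n: destination overlaps the
//                             // tail of source
//
// A negative difference wraps to a large unsigned value, so one unsigned
// compare covers both safe cases at once.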
/*****************************************************************************
* back to front copy *
*****************************************************************************/
mov ip, r0 // copy destination pointer.
add r1, r2 // move source pointer to end of source array
add ip, r2 // move destination pointer to end of dest array
subs r2, $8 // if length - 8 is negative (i.e. length
blt L_scalarReverseCopy // is less than 8), jump to cleanup path.
tst ip, $7 // if (destination + length) is doubleword
beq L_vectorReverseCopy // aligned, jump to fast path.
0: ldrb r3, [r1, $-1]! // load byte
sub r2, $1 // decrement length
strb r3, [ip, $-1]! // store byte
tst ip, $7 // test alignment
bne 0b
cmp r2, $0 // if length - 8 is negative,
blt L_scalarReverseCopy // jump to the cleanup code
/*****************************************************************************
* destination is doubleword aligned *
*****************************************************************************/
L_vectorReverseCopy:
ands r3, r1, $3 // Extract the alignment of the source
bic r1, $3
tbh [pc, r3, lsl $1] // Dispatch table on source alignment
0:
.short (L_reverseAligned0-0b)/2 // The NEON alignment hardware does not work
.short (L_reverseAligned1-0b)/2 // properly with sub 4-byte alignment and
.short (L_reverseAligned2-0b)/2 // buffers that are uncacheable, so we need
.short (L_reverseAligned3-0b)/2 // to have a software workaround.
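// (tbh fetches the halfword at [pc + 2*r3] -- one of the .short entries
// above, each of which is the distance to its target divided by 2 -- and
// advances pc by twice that value; the dispatch is effectively
// pc += 2*table[source & 3].)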
/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/
L_reverseAligned0:
subs r2, $0x38 // if length - 64 is negative, jump to
blt L_reverseVectorCleanup // the cleanup path.
tst ip, $0x38 // if (destination + length) is cacheline
beq L_reverseCachelineAligned // aligned, jump to the fast path.
0: sub r1, $8 // copy eight bytes at a time until the
vld1.32 {d0}, [r1] // destination is 8 byte aligned.
sub ip, $8 //
sub r2, $8 //
tst ip, $0x38 //
vst1.64 {d0}, [ip, :64] //
bne 0b //
cmp r2, $0 // if length - 64 is negative,
blt L_reverseVectorCleanup // jump to the cleanup code
L_reverseCachelineAligned:
sub r3, r2, $0x3c0 // If 1024 < length < 32768, use core
cmp r3, $0x7c00 // register copies instead of NEON to
blo L_useSTMDB // control energy usage.
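// (Here r2 holds the remaining length minus 64, so r3 = remaining - 1024;
// the unsigned compare and blo form a one-branch test for
// 1024 <= remaining < 1024 + 0x7c00 = 32768.)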
sub r1, $32 // decrement source
sub ip, $32 // decrement destination
mov r3, $-32 // load address increment
tst r1, $0x1f // if source shares 32 byte alignment
beq L_reverseSourceAligned // jump to loop with more alignment hints
vld1.32 {q2,q3}, [r1], r3 // This loop handles 4-byte aligned copies
vld1.32 {q0,q1}, [r1], r3 // as generally as possible.
subs r2, $64 //
vst1.64 {q2,q3}, [ip,:256], r3 // The Cortex-A8 NEON unit does not always
blt 1f // properly handle misalignment in vld1
.align 3 // with an element size of 8 or 16, so
0: vld1.32 {q2,q3}, [r1], r3 // this is the best we can do without
vst1.64 {q0,q1}, [ip,:256], r3 // handling alignment in software.
vld1.32 {q0,q1}, [r1], r3 //
subs r2, $64 //
vst1.64 {q2,q3}, [ip,:256], r3 //
bge 0b //
b 1f //
L_reverseSourceAligned:
vld1.64 {q2,q3}, [r1,:256], r3 // Identical to loop above except for
vld1.64 {q2,q3}, [r1,:256], r3 wait
blt 1f //
.align 3 //
0: vld1.64 {q2,q3}, [r1,:256], r3 //
vst1.64 {q0,q1}, [ip,:256], r3 //
vld1.64 {q0,q1}, [r1,:256], r3 //
subs r2, $64 //
vst1.64 {q2,q3}, [ip,:256], r3 //
bge 0b //
1: vst1.64 {q0,q1}, [ip,:256], r3 // loop cleanup: final 32 byte store
add r1, $32 // point source at last element stored
add ip, $32 // point destination at last element stored
L_reverseVectorCleanup:
adds r2, $0x38 // If (length - 8) < 0, goto scalar cleanup
blt L_scalarReverseCopy //
0: sub r1, $8 // copy eight bytes at a time until
vld1.32 {d0}, [r1] // (length - 8) < 0.
sub ip, $8 //
subs r2, $8 //
vst1.64 {d0}, [ip, :64] //
bge 0b //
/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/
L_scalarReverseCopy:
adds r2, #0x8 // restore length
it eq // if this is zero
bxeq lr // early out
0: ldrb r3, [r1, #-1]! // load a byte from source
strb r3, [ip, #-1]! // store to destination
subs r2, #0x1 // subtract one from length
bne 0b // if non-zero, repeat
bx lr // return
/*****************************************************************************
* STMDB loop for 1k-32k buffers *
*****************************************************************************/
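// Eight core registers (r3-r8,r10,r11) move 32 bytes per ldmdb/stmdb pair;
// each iteration below runs two such pairs (64 bytes, matching the subs of
// 0x40) and issues a pld for the next cacheline down.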
L_useSTMDB:
push {r4-r8,r10,r11}
.align 3
0: ldmdb r1!, {r3-r8,r10,r11}
subs r2, #0x40
stmdb ip!, {r3-r8,r10,r11}
ldmdb r1!, {r3-r8,r10,r11}
pld [r1, #-0x40]
stmdb ip!, {r3-r8,r10,r11}
bge 0b
pop {r4-r8,r10,r11}
b L_reverseVectorCleanup
/*****************************************************************************
* Misaligned reverse vld1 loop                                              *
*****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
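// For example, with offset == 1 and d2/d3 holding the aligned doublewords at
// addresses b and b+8, vext.8 d0, d2, d3, $1 places bytes b+1 through b+8 in
// d0 -- the misaligned doubleword -- which can then be stored with an aligned
// vst1.64. In C terms (little-endian, a sketch):
//
//     uint64_t d0 = (d2 >> 8) | (d3 << 56);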
#define RCOPY_UNALIGNED(offset) \
subs    r2,     $8                      ;\
sub     r1,     $8                      ;\
sub     ip,     $8                      ;\
mov     r3,     $-8                     ;\
vld1.32 {d2,d3}, [r1], r3               ;\
blt     1f                              ;\
0: vext.8 d0, d2, d3, $(offset)         ;\
vmov    d3,     d2                      ;\
vld1.32 {d2},   [r1], r3                ;\
subs    r2,     $8                      ;\
vst1.64 {d0},   [ip, :64], r3           ;\
bge     0b                              ;\
1: vext.8 d0, d2, d3, $(offset)         ;\
add     r1,     $8                      ;\
vst1.64 {d0},   [ip, :64]               ;\
add     r1,     $(offset)               ;\
b       L_scalarReverseCopy
L_reverseAligned1:
RCOPY_UNALIGNED(1)
L_reverseAligned2:
RCOPY_UNALIGNED(2)
L_reverseAligned3:
RCOPY_UNALIGNED(3)
/*****************************************************************************
* front to back copy *
*****************************************************************************/
L_copyFrontToBack:
mov ip, r0 // copy destination pointer.
subs r2, $8 // if length - 8 is negative (i.e. length
blt L_scalarCopy // is less than 8), jump to cleanup path.
tst ip, $7 // if the destination is doubleword
beq L_vectorCopy // aligned, jump to fast path.
0: ldrb r3, [r1], $1 // load byte
sub r2, $1 // decrement length
strb r3, [ip], $1 // store byte
tst ip, $7 // test alignment
bne 0b
cmp r2, $0 // if length - 8 is negative,
blt L_scalarCopy // jump to the cleanup code
/*****************************************************************************
* destination is doubleword aligned *
*****************************************************************************/
L_vectorCopy:
ands r3, r1, $3 // Extract the alignment of the source
bic r1, $3
tbh [pc, r3, lsl $1] // Dispatch table on source alignment
0:
.short (L_sourceAligned0-0b)/2 // The NEON alignment hardware does not work
.short (L_sourceAligned1-0b)/2 // properly with sub 4-byte alignment and
.short (L_sourceAligned2-0b)/2 // buffers that are uncacheable, so we need
.short (L_sourceAligned3-0b)/2 // to have a software workaround.
/*****************************************************************************
* source is also at least word aligned *
*****************************************************************************/
L_sourceAligned0:
subs r2, $0x38 // If (length - 64) < 0
blt L_vectorCleanup // jump to cleanup code
tst ip, $0x38 // If destination is 64 byte aligned
beq L_cachelineAligned // jump to main loop
0: vld1.32 {d0}, [r1]! // Copy one double word at a time until
sub r2, $8 // the destination is 64-byte aligned.
vst1.64 {d0}, [ip, :64]! //
tst ip, $0x38 //
bne 0b //
cmp r2, $0 // If (length - 64) < 0, goto cleanup
blt L_vectorCleanup //
L_cachelineAligned:
sub r3, r2, $0x3c0 // If 1024 < length < 32768, use core
cmp r3, $0x7c00 // register copies instead of NEON to
blo L_useSTMIA // control energy usage.
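// (Same one-branch range test as in the reverse copy: r3 = remaining - 1024,
// and blo is taken when 1024 <= remaining < 32768.)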
tst r1, $0x1f // If source has 32-byte alignment, use
beq L_sourceAligned32 // an optimized loop.
vld1.32 {q2,q3}, [r1]! // This is the most common path for small
vld1.32 {q0,q1}, [r1]! // copies, which are alarmingly frequent.
subs r2, #0x40 // It requires 4-byte alignment on the
vst1.64 {q2,q3}, [ip, :256]! // source. For ordinary malloc'd buffers,
blt 1f // this path could handle only single-byte
.align 3 // alignment at speed by using vld1.8
0: vld1.32 {q2,q3}, [r1]! // instead of vld1.32. However, the NEON
vst1.64 {q0,q1}, [ip, :256]! // alignment hardware misbehaves on some
vld1.32 {q0,q1}, [r1]! // special copies if the element size is
subs r2, #0x40 // 8 or 16, so we need to work around
vst1.64 {q2,q3}, [ip, :256]! // sub 4-byte alignment in software, in
bge 0b // another code path.
b 1f
L_sourceAligned32:
vld1.64 {q2,q3}, [r1, :256]! // When the source shares 32-byte alignment
vld1.64 {q0,q1}, [r1, :256]! // with the destination, we use this loop
subs r2, #0x40 // instead, which specifies the maximum
vst1.64 {q2,q3}, [ip, :256]! // :256 alignment on all loads and stores.
blt 1f //
.align 3 // This gets an additional .5 bytes per
0: vld1.64 {q2,q3}, [r1, :256]! // cycle for in-cache copies, which is not
vst1.64 {q0,q1}, [ip, :256]! // insignificant for this (rather common)
vld1.64 {q0,q1}, [r1, :256]! // case.
subs r2, #0x40 //
vst1.64 {q2,q3}, [ip, :256]! // This is identical to the above loop,
bge 0b // except for the additional alignment.
1: vst1.64 {q0,q1}, [ip, :256]! //
L_vectorCleanup:
adds r2, $0x38 // If (length - 8) < 0, goto scalar cleanup
blt L_scalarCopy //
0: vld1.32 {d0}, [r1]! // Copy one doubleword at a time until
subs r2, $8 // (length - 8) < 0.
vst1.64 {d0}, [ip, :64]! //
bge 0b //
/*****************************************************************************
* sub-doubleword cleanup copies *
*****************************************************************************/
L_scalarCopy:
adds r2, #0x8 // restore length
it eq // if this is zero
bxeq lr // early out
0: ldrb r3, [r1], #1 // load a byte from source
strb r3, [ip], #1 // store to destination
subs r2, #1 // subtract one from length
bne 0b // if non-zero, repeat
bx lr // return
/*****************************************************************************
* STMIA loop for 1k-32k buffers *
*****************************************************************************/
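// Mirror image of the STMDB loop above: 64 bytes per iteration through
// r3-r8,r10,r11, prefetching one cacheline ahead of the ascending source.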
L_useSTMIA:
push {r4-r8,r10,r11}
.align 3
0: ldmia r1!, {r3-r8,r10,r11}
subs r2, r2, #64
stmia ip!, {r3-r8,r10,r11}
ldmia r1!, {r3-r8,r10,r11}
pld [r1, #64]
stmia ip!, {r3-r8,r10,r11}
bge 0b
pop {r4-r8,r10,r11}
b L_vectorCleanup
/*****************************************************************************
* Misaligned vld1 loop                                                      *
*****************************************************************************/
// Software alignment fixup to handle source and dest that are relatively
// misaligned mod 4 bytes. Load two 4-byte aligned double words from source,
// use vext.8 to extract a double word to store, and perform an 8-byte aligned
// store to destination.
#define COPY_UNALIGNED(offset) \
subs    r2,     $8                      ;\
vld1.32 {d2,d3}, [r1]!                  ;\
blt     1f                              ;\
0: vext.8 d0, d2, d3, $(offset)         ;\
vmov    d2,     d3                      ;\
vld1.32 {d3},   [r1]!                   ;\
subs    r2,     $8                      ;\
vst1.64 {d0},   [ip, :64]!              ;\
bge     0b                              ;\
1: vext.8 d0, d2, d3, $(offset)         ;\
sub     r1,     $8                      ;\
vst1.64 {d0},   [ip, :64]!              ;\
2: add  r1,     $(offset)               ;\
b       L_scalarCopy
L_sourceAligned1:
COPY_UNALIGNED(1)
L_sourceAligned2:
COPY_UNALIGNED(2)
L_sourceAligned3:
COPY_UNALIGNED(3)