/*
 * Copyright (c) 1992-2001 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <architecture/ppc/asm_help.h>

// =================================================================================================
// *** The easiest way to assemble things on Mac OS X is via "cc", so this uses #defines and such.
// =================================================================================================

// Keep track of whether we have Altivec
// This gets set in pthread_init()

        .data
        .align  2
        .globl  __cpu_has_altivec
__cpu_has_altivec:
        .long   0

        .text
        .align  2
        .globl  _bcopy
        .globl  _memcpy
        .globl  _memmove

_bcopy:
        mr      r2,r4                   // Since bcopy uses (src,dest,count), swap r3,r4
        mr      r4,r3
        mr      r3,r2
_memcpy:
_memmove:
        mr      r2,r3                   // Store dest ptr in r2 to preserve r3 on return

// ------------------
// Standard registers

#define rs      r4
#define rd      r2
#define rc      r5

// Should we bother using Altivec?

        cmpwi   r5, 128
        blt+    LScalar

// Determine whether we have Altivec enabled

        mflr    r0
        bcl     20,31,1f
1:
        mflr    r6
        mtlr    r0
        addis   r6, r6, ha16(__cpu_has_altivec - 1b)
        lwz     r6, lo16(__cpu_has_altivec - 1b)(r6)
        cmpwi   r6, 0
        bne+    LAltivec

// =================================================================================================

// *********************************************
// * S c a l a r   B l o c k M o o f D a t a   *
// *********************************************
//
// This is the scalar (non-AltiVec) version of BlockMoofData.
//
// void ScalarBlockMoofData (ptr sou, ptr dest, long len)
// void ScalarBlockMoofDataUncached (ptr sou, ptr dest, long len)
//
//
// Calling Sequence:    r3 = source pointer
//                      r4 = destination pointer
//                      r5 = length in bytes
//
// Uses: all volatile registers.

LScalar:
        cmplwi  cr7,rc,32               // length <= 32 bytes?
        cmplw   cr6,rd,rs               // up or down?
        mr.     r0,rc                   // copy to r0 for MoveShort, and test for negative
        bgt     cr7,Lbm1                // skip if count > 32

// Handle short moves (<=32 bytes.)

        beq     cr7,LMove32             // special case 32-byte blocks
        blt     cr6,LMoveDownShort      // move down in memory and return
        add     rs,rs,rc                // moving up (right-to-left), so adjust pointers
        add     rd,rd,rc
        b       LMoveUpShort            // move up in memory and return

// Handle long moves (>32 bytes.)

Lbm1:
        beqlr   cr6                     // rs==rd, so nothing to move
        bltlr   cr0                     // length<0, so ignore call and return
        mflr    r12                     // save return address
        bge     cr6,Lbm2                // rd>=rs, so move up

// Long moves down (left-to-right.)

        neg     r6,rd                   // start to 32-byte-align destination
        andi.   r0,r6,0x1F              // r0 <- bytes to move to align destination
        bnel    LMoveDownShort          // align destination if necessary
        bl      LMoveDownLong           // move 32-byte chunks down
        andi.   r0,rc,0x1F              // done?
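                                        // (the low 5 bits of rc are the 0..31 trailing
                                        //  bytes that LMoveDownLong leaves unmoved)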
        mtlr    r12                     // restore caller's return address
        bne     LMoveDownShort          // move trailing leftover bytes and done
        blr                             // no leftovers, so done

// Long moves up (right-to-left.)

Lbm2:
        add     rs,rs,rc                // moving up (right-to-left), so adjust pointers
        add     rd,rd,rc
        andi.   r0,rd,0x1F              // r0 <- bytes to move to align destination
        bnel    LMoveUpShort            // align destination if necessary
        bl      LMoveUpLong             // move 32-byte chunks up
        andi.   r0,rc,0x1F              // done?
        mtlr    r12                     // restore caller's return address
        bne     LMoveUpShort            // move trailing leftover bytes and done
        blr                             // no leftovers, so done


// ***************
// * M O V E 3 2 *
// ***************
//
// Special case subroutine to move a 32-byte block.  MoveDownShort and
// MoveUpShort only handle 0..31 bytes, and we believe 32 bytes is too
// common a case to send it through the general purpose long-block code.
// Since it moves both up and down, we must load all 32 bytes before
// storing any.
//
// Calling Sequence:    rs = source ptr
//                      rd = destination ptr
//
// Uses: r0,r5-r11.
//

LMove32:
        lwz     r0,0(rs)
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        stw     r10,24(rd)
        stw     r11,28(rd)
        blr


// *************************
// * M o v e U p S h o r t *
// *************************
//
// Subroutine called to move <32 bytes up in memory (ie, right-to-left).
//
// Entry conditions:    rs = last byte moved from source (right-to-left)
//                      rd = last byte moved into destination
//                      r0 = #bytes to move (0..31)
//
// Exit conditions:     rs = updated source ptr
//                      rd = updated destination ptr
//                      rc = decremented by #bytes moved
//
// Uses: r0,r6,r7,r8,cr7.
//

LMoveUpShort:
        andi.   r6,r0,0x10              // test 0x10 bit in length
        mtcrf   0x1,r0                  // move count to cr7 so we can test bits
        sub     rc,rc,r0                // decrement count of bytes remaining to be moved
        beq     Lmus1                   // skip if 0x10 bit in length is 0

        lwzu    r0,-16(rs)              // set, so copy up 16 bytes
        lwz     r6,4(rs)
        lwz     r7,8(rs)
        lwz     r8,12(rs)
        stwu    r0,-16(rd)
        stw     r6,4(rd)
        stw     r7,8(rd)
        stw     r8,12(rd)

Lmus1:
        bf      28,Lmus2                // test 0x08 bit
        lwzu    r0,-8(rs)
        lwz     r6,4(rs)
        stwu    r0,-8(rd)
        stw     r6,4(rd)

Lmus2:
        bf      29,Lmus3                // test 0x4 bit
        lwzu    r0,-4(rs)
        stwu    r0,-4(rd)

Lmus3:
        bf      30,Lmus4                // test 0x2 bit
        lhzu    r0,-2(rs)
        sthu    r0,-2(rd)

Lmus4:
        bflr    31                      // test 0x1 bit, return if 0
        lbzu    r0,-1(rs)
        stbu    r0,-1(rd)
        blr


// *****************************
// * M o v e D o w n S h o r t *
// *****************************
//
// Subroutine called to move <32 bytes down in memory (ie, left-to-right).
//
// Entry conditions:    rs = source pointer
//                      rd = destination pointer
//                      r0 = #bytes to move (0..31)
//
// Exit conditions:     rs = ptr to 1st byte not moved
//                      rd = ptr to 1st byte not moved
//                      rc = decremented by #bytes moved
//
// Uses: r0,r6,r7,r8,cr7.
//

LMoveDownShort:
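// Like LMoveUpShort above, this decomposes the 0..31 residual count into its
// 16/8/4/2/1 bits: mtcrf parks the count in cr7, and each bf/bflr tests one
// bit, so every short length is handled without a loop.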
        andi.   r6,r0,0x10              // test 0x10 bit in length
        mtcrf   0x1,r0                  // move count to cr7 so we can test bits
        sub     rc,rc,r0                // decrement count of bytes remaining to be moved
        beq     Lmds1                   // skip if 0x10 bit in length is 0

        lwz     r0,0(rs)                // set, so copy up 16 bytes
        lwz     r6,4(rs)
        lwz     r7,8(rs)
        lwz     r8,12(rs)
        addi    rs,rs,16
        stw     r0,0(rd)
        stw     r6,4(rd)
        stw     r7,8(rd)
        stw     r8,12(rd)
        addi    rd,rd,16

Lmds1:
        bf      28,Lmds2                // test 0x08 bit
        lwz     r0,0(rs)
        lwz     r6,4(rs)
        addi    rs,rs,8
        stw     r0,0(rd)
        stw     r6,4(rd)
        addi    rd,rd,8

Lmds2:
        bf      29,Lmds3                // test 0x4 bit
        lwz     r0,0(rs)
        addi    rs,rs,4
        stw     r0,0(rd)
        addi    rd,rd,4

Lmds3:
        bf      30,Lmds4                // test 0x2 bit
        lhz     r0,0(rs)
        addi    rs,rs,2
        sth     r0,0(rd)
        addi    rd,rd,2

Lmds4:
        bflr    31                      // test 0x1 bit, return if 0
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        blr


// ***********************
// * M o v e U p L o n g *
// ***********************
//
// Subroutine to move 32-byte chunks of memory up (ie, right-to-left.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.
//
// Entry conditions:    rs = last byte moved from source (right-to-left)
//                      rd = last byte moved into destination
//                      rc = count of bytes to move
//                      cr = crCached set iff destination is cacheable
//
// Exit conditions:     rs = updated source ptr
//                      rd = updated destination ptr
//                      rc = low order 8 bits of count of bytes to move
//
// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
//

LMoveUpLong:
        srwi.   r11,rc,5                // r11 <- #32 byte chunks to move
        mtctr   r11                     // prepare loop count
        beqlr                           // return if no chunks to move
        andi.   r0,rs,7                 // is source at least doubleword aligned?
        beq     Lmup3                   // yes, can optimize this case
        mtcrf   0x1,rc                  // save low bits of count
        mtcrf   0x2,rc                  // (one cr at a time, as 604 prefers)

Lmup1:                                  // loop over each 32-byte-chunk
        lwzu    r0,-32(rs)
        subi    rd,rd,32                // prepare destination address for 'dcbz'
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        stw     r10,24(rd)
        stw     r11,28(rd)
        bdnz    Lmup1

        mfcr    rc                      // restore low bits of count
        blr                             // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

Lmup3:
        lfdu    f0,-32(rs)
        subi    rd,rd,32                // prepare destination address for 'dcbz'
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        bdnz    Lmup3

        blr                             // return to caller


// ***************************
// * M o v e D o w n L o n g *
// ***************************
//
// Subroutine to move 32-byte chunks of memory down (ie, left-to-right.)
// The destination is known to be 32-byte aligned, but the source is
// *not* necessarily aligned.
//
// Entry conditions:    rs = source ptr (next byte to move)
//                      rd = dest ptr (next byte to move into)
//                      rc = count of bytes to move
//                      cr = crCached set iff destination is cacheable
//
// Exit conditions:     rs = updated source ptr
//                      rd = updated destination ptr
//                      rc = low order 8 bits of count of bytes to move
//
// Uses: r0,r5-r11,fr0-fr3,ctr,cr0,cr6,cr7.
//

LMoveDownLong:
        srwi.   r11,rc,5                // r11 <- #32 byte chunks to move
        mtctr   r11                     // prepare loop count
        beqlr                           // return if no chunks to move
        andi.   r0,rs,7                 // is source at least doubleword aligned?
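                                        // (if so, Lmdown3 below uses the d.p. float
                                        //  registers to move 8 bytes per load)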
        beq     Lmdown3                 // yes, can optimize this case
        mtcrf   0x1,rc                  // save low 8 bits of count
        mtcrf   0x2,rc                  // (one cr at a time, as 604 prefers)

Lmdown1:                                // loop over each 32-byte-chunk
        lwz     r0,0(rs)
        lwz     r5,4(rs)
        lwz     r6,8(rs)
        lwz     r7,12(rs)
        lwz     r8,16(rs)
        lwz     r9,20(rs)
        lwz     r10,24(rs)
        lwz     r11,28(rs)
        stw     r0,0(rd)
        stw     r5,4(rd)
        stw     r6,8(rd)
        stw     r7,12(rd)
        stw     r8,16(rd)
        stw     r9,20(rd)
        addi    rs,rs,32
        stw     r10,24(rd)
        stw     r11,28(rd)
        addi    rd,rd,32
        bdnz    Lmdown1

        mfcr    rc                      // restore low bits of count
        blr                             // return to caller

// Aligned operands, so use d.p. floating point registers to move data.

Lmdown3:
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        lfd     f2,16(rs)
        lfd     f3,24(rs)
        addi    rs,rs,32
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        stfd    f2,16(rd)
        stfd    f3,24(rd)
        addi    rd,rd,32
        bdnz    Lmdown3

        blr                             // return to caller

//
// Register use conventions are as follows:
//
// r0  - temp
// r6  - copy of VMX SPR at entry
// r7  - temp
// r8  - constant -1 (also temp and a string op buffer)
// r9  - constant 16 or -17 (also temp and a string op buffer)
// r10 - constant 32 or -33 (also temp and a string op buffer)
// r11 - constant 48 or -49 (also temp and a string op buffer)
// r12 - chunk count ("c") in long moves
//
// v0 - vp - permute vector
// v1 - va - 1st quadword of source
// v2 - vb - 2nd quadword of source
// v3 - vc - 3rd quadword of source
// v4 - vd - 4th quadword of source
// v5 - vx - temp
// v6 - vy - temp
// v7 - vz - temp

#define vp      v0
#define va      v1
#define vb      v2
#define vc      v3
#define vd      v4
#define vx      v5
#define vy      v6
#define vz      v7

#define VRSave  256

// kShort should be the crossover point where the long algorithm is faster than the short.
// WARNING: kShort must be >= 64
// Yes, I know, we just checked rc >= 128 to get here...

#define kShort  128

LAltivec:
        cmpwi   cr1,rc,kShort           //(1) too short to bother using vector regs?
        sub.    r0,rd,rs                //(1) must move reverse if (rd-rs)<rc
        dcbt    0,rs                    //(2) prefetch first source block
        cmplw   cr6,r0,rc               //(2) set cr6 blt iff we must move reverse
        beqlr-                          //(2) done if src==dest
        srawi.  r9,rc,4                 //(3) r9 <- quadwords to move, test for zero
        or      r8,rs,rd                //(3) start to check for word alignment
        dcbtst  0,rd                    //(4) prefetch first destination block
        rlwinm  r8,r8,0,30,31           //(4) r8 is zero if word aligned
        bgt-    cr1,LMoveLong           //(4) handle long operands
        cmpwi   cr1,r8,0                //(5) word aligned?
        rlwinm  r7,rc,0,28,31           //(5) r7 <- leftover bytes to move after quadwords
        bltlr-                          //(5) done if negative count
        blt-    cr6,LShortReverse       //(5) handle reverse moves
        cmpwi   cr7,r7,0                //(6) leftover bytes?
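                                        //    (cr7 is consumed below at Leftovers, once
                                        //     the quadword loop has drained)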
        beq-    Leftovers               //(6) r9==0, so no quadwords to move
        mtctr   r9                      //(7) set up for quadword loop
        bne-    cr1,LUnalignedLoop      //(7) not word aligned (less common than word aligned)

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                    S H O R T   O P E R A N D S                             <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

LAlignedLoop:                           // word aligned operands (the common case)
        lfd     f0,0(rs)                //(1)
        lfd     f1,8(rs)                //(2)
        addi    rs,rs,16                //(2)
        stfd    f0,0(rd)                //(3)
        stfd    f1,8(rd)                //(4)
        addi    rd,rd,16                //(4)
        bdnz    LAlignedLoop            //(4)

Leftovers:
        beqlr-  cr7                     //(8) done if r7==0, ie no leftover bytes
        mtxer   r7                      //(9) count of bytes to move (1-15)
        lswx    r8,0,rs
        stswx   r8,0,rd
        blr                             //(17)

LUnalignedLoop:                         // not word aligned, cannot use lfd/stfd
        lwz     r8,0(rs)                //(1)
        lwz     r9,4(rs)                //(2)
        lwz     r10,8(rs)               //(3)
        lwz     r11,12(rs)              //(4)
        addi    rs,rs,16                //(4)
        stw     r8,0(rd)                //(5)
        stw     r9,4(rd)                //(6)
        stw     r10,8(rd)               //(7)
        stw     r11,12(rd)              //(8)
        addi    rd,rd,16                //(8)
        bdnz    LUnalignedLoop          //(8)

        b       Leftovers

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            S H O R T   R E V E R S E   M O V E S                           <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr0 & r9 <- #quadwords to move (>=0)
// cr1 <- beq if word aligned
// r7  <- #leftover bytes to move (0-15)

LShortReverse:
        cmpwi   cr7,r7,0                // leftover bytes?
        add     rs,rs,rc                // point 1 past end of string for reverse moves
        add     rd,rd,rc
        beq-    LeftoversReverse        // r9==0, ie no quadwords to move
        mtctr   r9                      // set up for quadword loop
        bne-    cr1,LUnalignedLoopReverse

LAlignedLoopReverse:                    // word aligned, so use lfd/stfd
        lfd     f0,-8(rs)
        lfdu    f1,-16(rs)
        stfd    f0,-8(rd)
        stfdu   f1,-16(rd)
        bdnz    LAlignedLoopReverse

LeftoversReverse:
        beqlr-  cr7                     // done if r7==0, ie no leftover bytes
        mtxer   r7                      // count of bytes to move (1-15)
        neg     r7,r7                   // index back by #bytes
        lswx    r8,r7,rs
        stswx   r8,r7,rd
        blr

LUnalignedLoopReverse:                  // not word aligned, cannot use lfd/stfd
        lwz     r8,-4(rs)
        lwz     r9,-8(rs)
        lwz     r10,-12(rs)
        lwzu    r11,-16(rs)
        stw     r8,-4(rd)
        stw     r9,-8(rd)
        stw     r10,-12(rd)
        stwu    r11,-16(rd)
        bdnz    LUnalignedLoopReverse

        b       LeftoversReverse

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>                    L O N G   O P E R A N D S                               <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// cr6 set (blt) if must move reverse
// r0 <- (rd - rs)

LMoveLong:
        mfspr   r6,VRSave               //(5) save caller's VMX mask register
        stw     r6,-4(r1)               // use CR save area so we can use r6 later
        neg     r8,rd                   //(5) start to compute #bytes to fill in 1st dest quadword
        rlwinm  r0,r0,0,28,31           //(6) start to determine relative alignment
        andi.   r7,r8,0xF               //(6) r7 <- #bytes to fill in 1st dest quadword
        cmpwi   cr7,r0,0                //(7) relatively aligned? (ie, 16 bytes apart?)
        oris    r9,r6,0xFF00            //(7) light bits for regs we use (v0-v7)
        mtspr   VRSave,r9               //(8) update live register bitmask
        blt-    cr6,LongReverse         //(8) must move reverse direction
        sub     rc,rc,r7                //(9) adjust length while we wait
        beq-    LDest16Aligned          //(9) r7==0, ie destination already quadword aligned

// Align destination on a quadword.

        mtxer   r7                      //(10) set up byte count (1-15)
        lswx    r8,0,rs                 // load into r8-r11
        stswx   r8,0,rd                 // store r8-r11 (measured latency on arthur is 7.2 cycles)
        add     rd,rd,r7                //(18) adjust ptrs
        add     rs,rs,r7                //(18)

// Begin preparation for inner loop and "dst" stream.

LDest16Aligned:
        andi.   r0,rd,0x10              //(19) is destination cache-block aligned?
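                                        // (rd is already quadword aligned here, so the
                                        //  0x10 bit alone decides whether one more
                                        //  quadword store will 32-byte align it)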
        li      r9,16                   //(19) r9 <- constant used to access 2nd quadword
        li      r10,32                  //(20) r10<- constant used to access 3rd quadword
        beq-    cr7,LAligned            //(20) handle relatively aligned operands
        lvx     va,0,rs                 //(20) prefetch 1st source quadword
        li      r11,48                  //(21) r11<- constant used to access 4th quadword
        lvsl    vp,0,rs                 //(21) get permute vector to left shift
        beq     LDest32Aligned          //(22) destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        lvx     vb,r9,rs                //(23) get 2nd source qw
        subi    rc,rc,16                //(23) adjust count
        addi    rs,rs,16                //(24) adjust source ptr
        vperm   vx,va,vb,vp             //(25) vx <- 1st destination qw
        vor     va,vb,vb                //(25) va <- vb
        stvx    vx,0,rd                 //(26) assuming store Q deep enough to avoid latency
        addi    rd,rd,16                //(26) adjust dest ptr

// Destination 32-byte aligned, source alignment unknown.

LDest32Aligned:
        srwi.   r12,rc,6                //(27) r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31          //(27) r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0                //(28) remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31          //(29) mask chunk count down to 0-63
        subi    r0,r8,1                 //(30) r8==0?
        beq-    LNoChunks               //(30) r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25           //(31) if r8==0, then r8 <- 64
        li      r0,64                   //(31) r0 <- used to get 1st quadword of next chunk
        sub.    r12,r12,r8              //(32) adjust chunk count, set cr0
        mtctr   r8                      //(32) set up loop count
        li      r8,96                   //SKP
        li      r6,128                  //SKP

// Inner loop for unaligned sources.  We copy 64 bytes per iteration.
// We loop at most 64 times, then reprime the "dst" and loop again for
// the next 4KB.  This loop is tuned to keep the CPU flat out, which
// means we need to execute a lvx or stvx every cycle.

LoopBy64:
        dcbt    rs,r8                   //SKP
        dcbt    rs,r6                   //SKP
        lvx     vb,r9,rs                //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs               //(2) 3rd
        lvx     vd,r11,rs               //(3) 4th
        vperm   vx,va,vb,vp             //(3) vx <- 1st destination quadword
        lvx     va,rs,r0                //(4) get 1st qw of next 64-byte chunk (r0 must be RB!)
        vperm   vy,vb,vc,vp             //(4) vy <- 2nd dest qw
        stvx    vx,0,rd                 //(5)
        vperm   vz,vc,vd,vp             //(5) vz <- 3rd dest qw
        stvx    vy,r9,rd                //(6)
        vperm   vx,vd,va,vp             //(6) vx <- 4th
        stvx    vz,r10,rd               //(7)
        addi    rs,rs,64                //(7)
        stvx    vx,r11,rd               //(8)
        addi    rd,rd,64                //(8)
        bdnz    LoopBy64                //(8)

// End of inner loop.  Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunks               // r12==0, ie no more chunks to move
        sub.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        mtctr   r0                      // initialize loop count to 64
        b       LoopBy64                // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LNoChunks:                              // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               //(33) rc <- leftover bytes
        beq-    cr1,LCleanup            //(33) r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                      //(34) we will loop over 1-3 QWs

LoopBy16:
        lvx     vb,r9,rs                //(1) vb <- 2nd source quadword
        addi    rs,rs,16                //(1)
        vperm   vx,va,vb,vp             //(3) vx <- next destination quadword
        vor     va,vb,vb                //(3) va <- vb
        stvx    vx,0,rd                 //(4) assuming store Q is deep enough to mask latency
        addi    rd,rd,16                //(4)
        bdnz    LoopBy16                //(4)

// Move remaining bytes in last quadword.  rc and cr0 have the count.
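// The string ops do the cleanup: mtxer puts the byte count (1-15) into XER,
// then a single lswx/stswx pair moves exactly that many bytes through r8-r11,
// regardless of alignment.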
LCleanup:
        lwz     r6,-4(r1)               // load VRSave from CR save area
        mtspr   VRSave,r6               //(35) restore caller's live-register bitmask
        beqlr                           //(36) rc==0, ie no leftovers, so done
        mtxer   rc                      //(37) load byte count (1-15)
        lswx    r8,0,rs
        stswx   r8,0,rd
        blr                             //(45)

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            L O N G   A L I G N E D   M O V E S                             <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// rs, rd <- both quadword aligned
// cr0 <- beq if dest is cache block (32-byte) aligned
// r9  <- 16
// r10 <- 32

LAligned:
        lvx     va,0,rs                 // prefetch 1st source quadword
        li      r11,48                  // r11<- constant used to access 4th quadword
        beq     LAligned32              // destination already cache-block aligned

// Copy 16 bytes to align destination on 32-byte (cache block) boundary
// to maximize store gathering.

        subi    rc,rc,16                // adjust count
        addi    rs,rs,16                // adjust source ptr
        stvx    va,0,rd                 // assuming store Q deep enough to avoid latency
        addi    rd,rd,16                // adjust dest ptr

// Destination 32-byte aligned, source 16-byte aligned.  Set up for inner loop.

LAligned32:
        srwi.   r12,rc,6                // r12<- count of 64-byte chunks to move
        rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
        cmpwi   cr1,r7,0                // remember if any 16-byte chunks
        rlwinm  r8,r12,0,26,31          // mask chunk count down to 0-63
        subi    r0,r8,1                 // r8==0?
        beq-    LAlignedNoChunks        // r12==0, ie no chunks to move
        rlwimi  r8,r0,0,25,25           // if r8==0, then r8 <- 64
        li      r0,64                   // r0 <- used at end of loop
        sub.    r12,r12,r8              // adjust chunk count, set cr0
        mtctr   r8                      // set up loop count
        li      r8,96                   //SKP
        li      r6,128                  //SKP

// Inner loop for aligned sources.  We copy 64 bytes per iteration.

LAlignedLoopBy64:
        dcbt    rs,r8                   //SKP
        dcbt    rs,r6                   //SKP
        lvx     va,0,rs                 //(1)
        lvx     vb,r9,rs                //(2)
        lvx     vc,r10,rs               //(3)
        lvx     vd,r11,rs               //(4)
        addi    rs,rs,64                //(4)
        stvx    va,0,rd                 //(5)
        stvx    vb,r9,rd                //(6)
        stvx    vc,r10,rd               //(7)
        stvx    vd,r11,rd               //(8)
        addi    rd,rd,64                //(8)
        bdnz    LAlignedLoopBy64        //(8)

// End of inner loop.  Loop again for next 4KB iff any.

        beq+    LAlignedNoChunks        // r12==0, ie no more chunks to move
        sub.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        mtctr   r0                      // reinitialize loop count to 64
        b       LAlignedLoopBy64        // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LAlignedNoChunks:                       // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               // rc <- leftover bytes
        beq-    cr1,LCleanup            // r7==0, ie fewer than 16 bytes remaining
        mtctr   r7                      // we will loop over 1-3 QWs

LAlignedLoopBy16:
        lvx     va,0,rs                 // get next quadword
        addi    rs,rs,16
        stvx    va,0,rd
        addi    rd,rd,16
        bdnz    LAlignedLoopBy16

        b       LCleanup                // handle last 0-15 bytes, if any

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>            L O N G   R E V E R S E   M O V E S                             <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Reverse moves.  These involve overlapping operands, with the source
// lower in memory (lower addresses) than the destination.  They must be
// done right-to-left, ie from high addresses down to low addresses.
// Throughout this code, we maintain rs and rd as pointers one byte past
// the end of the untransferred operands.
//
// The byte count is >=kShort and the following registers are already loaded:
//
//      r6  - VMX mask at entry
//      cr7 - beq if relatively aligned
//

LongReverse:
        add     rd,rd,rc                // update source/dest ptrs to be 1 byte past end
        add     rs,rs,rc
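                                        // (rs and rd now point one byte past the ends,
                                        //  per the convention described above)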
        andi.   r7,rd,0xF               // r7 <- #bytes needed to move to align destination
        sub     rc,rc,r7                // adjust length while we wait
        sub     rs,rs,r7                // adjust ptrs by #bytes to xfer, also while we wait
        sub     rd,rd,r7
        beq-    LDest16AlignedReverse

// Align destination on a quadword.  Note that we do NOT align on a cache
// block boundary for store gathering etc; since all these operands overlap,
// many dest cache blocks will already be in the L1, so it's not clear that
// this would be a win.

        mtxer   r7                      // load byte count
        lswx    r8,0,rs
        stswx   r8,0,rd

// Prepare for inner loop and start "dstst" stream.  Frankly, it's not
// clear whether "dst" or "dstst" would be better; somebody should
// measure.  We use "dstst" because, being overlapped, at least some
// source cache blocks will also be stored into.

LDest16AlignedReverse:
        srwi.   r12,rc,6                // r12 <- count of 64-byte chunks to move
        rlwinm  r0,rc,11,9,15           // position quadword count for dst
        rlwinm  r11,r12,0,26,31         // mask chunk count down to 0-63
        li      r9,-17                  // r9 <- constant used to access 2nd quadword
        oris    r0,r0,0x0100            // set dst block size to 1 qw
        li      r10,-33                 // r10<- constant used to access 3rd quadword
        ori     r0,r0,0xFFE0            // set dst stride to -16 bytes
        li      r8,-1                   // r8 <- constant used to access 1st quadword
        dstst   rs,r0,3                 // start stream 3
        subi    r0,r11,1                // r11==0 ?
        lvx     va,r8,rs                // prefetch 1st source quadword
        rlwinm  r7,rc,28,30,31          // r7 <- count of 16-byte chunks to move
        lvsl    vp,0,rs                 // get permute vector to right shift
        cmpwi   cr1,r7,0                // remember if any 16-byte chunks
        beq-    LNoChunksReverse        // r12==0, so skip inner loop
        rlwimi  r11,r0,0,25,25          // if r11==0, then r11 <- 64
        sub.    r12,r12,r11             // adjust chunk count, set cr0
        mtctr   r11                     // set up loop count
        li      r11,-49                 // r11<- constant used to access 4th quadword
        li      r0,-64                  // r0 <- used for several purposes
        beq-    cr7,LAlignedLoopBy64Reverse

// Inner loop for unaligned sources.  We copy 64 bytes per iteration.

LoopBy64Reverse:
        lvx     vb,r9,rs                //(1) 2nd source quadword (1st already in va)
        lvx     vc,r10,rs               //(2) 3rd quadword
        lvx     vd,r11,rs               //(3) 4th
        vperm   vx,vb,va,vp             //(3) vx <- 1st destination quadword
        lvx     va,rs,r0                //(4) get 1st qw of next 64-byte chunk (note r0 must be RB)
        vperm   vy,vc,vb,vp             //(4) vy <- 2nd dest qw
        stvx    vx,r8,rd                //(5)
        vperm   vz,vd,vc,vp             //(5) vz <- 3rd destination quadword
        stvx    vy,r9,rd                //(6)
        vperm   vx,va,vd,vp             //(6) vx <- 4th qw
        stvx    vz,r10,rd               //(7)
        subi    rs,rs,64                //(7)
        stvx    vx,r11,rd               //(8)
        subi    rd,rd,64                //(8)
        bdnz    LoopBy64Reverse         //(8)

// End of inner loop.  Should we reprime dst stream and restart loop?
// This block is only executed when we're moving more than 4KB.
// It is usually folded out because cr0 is set in the loop prologue.

        beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
        lis     r8,0x0440               // dst control: 64 4-qw blocks
        add.    r12,r12,r0              // set cr0 if more than 4KB remain to xfer
        ori     r8,r8,0xFFC0            // stride is -64 bytes
        dstst   rs,r8,3                 // restart the prefetch stream
        li      r8,64                   // inner loop count
        mtctr   r8                      // initialize loop count to 64
        li      r8,-1                   // restore qw1 offset for inner loop
        b       LoopBy64Reverse         // restart inner loop, xfer another 4KB

// Fewer than 64 bytes remain to be moved.

LNoChunksReverse:                       // r7 and cr1 are set with the number of QWs
        andi.   rc,rc,0xF               // rc <- leftover bytes
        beq-    cr1,LCleanupReverse     // r7==0, ie fewer than 16 bytes left
        mtctr   r7
        beq-    cr7,LAlignedLoopBy16Reverse

LoopBy16Reverse:
        lvx     vb,r9,rs                // vb <- 2nd source quadword
        subi    rs,rs,16
        vperm   vx,vb,va,vp             // vx <- next destination quadword
        vor     va,vb,vb                // va <- vb
        stvx    vx,r8,rd
        subi    rd,rd,16
        bdnz    LoopBy16Reverse

// Fewer than 16 bytes remain to be moved.
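// As in the forward cleanup, the string ops move the leftovers; here they are
// indexed back by -rc because rs and rd point one byte past the remaining bytes.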
LCleanupReverse:                        // rc and cr0 set with remaining byte count
        lwz     r6,-4(r1)               // load VRSave from CR save area
        mtspr   VRSave,r6               // restore caller's live-register bitmask
        beqlr                           // rc==0, ie no leftovers, so done
        neg     r7,rc                   // get -(#bytes)
        mtxer   rc                      // byte count
        lswx    r8,r7,rs
        stswx   r8,r7,rd
        blr

// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
// <><>    A L I G N E D   L O N G   R E V E R S E   M O V E S                     <><>
// <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>

// Inner loop.  We copy 64 bytes per iteration.

LAlignedLoopBy64Reverse:
        lvx     va,r8,rs                //(1)
        lvx     vb,r9,rs                //(2)
        lvx     vc,r10,rs               //(3)
        lvx     vd,r11,rs               //(4)
        subi    rs,rs,64                //(4)
        stvx    va,r8,rd                //(5)
        stvx    vb,r9,rd                //(6)
        stvx    vc,r10,rd               //(7)
        stvx    vd,r11,rd               //(8)
        subi    rd,rd,64                //(8)
        bdnz    LAlignedLoopBy64Reverse //(8)

// End of inner loop.  Loop for next 4KB iff any.

        beq+    LNoChunksReverse        // r12==0, ie no more chunks to move
        lis     r8,0x0440               // dst control: 64 4-qw blocks
        add.    r12,r12,r0              // r12 <- r12 - 64, set cr0
        ori     r8,r8,0xFFC0            // stride is -64 bytes
        dstst   rs,r8,3                 // restart the prefetch stream
        li      r8,64                   // inner loop count
        mtctr   r8                      // initialize loop count to 64
        li      r8,-1                   // restore qw1 offset for inner loop
        b       LAlignedLoopBy64Reverse

// Loop to copy leftover quadwords (1-3).

LAlignedLoopBy16Reverse:
        lvx     va,r8,rs                // get next qw
        subi    rs,rs,16
        stvx    va,r8,rd
        subi    rd,rd,16
        bdnz    LAlignedLoopBy16Reverse

        b       LCleanupReverse         // handle up to 15 bytes in last qw