;			bcopy.s

;			Copy bytes of data around. handles overlapped data.
;			Change this to use Altivec later on

; void bcopy(from, to, nbytes)

;			Use CR5_lt to indicate non-cached
#define noncache	20
.align 2
#if !defined(MEMCPY) && !defined(MEMMOVE)
;			bcopy entry point.
;			In:  R3 = from (source), R4 = to (sink), R5 = nbytes
;			Returns at once when "to" equals "from" or when nbytes is 0.
;			Otherwise the source pointer is moved to R6 and control joins
;			the common copy code at Lcopyit.
.globl _bcopy
_bcopy:
			crclr		noncache					; Set cached
			cmplw		cr1,r4,r3					; Compare "to" and "from"
			mr.		r5,r5						; Check if we have a 0 length
			mr		r6,r3						; Set source
			beqlr-		cr1						; Bail if "to" and "from" are the same
			beqlr-								; Bail if length is 0
			b		Lcopyit						; Go copy it...
#endif

;			When we move the memory, forward overlays must be handled.  We
;			also can not use the cache instructions if we are from bcopy_nc.
;			We need to preserve R3 because it needs to be returned for memcpy.
;			We can be interrupted and lose control here.
;			There is no stack, so in order to use floating point, we would
;			need to take the FP exception. Any potential gains by using FP
;			would be more than eaten up by this.
;			Later, we should use Altivec for large moves.

;			memcpy and memmove entry point (both names share one body).
;			In:  R3 = to (sink), R4 = from (source), R5 = nbytes
;			R3 is never written, so it still holds "to" on return.
;			The arguments are shuffled into the layout the common code
;			expects (R4 = sink, R6 = source) and execution falls through
;			into Lcopyit.
#if defined(MEMCPY)
.globl _memcpy
_memcpy:
#endif

#if defined(MEMMOVE)
.globl _memmove
_memmove:
#endif
			cmplw		cr1,r3,r4					; "to" and "from" the same?
			mr		r6,r4						; Set the "from"
			mr.		r5,r5						; Length zero?
			crclr		noncache					; Set cached
			mr		r4,r3						; Set the "to"
			beqlr-		cr1						; "to" and "from" are the same
			beqlr-								; Length is 0
;			Common copy setup shared by all entry points.
;			In:  R4 = sink, R5 = length (the entry code returns on 0), R6 = source
;			Out: R8 = limit mask for the alignment move (all bits below the
;			     highest set bit of the length); noncache (CR bit 20) is also
;			     set when source and sink are less than 32 bytes apart.
;			Branches to Lfwdovrlap when the sink starts inside the source
;			range; otherwise falls through to the ascending copy.
Lcopyit:		sub		r12,r4,r6					; R12 = sink - source (negative if backward move)
			lis		r8,0x7FFF					; Start up a mask
			srawi		r11,r12,31					; R11 = 0 or -1: the propagated sign of sink - source
			dcbt		0,r6						; Touch in the first source line
			cntlzw		r7,r5						; Count leading zeros of the length (locates its highest set bit)
			ori		r8,r8,0xFFFF					; Make limit 0x7FFFFFFF
			xor		r9,r12,r11					; If sink - source was negative, invert bits
			srw		r8,r8,r7					; R8 = mask of the bits below the highest set bit of the length
			sub		r9,r9,r11					; If sink - source was negative, add 1 and get absolute value
			cmplw		r12,r5						; Unsigned (sink - source) < length means we forward overlap
			cmplwi		cr7,r9,32					; See if at least a line between source and sink
			dcbtst		0,r4						; Touch in the first sink line
			cmplwi		cr1,r5,32					; Are we moving more than a line? NOTE(review): CR1 is not tested in the code visible here
                        cror    	noncache,noncache,28				; Bit 28 is CR7 lt: do not DCBZ output line if not enough space
			blt-		Lfwdovrlap					; This is a forward overlapping area, handle it...

;			R4 = sink
;			R5 = length
;			R6 = source
;			Here we figure out how much we have to move to get the sink onto a
;			cache boundary.  If we can, and there are still more than 32 bytes
;			left to move, we can really speed things up by DCBZing the sink line.
;			We can not do this if noncache is set because we will take an
;			alignment exception.

			neg		r0,r4						; Negate the sink address: its low 5 bits are the bytes needed to reach a line boundary
			rlwinm.		r0,r0,0,27,31					; Keep only the low 5 bits (0-31) and test them in CR0
			and		r0,r0,r8					; limit to the maximum front end move
;			The mtcrf below copies the low byte of R0 into CR6 and CR7, so CR
;			bits 27,28,29,30,31 select the 16,8,4,2,1 byte pieces moved below.
			mtcrf		3,r0						; Make branch mask for partial moves
			sub		r5,r5,r0					; Set the length left to move
			beq		Lalline						; Already on a line (CR0 is still the rlwinm. result)...
			bf		31,Lalhalf					; No single byte to do...
			lbz		r7,0(r6)					; Get the byte
			addi		r6,r6,1						; Point to the next
			stb		r7,0(r4)					; Save the single
			addi		r4,r4,1						; Bump sink
;			Sink is halfword aligned here

Lalhalf:		bf		30,Lalword					; No halfword to do...
			lhz		r7,0(r6)					; Get the halfword
			addi		r6,r6,2						; Point to the next
			sth		r7,0(r4)					; Save the halfword
			addi		r4,r4,2						; Bump sink
;			Sink is word aligned here

Lalword:		bf		29,Laldouble					; No word to do...
			lwz		r7,0(r6)					; Get the word
			addi		r6,r6,4						; Point to the next
			stw		r7,0(r4)					; Save the word
			addi		r4,r4,4						; Bump sink
;			Sink is double aligned here

Laldouble:		bf		28,Lalquad					; No double to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word (R8 is free: the limit mask was already applied)
			addi		r6,r6,8						; Point to the next
			stw		r7,0(r4)					; Save the first word
			stw		r8,4(r4)					; Save the second word
			addi		r4,r4,8						; Bump sink
;			Sink is quadword aligned here

Lalquad:       		bf		27,Lalline					; No quad to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			lwz		r9,8(r6)					; Get the third word
			stw		r7,0(r4)					; Save the first word
			lwz		r11,12(r6)					; Get the fourth word
			addi		r6,r6,16					; Point to the next
			stw		r8,4(r4)					; Save the second word
			stw		r9,8(r4)					; Save the third word
			stw		r11,12(r4)					; Save the fourth word
			addi		r4,r4,16					; Bump sink
;			Sink is line aligned here

;			Main ascending loop: moves one 32 byte line per iteration.
;			In:  R4 = sink, R5 = bytes left to move, R6 = source
;			The low bits of R5 are copied into the CR first, so the tail code
;			at Lbackend can pick the 16,8,4,2,1 byte pieces with CR bits 27-31.
Lalline:       		rlwinm.		r0,r5,27,5,31					; R0 = R5 shifted right by 5: the number of full lines to move
			mtcrf		3,r5						; Make branch mask for backend partial moves
			rlwinm		r11,r5,0,0,26					; R11 = bytes covered by full lines (R5 with the low 5 bits cleared)
			beq-		Lbackend       					; No full lines to move
			sub		r5,r5,r11					; Calculate the residual
                        li              r10,96                                          ; Stride for touch ahead

Lnxtline:		subic.		r0,r0,1						; Account for the line now (CR0 is tested by the bgt+ at the bottom)

			bt-		noncache,Lskipz					; Skip if we are not cached...
			dcbz		0,r4						; Blow away the whole line because we are replacing it
                        dcbt		r6,r10                                          ; Touch ahead a bit (source + 96)

Lskipz:			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			lwz		r9,8(r6)					; Get the third word
			stw		r7,0(r4)					; Save the first word
			lwz		r11,12(r6)					; Get the fourth word
			stw		r8,4(r4)					; Save the second word
			lwz		r7,16(r6)					; Get the fifth word
			stw		r9,8(r4)					; Save the third word
			lwz		r8,20(r6)					; Get the sixth word
			stw		r11,12(r4)					; Save the fourth word
			lwz		r9,24(r6)					; Get the seventh word
			stw		r7,16(r4)					; Save the fifth word
			lwz		r11,28(r6)					; Get the eighth word
			addi		r6,r6,32					; Point to the next
			stw		r8,20(r4)					; Save the sixth word
			stw		r9,24(r4)					; Save the seventh word
			stw		r11,28(r4)					; Save the eighth word
			addi		r4,r4,32					; Bump sink
			bgt+		Lnxtline					; Do the next line, if any...

;			Move backend quadword

;			Ascending tail: moves the last 0-31 bytes, largest piece first.
;			In:  R4 = sink, R6 = source, CR bits 27,28,29,30,31 set for a
;			     pending 16,8,4,2,1 byte piece (loaded by mtcrf at Lalline).
;			Returns to the caller when done.
Lbackend:		bf		27,Lnoquad					; No quad to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			lwz		r9,8(r6)					; Get the third word
			lwz		r11,12(r6)					; Get the fourth word
			stw		r7,0(r4)					; Save the first word
			addi		r6,r6,16					; Point to the next
			stw		r8,4(r4)					; Save the second word
			stw		r9,8(r4)					; Save the third word
			stw		r11,12(r4)					; Save the fourth word
			addi		r4,r4,16					; Bump sink
;			Move backend double

Lnoquad:		bf		28,Lnodouble					; No double to do...
			lwz		r7,0(r6)					; Get the first word
			lwz		r8,4(r6)					; Get the second word
			addi		r6,r6,8						; Point to the next
			stw		r7,0(r4)					; Save the first word
			stw		r8,4(r4)					; Save the second word
			addi		r4,r4,8						; Bump sink
;			Move backend word

Lnodouble:		bf		29,Lnoword					; No word to do...
			lwz		r7,0(r6)					; Get the word
			addi		r6,r6,4						; Point to the next
			stw		r7,0(r4)					; Save the word
			addi		r4,r4,4						; Bump sink
;			Move backend halfword

Lnoword:       		bf		30,Lnohalf					; No halfword to do...
			lhz		r7,0(r6)					; Get the halfword
			addi		r6,r6,2						; Point to the next
			sth		r7,0(r4)					; Save the halfword
			addi		r4,r4,2						; Bump sink

;			Move backend byte

Lnohalf:       		bflr		31						; Return if there is no final byte to move
			lbz		r7,0(r6)					; Get the byte
			stb		r7,0(r4)					; Save the single
			blr								; Leave because we are all done...

;			0123456789ABCDEF0123456789ABCDEF
;			 0123456789ABCDEF0123456789ABCDEF
;										    F
;										  DE
;									  9ABC
;							  12345678
;             123456789ABCDEF0	
;            0

;			Here is where we handle a forward overlapping move.  These will be slow
;			because we can not kill the cache of the destination until after we have
;			loaded/saved the source area.  Reading memory backwards is also slower
;			when the cache line needs to be loaded: the critical doubleword (here,
;			the last one in the line) is loaded first, then the fill goes back to
;			the first doubleword and continues in order.  That means that when we
;			are at the second to last DW we have to wait until the whole line is in
;			cache before we can proceed.
;			Descending copy, front end: used when the sink starts inside the
;			source range, so bytes must be moved from the top down.
;			In:  R4 = sink, R5 = length, R6 = source, R8 = limit mask from Lcopyit
;			R4 and R6 are advanced to one past the last byte, and then up to
;			31 bytes are moved so that the end of the sink reaches a line boundary.
Lfwdovrlap:		add		r4,r5,r4					; Point past the last sink byte
			add		r6,r5,r6					; Point past the last source byte 
			and		r0,r4,r8					; Apply movement limit to the end-of-sink address
			li		r12,-1						; Offset of -1 so the touches hit the line holding the last byte
			mtcrf		3,r0						; Low byte of R0 into CR6 and CR7: bits 27-31 pick the 16,8,4,2,1 byte pieces
			dcbt		r12,r6						; Touch in the last line of source
			rlwinm.		r0,r0,0,27,31					; Calculate the length to adjust to cache boundary
			dcbtst		r12,r4						; Touch in the last line of the sink
			beq-		Lballine						; Already on cache line boundary
			sub		r5,r5,r0					; Precalculate move length left after alignment
			bf		31,Lbalhalf					; No single byte to do...
			lbz		r7,-1(r6)					; Get the byte
			subi		r6,r6,1						; Point to the next
			stb		r7,-1(r4)					; Save the single
			subi		r4,r4,1						; Bump sink
;			Sink is halfword aligned here

Lbalhalf:		bf		30,Lbalword					; No halfword to do...
			lhz		r7,-2(r6)					; Get the halfword
			subi		r6,r6,2						; Point to the next
			sth		r7,-2(r4)					; Save the halfword
			subi		r4,r4,2						; Bump sink
;			Sink is word aligned here

Lbalword:		bf		29,Lbaldouble					; No word to do...
			lwz		r7,-4(r6)					; Get the word
			subi		r6,r6,4						; Point to the next
			stw		r7,-4(r4)					; Save the word
			subi		r4,r4,4						; Bump sink
;			Sink is double aligned here

Lbaldouble:		bf		28,Lbalquad					; No double to do...
			lwz		r7,-8(r6)					; Get the first word
			lwz		r8,-4(r6)					; Get the second word (R8 is free: the limit mask was already applied)
			subi		r6,r6,8						; Point to the next
			stw		r7,-8(r4)					; Save the first word
			stw		r8,-4(r4)					; Save the second word
			subi		r4,r4,8						; Bump sink
;			Sink is quadword aligned here

Lbalquad:		bf		27,Lballine					; No quad to do...
			lwz		r7,-16(r6)					; Get the first word
			lwz		r8,-12(r6)					; Get the second word
			lwz		r9,-8(r6)					; Get the third word
			lwz		r11,-4(r6)					; Get the fourth word
			stw		r7,-16(r4)					; Save the first word
			subi		r6,r6,16					; Point to the next
			stw		r8,-12(r4)					; Save the second word
			stw		r9,-8(r4)					; Save the third word
			stw		r11,-4(r4)					; Save the fourth word
			subi		r4,r4,16					; Bump sink
;			Sink is line aligned here

;			Main descending loop: moves one 32 byte line per iteration, working
;			down from the end.  All eight words are loaded before any is stored.
;			In:  R4 = one past the next sink byte, R5 = bytes left,
;			     R6 = one past the next source byte
Lballine:		rlwinm.		r0,r5,27,5,31					; R0 = R5 shifted right by 5: the number of full lines to move
			mtcrf		3,r5						; Make branch mask for backend partial moves
			beq-		Lbbackend					; No full lines to move

;			Registers in use: 	R0, R1,     R3, R4, R5, R6
;       		Registers not in use:           R2,                 R7, R8, R9, R10, R11, R12
;			Eight scratch registers are needed. R5 serves as the eighth: its
;			low bits were already copied into the CR by the mtcrf above and
;			the line count lives in R0, so it can be overwritten.
Lbnxtline:		subic.		r0,r0,1						; Account for the line now (CR0 is tested by ble- and bgt+ below)

			lwz		r7,-32(r6)					; Get the first word
			lwz		r5,-28(r6)					; Get the second word
			lwz		r2,-24(r6)					; Get the third word
			lwz		r12,-20(r6)					; Get the fourth word
			lwz		r11,-16(r6)					; Get the fifth word
			lwz		r10,-12(r6)					; Get the sixth word
			lwz		r9,-8(r6)					; Get the seventh word
			lwz		r8,-4(r6)					; Get the eighth word
			subi		r6,r6,32					; Point to the next
			stw		r7,-32(r4)					; Save the first word
			ble-		Lbnotouch					; Last time, skip touch of source...
			dcbt		0,r6						; Touch in next source line
Lbnotouch:		stw		r5,-28(r4)					; Save the second word
			stw		r2,-24(r4)					; Save the third word
			stw		r12,-20(r4)					; Save the fourth word
			stw		r11,-16(r4)					; Save the fifth word
			stw		r10,-12(r4)					; Save the sixth word
			stw		r9,-8(r4)					; Save the seventh word
			stw		r8,-4(r4)					; Save the eighth word
			subi		r4,r4,32					; Bump sink
			bgt+		Lbnxtline					; Do the next line, if any...

;			Note: We touched these lines in at the beginning
;			Move backend quadword

;			Descending tail: moves the last 0-31 bytes, largest piece first.
;			In:  R4 = one past the next sink byte, R6 = one past the next source
;			     byte, CR bits 27,28,29,30,31 set for a pending 16,8,4,2,1 byte
;			     piece (loaded by mtcrf at Lballine).
;			Returns to the caller when done.
Lbbackend:		bf		27,Lbnoquad					; No quad to do...
			lwz		r7,-16(r6)					; Get the first word
			lwz		r8,-12(r6)					; Get the second word
			lwz		r9,-8(r6)					; Get the third word
			lwz		r11,-4(r6)					; Get the fourth word
			stw		r7,-16(r4)					; Save the first word
			subi		r6,r6,16					; Point to the next
			stw		r8,-12(r4)					; Save the second word
			stw		r9,-8(r4)					; Save the third word
			stw		r11,-4(r4)					; Save the fourth word
			subi		r4,r4,16					; Bump sink
;			Move backend double

Lbnoquad:		bf		28,Lbnodouble					; No double to do...
			lwz		r7,-8(r6)					; Get the first word
			lwz		r8,-4(r6)					; Get the second word
			subi		r6,r6,8						; Point to the next
			stw		r7,-8(r4)					; Save the first word
			stw		r8,-4(r4)					; Save the second word
			subi		r4,r4,8						; Bump sink
;			Move backend word

Lbnodouble:		bf		29,Lbnoword					; No word to do...
			lwz		r7,-4(r6)					; Get the word
			subi		r6,r6,4						; Point to the next
			stw		r7,-4(r4)					; Save the word
			subi		r4,r4,4						; Bump sink
;			Move backend halfword

Lbnoword:		bf		30,Lbnohalf					; No halfword to do...
			lhz		r7,-2(r6)					; Get the halfword
			subi		r6,r6,2						; Point to the next
			sth		r7,-2(r4)					; Save the halfword
			subi		r4,r4,2						; Bump sink

;			Move backend byte

Lbnohalf:		bflr		31						; Return if there is no final byte to move
			lbz		r7,-1(r6)					; Get the byte
			stb		r7,-1(r4)					; Save the single
			blr								; Leave because we are all done...