/*
 * Copyright (c) 2003-2005 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <ppc/proc_reg.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>


// commpage_time_dcba() uses a stack frame as follows:
//
// The frame is kSFSize bytes.  The timing buffer's base is computed as
// (r1 + 16 + 127) rounded down to a multiple of 128, so the frame must hold
// the kBufSiz-byte buffer, up to 127 bytes of alignment padding, and the 16
// bytes at the bottom of the frame that are skipped over.

#define	kBufSiz		1024				// Size in bytes of the buffer we use to do DCBA timing on G4
#define	kSFSize		(kBufSiz+128+16)	// Stack frame size, which contains the 128-byte-aligned buffer
#define	kLoopCnt	5					// Iterations of the timing loop (fastest pass wins)
#define	kDCBA		22					// CR bit number (in cr5) used as the "use DCBA" flag in timing loop


// commpage_set_timestamp() uses the red zone for temporary storage.
// No stack frame is pushed: each value below is a negative displacement
// from the caller's r1, and each slot is one 8-byte doubleword.

#define	rzSaveF1			-8		// caller's FPR1
#define	rzSaveF2			-16		// caller's FPR2
#define	rzSaveF3			-24		// caller's FPR3
#define	rzSaveF4			-32		// caller's FPR4
#define	rzSaveF5			-40		// caller's FPR5
#define	rzNewTimeBase		-48		// used to load 64-bit TBR into a FPR (two stw, then one lfd)


// commpage_set_timestamp() uses the following data.  kkTicksPerSec remembers
// the number used to compute _COMM_PAGE_SEC_PER_TICK.  Since this constant
// rarely changes, we use it to avoid needless recomputation.  It is a double
// value, pre-initialized with an exponent of 2**52.
//
// Integer-to-double trick: the high word 0x43300000 with a zero low word is
// the IEEE double 2**52.  Storing the 32-bit ticks_per_sec into the low word
// makes the doubleword read as (2**52 + ticks_per_sec); the routine then
// subtracts 2**52 to obtain double(ticks_per_sec).  The low word also serves
// as the cached "last divisor" that new divisors are compared against.

#define	kkBinary0		0					// offset in data to long long 0 (a constant)
#define	kkDouble1		8					// offset in data to double 1.0 (a constant)
#define	kkTicksPerSec	16					// offset in data to double(ticks_per_sec)

        .data
        .align	3							// three doubleword fields
Ldata:
        .long	0							// kkBinary0
        .long	0
        .double	1.0e0						// kkDouble1        
        .long	0x43300000					// kkTicksPerSec (plus 2**52)
        .long	0							// this is where we store ticks_per_sec, to float

        // Code section.  The two entry points below are exported; LTest and
        // Ldata are local to this file.
        .text
        .align	2
        .globl	EXT(commpage_time_dcba)
        .globl	EXT(commpage_set_timestamp)


/*	***********************************************
 *	* C O M M P A G E _ S E T _ T I M E S T A M P *
 *	***********************************************
 *
 *	Update the gettimeofday() shared data on the commpages, as follows:
 *		_COMM_PAGE_TIMESTAMP = the clock offset at timebase (seconds)
 *		_COMM_PAGE_TIMEBASE = the timebase at which the timestamp was valid
 *		_COMM_PAGE_SEC_PER_TICK = multiply timebase ticks by this to get seconds (double)
 *	The convention is that if the timebase is 0, the data is invalid.  Because other
 *	CPUs are reading the three values asynchronously and must get a consistent set, 
 *	it is critical that we update them with the following protocol:
 *		1. set timebase to 0 (atomically), to invalidate all three values
 *		2. eieio (to create a barrier in stores to cacheable memory)
 *		3. change timestamp and "secs per tick"
 *		4. eieio
 *		5. set timebase nonzero (atomically)
 *	This works because readers read the timebase, then the timestamp and divisor, sync
 *	if MP, then read the timebase a second time and check to be sure it is equal to the first.
 *	"Atomically" here means a single 8-byte stfd, which is why the FPU is enabled and
 *	the 64-bit values are staged through FPRs.
 *
 *	We could save a few cycles on 64-bit machines by special casing them, but it probably
 *	isn't necessary because this routine shouldn't be called very often.
 *
 *	When called:
 *		r3 = upper half of timebase (timebase is disabled if 0)
 *		r4 = lower half of timebase
 *		r5 = upper half of timestamp
 *		r6 = lower half of timestamp
 *		r7 = divisor (ie, timebase ticks per sec)
 *	We set up:
 *		r8 = ptr to our static data (kkBinary0, kkDouble1, kkTicksPerSec)
 *		r9 = ptr to 32-bit commpage in kernel map, minus _COMM_PAGE_BASE_ADDRESS, so the
 *		     _COMM_PAGE_* values can be used directly as displacements
 *     r10 = same for the 64-bit commpage (aliases r9 if there is no 64-bit commpage)
 *     r11 = MSR value on entry, written back on exit
 *
 *	Registers modified: r0, r2, r8-r11, cr0, cr1, cr6.  f1-f5 are saved in the red zone
 *	and restored; the FPSCR is saved and restored around the FP arithmetic.
 *
 *	If the 32-bit commpage pointer is still 0, nothing is written to either commpage.
 *
 *	NOTE(review): _COMM_PAGE_SEC_PER_TICK is stored only when r7 differs from the cached
 *	divisor in kkTicksPerSec.  A commpage that first becomes visible to this routine after
 *	the divisor was cached would not receive SEC_PER_TICK until the divisor changes --
 *	presumably both commpages exist before the first nonzero update; confirm with caller.
 *
 *	--> Interrupts must be disabled and rtclock locked when called.  <--
 */
 
        .align	5
LEXT(commpage_set_timestamp)				// void commpage_set_timestamp(tbr,secs,divisor)
        mfmsr	r11							// get MSR
        ori		r2,r11,MASK(MSR_FP)			// turn FP on
        mtmsr	r2
        isync								// wait until MSR changes take effect
        
        // cr0 set here stays live until the "beq 3f" after the first eieio;
        // the intervening compares deliberately target cr1 and cr6.
        or.		r0,r3,r4					// is timebase 0? (thus disabled)
        lis		r8,hi16(Ldata)				// point to our data
        lis		r9,ha16(EXT(commPagePtr32))	// get ptrs to address of commpages in kernel map
		lis		r10,ha16(EXT(commPagePtr64))
        stfd	f1,rzSaveF1(r1)				// save a FPR in the red zone
        ori		r8,r8,lo16(Ldata)
        lwz		r9,lo16(EXT(commPagePtr32))(r9)	// r9 <- 32-bit commpage ptr
		lwz		r10,lo16(EXT(commPagePtr64))(r10) // r10 <- 64-bit commpage ptr
        lfd		f1,kkBinary0(r8)			// get fixed 0s
        li		r0,_COMM_PAGE_BASE_ADDRESS	// get va in user space of commpage
        cmpwi	cr1,r9,0					// is 32-bit commpage allocated yet?
		cmpwi   cr6,r10,0					// is 64-bit commpage allocated yet?
        sub		r9,r9,r0					// r9 <- 32-bit commpage address, biased by user va
		sub		r10,r10,r0					// r10<- 64-bit commpage address
        beq--	cr1,3f						// skip if 32-bit commpage not allocated (64-bit won't be either)
		bne++   cr6,1f						// skip if 64-bit commpage is allocated
		mr		r10,r9						// if no 64-bit commpage, point to 32-bit version with r10 too
1:
        // Step 1-2 of the protocol: invalidate, then barrier.
        stfd	f1,_COMM_PAGE_TIMEBASE(r9)	// turn off the 32-bit-commpage timestamp (atomically)
		stfd	f1,_COMM_PAGE_TIMEBASE(r10) // and the 64-bit one too
        eieio								// make sure all CPUs see it is off
        beq		3f							// all we had to do is turn off timestamp (cr0 from or. above)
        
        // Step 3: write the new timestamp, and SEC_PER_TICK if the divisor changed.
        lwz		r0,kkTicksPerSec+4(r8)		// get last ticks_per_sec (or 0 if first)
        stw		r3,rzNewTimeBase(r1)		// store new timebase so we can lfd
        stw		r4,rzNewTimeBase+4(r1)
        cmpw	r0,r7						// do we need to recompute _COMM_PAGE_SEC_PER_TICK?
        stw		r5,_COMM_PAGE_TIMESTAMP(r9)	// store the new timestamp in the 32-bit page
        stw		r6,_COMM_PAGE_TIMESTAMP+4(r9)
        stw		r5,_COMM_PAGE_TIMESTAMP(r10)// and the 64-bit commpage
        stw		r6,_COMM_PAGE_TIMESTAMP+4(r10)
        lfd		f1,rzNewTimeBase(r1)		// get timebase in a FPR so we can store atomically
        beq++	2f							// same ticks_per_sec, no need to recompute
        
        // Divisor changed: cache it (this also forms 2**52 + ticks_per_sec
        // in the kkTicksPerSec doubleword) and compute 1.0 / ticks_per_sec.
        stw		r7,kkTicksPerSec+4(r8)		// must recompute SEC_PER_TICK
        stfd	f2,rzSaveF2(r1)				// we'll need a few more temp FPRs
        stfd	f3,rzSaveF3(r1)
        stfd	f4,rzSaveF4(r1)
        stfd	f5,rzSaveF5(r1)
        lfd		f2,_COMM_PAGE_2_TO_52(r9)	// f2 <- double(2**52)
        lfd		f3,kkTicksPerSec(r8)		// float new ticks_per_sec + 2**52
        lfd		f4,kkDouble1(r8)			// f4 <- double(1.0)
        mffs	f5							// save caller's FPSCR
        mtfsfi	7,1							// clear Inexact Exception enable bit, set round-to-zero
        fsub	f3,f3,f2					// get ticks_per_sec
        fdiv	f3,f4,f3					// divide 1 by ticks_per_sec to get SEC_PER_TICK
        stfd	f3,_COMM_PAGE_SEC_PER_TICK(r9)
        stfd	f3,_COMM_PAGE_SEC_PER_TICK(r10)
        mtfsf	0xFF,f5						// restore FPSCR
        lfd		f2,rzSaveF2(r1)				// restore FPRs
        lfd		f3,rzSaveF3(r1)
        lfd		f4,rzSaveF4(r1)
        lfd		f5,rzSaveF5(r1)
2:											// f1 == new timebase (loaded from rzNewTimeBase)
        // Steps 4-5: barrier, then publish the nonzero timebase to revalidate.
        eieio								// wait until the stores take
        stfd	f1,_COMM_PAGE_TIMEBASE(r9)	// then turn the timestamp back on (atomically)
        stfd	f1,_COMM_PAGE_TIMEBASE(r10)	// both
3:											// here once all fields updated
        lfd		f1,rzSaveF1(r1)				// restore last FPR
        mtmsr	r11							// turn FP back off (restores the entry MSR)
        isync
        blr


/*	***************************************
 *	* C O M M P A G E _ T I M E _ D C B A *
 *	***************************************
 *
 *	Not all processors that support the DCBA opcode actually benefit from it.
 *	Some store-gather and read-cancel well enough that there is no need to use
 *	DCBA to avoid fetching cache lines that will be completely overwritten, while
 *	others have this feature disabled (to work around errata etc), and so benefit
 *	from DCBA.  Since it is hard to tell the one group from the other, we just
 *	time loops with and without DCBA, and pick the fastest.  Thus we avoid
 *	delicate dependence on processor and/or platform revisions.
 *
 *	We return either kDcbaRecommended or zero.  DCBA is recommended only if
 *	(dcba_time + dcba_time/8) is strictly less than the non-DCBA time
 *	(unsigned compare of timebase tick counts).
 *
 *		int commpage_time_dcba( void );
 *
 *	Register roles (all must survive the calls to LTest, which uses r0,r3,r4,r6-r9):
 *		r10 = biased time of the DCBA run
 *		r11 = 128-byte-aligned base of the kBufSiz-byte buffer in our frame
 *		r12 = return address (LR is restored from here, not from the stack slot)
 *		cr5 bit kDCBA = tells LTest whether to issue DCBA
 */
 
LEXT(commpage_time_dcba)
        mflr	r12					// get return
        stw		r12,8(r1)			// save
        stwu	r1,-kSFSize(r1)		// carve our temp buffer from the stack
        addi	r11,r1,127+16		// get base address...
        rlwinm	r11,r11,0,0,24		// ...of our buffer, 128-byte aligned (clears low 7 bits)
        crset	kDCBA				// first, use DCBA
        bl		LTest				// time it with DCBA
        srwi	r0,r3,3				// bias 12.5 pct (time/8) in favor of not using DCBA...
        add		r10,r3,r0			// ...because DCBA is always slower with warm cache
        crclr	kDCBA
        bl		LTest				// time without DCBA
        cmplw	r10,r3				// which is better?
        mtlr	r12					// restore return
        lwz		r1,0(r1)			// pop off our stack frame (via back chain stored by stwu)
        li		r3,kDcbaRecommended		// assume using DCBA is faster
        bltlr						// return if biased DCBA time < non-DCBA time
        li		r3,0			// no DCBA is faster
        blr
                
        
// Subroutine to time a loop with or without DCBA.
//		kDCBA = set if we should use DCBA
//		r11 = base of buffer to use for test (kBufSiz bytes)
//
//		We return TBR ticks in r3: the smallest elapsed lower-timebase count
//		seen over kLoopCnt passes.  Each pass flushes the buffer from the
//		cache, then times storing to every 32-byte block of it.
//		A pass during which the upper timebase changed is discarded and
//		repeated without being counted.
//		We use r0,r3,r4,r6-r9 (r5 is not touched), plus CTR and cr0.
//		r10-r12 and cr5 are left alone, which the caller depends on.

LTest:
        li		r4,kLoopCnt			// number of times to loop
        li		r3,-1				// initialize fastest time (max unsigned value)
1:
        mr		r6,r11				// initialize buffer ptr
        li		r0,kBufSiz/32		// r0 <- cache blocks to test
        mtctr	r0
2:
        dcbf	0,r6				// first, force the blocks out of the cache
        addi	r6,r6,32
        bdnz	2b
        sync						// make sure all the flushes take
        mr		r6,r11				// re-initialize buffer ptr
        mtctr	r0					// reset cache-block count
        mftbu	r7					// remember upper half so we can check for carry
        mftb	r8					// start the timer
3:									// loop over cache blocks
        bf		kDCBA,4f			// should we DCBA?
        dcba	0,r6
4:
        // The data stored is whatever is in r0 (the block count); only the
        // act of overwriting the whole 32-byte block matters.
        stw		r0,0(r6)			// store the entire cache block
        stw		r0,4(r6)
        stw		r0,8(r6)
        stw		r0,12(r6)
        stw		r0,16(r6)
        stw		r0,20(r6)
        stw		r0,24(r6)
        stw		r0,28(r6)
        addi	r6,r6,32
        bdnz	3b
        mftb	r9					// stop the timer
        mftbu	r0
        cmpw	r0,r7				// did timebase carry?
        bne		1b					// yes, retest rather than fuss (r4 not decremented)
        sub		r9,r9,r8			// r9 <- time for this loop
        cmplw	r9,r3				// faster than current best? (unsigned)
        bge		5f					// no
        mr		r3,r9				// remember fastest time through loop
5:
        subi	r4,r4,1				// decrement outer loop count
        cmpwi	r4,0				// more to go?
        bne		1b					// loop if so
        blr							// return fastest time in r3