.explicit
.text
.ident "ia64.S, Version 1.1"
.ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
//
// ====================================================================
// Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
// project.
//
// Rights for redistribution and usage in source and binary forms are
// granted according to the OpenSSL license. Warranty of any kind is
// disclaimed.
// ====================================================================
//
// Q. How much faster does it get?
// A. Here is the output from 'openssl speed rsa dsa' for vanilla
// 0.9.6a compiled with gcc version 2.96 20000731 (Red Hat
// Linux 7.1 2.96-81):
//
//                   sign    verify    sign/s verify/s
// rsa  512 bits   0.0036s   0.0003s    275.3   2999.2
// rsa 1024 bits   0.0203s   0.0011s     49.3    894.1
// rsa 2048 bits   0.1331s   0.0040s      7.5    250.9
// rsa 4096 bits   0.9270s   0.0147s      1.1     68.1
//                   sign    verify    sign/s verify/s
// dsa  512 bits   0.0035s   0.0043s    288.3    234.8
// dsa 1024 bits   0.0111s   0.0135s     90.0     74.2
//
// And here is similar output but for this assembler
// implementation:-)
//
//                   sign    verify    sign/s verify/s
// rsa  512 bits   0.0021s   0.0001s    549.4   9638.5
// rsa 1024 bits   0.0055s   0.0002s    183.8   4481.1
// rsa 2048 bits   0.0244s   0.0006s     41.4   1726.3
// rsa 4096 bits   0.1295s   0.0018s      7.7    561.5
//                   sign    verify    sign/s verify/s
// dsa  512 bits   0.0012s   0.0013s    891.9    756.6
// dsa 1024 bits   0.0023s   0.0028s    440.4    376.2
//
// Yes, you may argue that it's not a fair comparison, as it's
// possible to craft the C implementation with the BN_UMULT_HIGH
// inline assembler macro. But of course! Here is the output
// with the macro:
//
//                   sign    verify    sign/s verify/s
// rsa  512 bits   0.0020s   0.0002s    495.0   6561.0
// rsa 1024 bits   0.0086s   0.0004s    116.2   2235.7
// rsa 2048 bits   0.0519s   0.0015s     19.3    667.3
// rsa 4096 bits   0.3464s   0.0053s      2.9    187.7
//                   sign    verify    sign/s verify/s
// dsa  512 bits   0.0016s   0.0020s    613.1    510.5
// dsa 1024 bits   0.0045s   0.0054s    221.0    183.9
//
// My code is still way faster, huh:-) And I believe that even
// higher performance can be achieved. Note that as keys get
// longer, the performance gain gets larger. Why? According to
// the profiler there is another player in the field, namely
// BN_from_montgomery, which consumes a larger and larger portion
// of CPU time as the key size decreases. I therefore consider
// putting effort into an assembler implementation of the
// following routine:
//
// void bn_mul_add_mont (BN_ULONG *rp,BN_ULONG *np,int nl,BN_ULONG n0)
// {
// int i,j;
// BN_ULONG v;
//
// for (i=0; i<nl; i++)
//     {
//     v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
//     rp++;
//     for (j=0; v && j<nl; j++)
//         v = ((rp[nl-1+j]+=v) < v);   /* propagate the carry */
//     }
// }
//
// It might well be beneficial to implement the combaX variants
// too, as it appears they can literally unleash the performance
// (see the commentary section to bn_mul_comba8 below).
//
// And finally, for your reference, the output for 0.9.6a compiled
// with SGIcc version 0.01.0-12 (keep in mind that at the moment
// of this writing it's not possible to convince SGIcc to use the
// BN_UMULT_HIGH inline assembler macro, yet the code is fast,
// i.e. for compiler-generated code:-):
//
//                   sign    verify    sign/s verify/s
// rsa  512 bits   0.0022s   0.0002s    452.7   5894.3
// rsa 1024 bits   0.0097s   0.0005s    102.7   2002.9
// rsa 2048 bits   0.0578s   0.0017s     17.3    600.2
// rsa 4096 bits   0.3838s   0.0061s      2.6    164.5
//                   sign    verify    sign/s verify/s
// dsa  512 bits   0.0018s   0.0022s    547.3    459.6
// dsa 1024 bits   0.0051s   0.0062s    196.6    161.3
//
// Oh! Benchmarks were performed on a 733MHz Lion-class Itanium
// system running Red Hat Linux 7.1 (very special thanks to Ray
// McCaffity of Williams Communications for providing an account).
//
// Q. What's the deal with 'rum 1<<5' at the end of every function?
// A. Well, by clearing the "upper FP registers written" bit (um.mfh,
// bit 5 of the User Mask) I want to excuse the kernel from preserving
// the upper (f32-f127) FP register bank over a process context
// switch, thus minimizing bus bandwidth consumption during the switch
// (i.e. after the PKI operation completes and the program is off
// doing something else, like bulk symmetric encryption). Having said
// this, I also want to point out that it might be a good idea to
// compile the whole toolkit (as well as the majority of programs,
// for that matter) with the -mfixed-range=f32-f127 command line
// option. No, it doesn't prevent the compiler from writing to the
// upper bank, but it at least discourages it from doing so. If you
// don't like the idea, you have the option to compile the module
// with -Drum=nop.m on the command line.
//
#if 1
//
// bn_[add|sub]_words routines.
//
// Loops are spinning in 2*(n+5) ticks on Itanium (provided that the
// data reside in L1 cache, i.e. 2 ticks away). It's possible to
// compress the epilogue and get down to 2*n+6, but at the cost of
// scalability (the neat feature of this implementation is that it
// shall automagically spin in n+5 on "wider" IA-64 implementations:-)
// I consider the epilogue short enough as it is to trade a tiny
// performance loss on Itanium for scalability.
//
// BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
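// For reference, a minimal C sketch of what the loop below computes
// (word-wise addition with carry propagation; an illustration, not
// the OpenSSL C source):
//
//     BN_ULONG bn_add_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp, int num)
//     {
//     BN_ULONG a,b,t,carry=0;
//
//     while (num--)
//         {
//         a = *ap++; b = *bp++;
//         t = a + carry;
//         carry  = (t < carry);   /* carry out of a+carry */
//         t += b;
//         carry += (t < b);       /* carry out of t+b     */
//         *rp++ = t;
//         }
//     return carry;               /* 1 if the sum overflowed num words */
//     }
//
// bn_sub_words below is the mirror image: it ripples a borrow instead
// and returns 1 if the difference went negative.
//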
.global bn_add_words#
.proc bn_add_words#
.align 64
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_add_words:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii{ .mfb
.save ar.lc,r3
{ .mib brp.loop.imp .L_bn_add_words_ctop,.L_bn_add_words_cend-16
}
.body
{ .mib{ .mii mov ar.ec=6 }
{ .mib
.L_bn_add_words_ctop:
{ .mii (p19) cmp.ltu.unc p56,p0=r40,r38 }
{ .mfb (p0) nop.b 0x0 }
{ .mii (p58) add r41=1,r41 } // (p20)
{ .mfb br.ctop.sptk .L_bn_add_words_ctop }
{ .mii mov pr=r9,-1
mov ar.lc=r3 }
{ .mbb
.endp bn_add_words#
//
// BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num)
//
.global bn_sub_words#
.proc bn_sub_words#
.align 64
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_sub_words:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii{ .mfb
.save ar.lc,r3
{ .mib brp.loop.imp .L_bn_sub_words_ctop,.L_bn_sub_words_cend-16
}
.body
{ .mib{ .mii mov ar.ec=6 }
{ .mib
.L_bn_sub_words_ctop:
{ .mii (p19) cmp.gtu.unc p56,p0=r40,r38 }
{ .mfb (p0) nop.b 0x0 }
{ .mii (p58) add r41=-1,r41 } // (p20)
{ .mbb br.ctop.sptk .L_bn_sub_words_ctop }
{ .mii mov pr=r9,-1
mov ar.lc=r3 }
{ .mbb
.endp bn_sub_words#
#endif
#if 0
#define XMA_TEMPTATION
#endif
#if 1
//
// BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
.global bn_mul_words#
.proc bn_mul_words#
.align 64
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_mul_words:
.prologue
.fframe 0
.save ar.pfs,r2
#ifdef XMA_TEMPTATION
{ .mfi{ .mfi{ .mib (p6) br.ret.spnt.many b0 }
.save ar.lc,r3
{ .mii mov r9=pr }
.body
{ .mib // ------^----- serves as (p48) at first (p26)
brp.loop.imp .L_bn_mul_words_ctop,.L_bn_mul_words_cend-16
}
#ifndef XMA_TEMPTATION
{ .mii mov ar.lc=r10 }
{ .mii
// This loop spins in 2*(n+11) ticks. It's scheduled for data in the
// L2 cache (i.e. 9 ticks away), as floating point load/store
// instructions bypass the L1 cache and the L2 latency is actually the
// best-case scenario for ldf8. The loop is not scalable and shall run
// in 2*(n+11) even on "wider" IA-64 implementations. It's a trade-off
// here. An n+22 loop would give us ~5% *overall* performance
// improvement on "wider" IA-64, but would hurt Itanium by about the
// same because of the longer epilogue. As it's a matter of a few
// percent either way, I've chosen to trade the scalability for
// development time (you can see this very instruction sequence in the
// bn_mul_add_words loop, which in turn is scalable).
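// For the record, the semantics being implemented are (a minimal C
// sketch, assuming a 64-bit BN_ULONG and a compiler with a 128-bit
// integer type; not the OpenSSL C source itself):
//
//     BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//     {
//     BN_ULONG carry=0;
//
//     while (num--)
//         {
//         unsigned __int128 t = (unsigned __int128)*ap++ * w + carry;
//         *rp++ = (BN_ULONG)t;          /* low 64 bits of the product */
//         carry  = (BN_ULONG)(t>>64);   /* high 64 bits become carry  */
//         }
//     return carry;
//     }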
.L_bn_mul_words_ctop:
{ .mfi (p27) cmp.ltu p52,p48=r39,r38 }
{ .mfi (p0) nop.i 0x0 }
.pred.rel "mutex",p48,p52
(p48) add r38=r37,r33 // (p26)
(p52) add r38=r37,r33,1 } // (p26)
{ .mfb br.ctop.sptk .L_bn_mul_words_ctop }
{ .mii(p49) add r8=r34,r0
(p53) add r8=r34,r0,1 }
{ .mfb nop.b 0x0 }
#else // XMA_TEMPTATION
setf.sig f37=r0 // serves as carry at (p18) tick
mov ar.lc=r10
 mov ar.ec=5
// Most of you examining this code will very likely wonder why in the
// name of Intel the following loop is commented out? Indeed, it looks
// so neat that you find it hard to believe that there's something
// wrong with it, right? The catch is that every iteration depends on
// the result from the previous one, and the latter isn't available
// instantly. The loop therefore spins at the latency of xma minus 1,
// or in other words at 6*(n+4) ticks:-( Compare to the "production"
// loop above, which runs in 2*(n+11), where the low-latency problem
// is worked around by moving the dependency to the one-tick latent
// integer ALU. Note that the "distance" between ldf8 and xma is not
// the latency of ldf8, but the *difference* between the xma and ldf8
// latencies.
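// To put numbers on the comparison: for n=8 this loop would spin for
// 6*(8+4) = 72 ticks versus 2*(8+11) = 38 ticks for the loop above,
// and since 6*(n+4) > 2*(n+11) for every n >= 0, the "neat" loop
// never wins.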
.L_bn_mul_words_ctop:
{ .mfi{ .mfb br.ctop.sptk .L_bn_mul_words_ctop }
getf.sig r8=f41 // the return value
#endif // XMA_TEMPTATION
{ .mii mov ar.lc=r3 }
{ .mfb br.ret.sptk.many b0 }
#endif
#if 1
//
// BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//
.global bn_mul_add_words#
.proc bn_mul_add_words#
.align 64
//.skip 0 // makes the loop split at 64-byte boundary
bn_mul_add_words:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii{ .mfb
.save ar.lc,r3
{ .mii mov r9=pr }
.body
{ .mib // ------^----- serves as (p48) at first (p26)
brp.loop.imp .L_bn_mul_add_words_ctop,.L_bn_mul_add_words_cend-16
}
{ .mii mov ar.lc=r10 }
{ .mii mov ar.ec=14 }
// This loop spins in 3*(n+13) ticks on Itanium and should spin in
// 2*(n+13) on "wider" IA-64 implementations (to be verified with new
// µ-architecture manuals as they become available). As usual it's
// possible to compress the epilogue, down to 10 in this case, at the
// cost of scalability. A compressed (and therefore non-scalable) loop
// running at 3*(n+10) would buy you ~10% on Itanium but take ~35%
// from "wider" IA-64, so let it be scalable! Special attention was
// paid to having the loop body split at a 64-byte boundary. ld8 is
// scheduled for the L1 cache, as the data is more than likely there.
// Indeed, bn_mul_words has put it there a moment ago:-)
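// The semantics, again as a minimal C sketch (same 64-bit BN_ULONG
// and 128-bit integer type assumptions as in the bn_mul_words note
// above):
//
//     BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
//     {
//     BN_ULONG carry=0;
//
//     while (num--)
//         {
//         unsigned __int128 t = (unsigned __int128)*ap++ * w + *rp + carry;
//         *rp++ = (BN_ULONG)t;          /* accumulate the low word */
//         carry  = (BN_ULONG)(t>>64);   /* ripple the high word    */
//         }
//     return carry;
//     }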
.L_bn_mul_add_words_ctop:
{ .mfi (p27) cmp.ltu p52,p48=r39,r38 }
{ .mfi (p27) add r43=r43,r39 }
.pred.rel "mutex",p48,p52
(p48) add r38=r37,r33 // (p26)
(p52) add r38=r37,r33,1 } // (p26)
{ .mfb (p0) nop.b 0x0 }
{ .mii (p58) add r44=1,r44 }
{ .mfb br.ctop.sptk .L_bn_mul_add_words_ctop}
{ .mii(p51) add r8=r36,r0
(p55) add r8=r36,r0,1 }
{ .mfb nop.b 0x0 }
(p59) add r8=1,r8
mov pr=r9,-1
mov ar.lc=r3 }
{ .mfb br.ret.sptk.many b0 }
#endif
#if 1
//
// void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//
.global bn_sqr_words#
.proc bn_sqr_words#
.align 64
.skip 32 // makes the loop body aligned at 64-byte boundary
bn_sqr_words:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii{ .mii{ .mfb
.save ar.lc,r3
{ .mii mov r9=pr }
.body
{ .mib brp.loop.imp .L_bn_sqr_words_ctop,.L_bn_sqr_words_cend-16
}
{ .mii mov ar.ec=18 }
// 2*(n+17) on Itanium, (n+17) on "wider" IA-64 implementations. It's
// possible to compress the epilogue (I'm getting tired of writing this
// comment over and over) and get down to 2*n+16, at the cost of
// scalability. The decision will very likely be reconsidered after the
// benchmark program is profiled. I.e. if the performance gain on
// Itanium turns out to be larger than the loss on "wider" IA-64, then
// the loop should be explicitly split and the epilogue compressed.
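// For reference, a minimal C sketch (same assumptions as above; note
// that rp receives 2*num words, low word of each square first):
//
//     void bn_sqr_words(BN_ULONG *rp, BN_ULONG *ap, int num)
//     {
//     while (num--)
//         {
//         unsigned __int128 t = (unsigned __int128)*ap * *ap;
//         ap++;
//         *rp++ = (BN_ULONG)t;          /* low word of the square  */
//         *rp++ = (BN_ULONG)(t>>64);    /* high word of the square */
//         }
//     }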
.L_bn_sqr_words_ctop:
{ .mfi (p0) nop.i 0x0 }
{ .mib (p0) nop.b 0x0 }
{ .mfi (p0) nop.i 0x0 }
{ .mib br.ctop.sptk .L_bn_sqr_words_ctop }
{ .mii mov ar.lc=r3 }
{ .mfb br.ret.sptk.many b0 }
#endif
#if 1
// Apparently we win nothing by implementing a special bn_sqr_comba8.
// Yes, it is possible to reduce the number of multiplications by
// almost a factor of two, but then the number of additions would
// increase by a factor of two (as we would have to perform those
// otherwise performed by xma ourselves). Normally we would trade
// anyway, as multiplications are way more expensive, but not this
// time... The multiplication kernel is fully pipelined, and as we
// drain one 128-bit multiplication result per clock cycle,
// multiplications are effectively as inexpensive as additions. A
// special implementation might become of interest for "wider" IA-64
// implementations, as you'd be able to get through the multiplication
// phase faster (there won't be any stall issues as discussed in the
// commentary section below, and you'd therefore be able to employ all
// 4 FP units)... But in these Itanium days it's simply too hard to
// justify the effort, so I just drop down to the bn_mul_comba8 code:-)
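// (Counting it out for n=8: a full comba multiply issues 8*8 = 64
// xma.hu/xma.lu pairs, while a dedicated squaring needs only
// 8*9/2 = 36 of them, hence the "almost factor of two" above.)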
//
// void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
//
.global bn_sqr_comba8#
.proc bn_sqr_comba8#
.align 64
bn_sqr_comba8:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii add r14=8,r33 }{ .mii add r18=16,r34 }
{ .mfb
.endp bn_sqr_comba8#
#endif
#if 1
// I've estimated this routine to run in ~120 ticks, but in reality
// (i.e. according to ar.itc) it takes ~160 ticks. Are those extra
// cycles consumed by instruction fetch? Or did I misinterpret some
// clause in the Itanium µ-architecture manual? Comments are welcome
// and highly appreciated.
//
// However! It should be noted that even 160 ticks is a darn good
// result, as it's over 10 (yes, ten, spelled as t-e-n) times faster
// than the C version (compiled with gcc with the inline assembler
// macro). I really kicked the compiler's butt here, didn't I? Yeah!
// This brings us to the following statement. It's a damn shame that
// this routine isn't called very often nowadays! According to the
// profiler most CPU time is consumed by bn_mul_add_words called from
// BN_from_montgomery. In order to estimate what we're missing, I've
// compared the performance of this routine against the "traditional"
// implementation, i.e. against the following routine:
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
// {
// r[ 8]=bn_mul_words(    &(r[0]),a,8,b[0]);
// r[ 9]=bn_mul_add_words(&(r[1]),a,8,b[1]);
// r[10]=bn_mul_add_words(&(r[2]),a,8,b[2]);
// r[11]=bn_mul_add_words(&(r[3]),a,8,b[3]);
// r[12]=bn_mul_add_words(&(r[4]),a,8,b[4]);
// r[13]=bn_mul_add_words(&(r[5]),a,8,b[5]);
// r[14]=bn_mul_add_words(&(r[6]),a,8,b[6]);
// r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
// }
//
// The one below is over 8 times faster than the one above:-( Even
// more reasons to "combafy" bn_mul_add_mont...
//
// And yes, this routine really made me wish there were an optimizing
// assembler! It also feels like it deserves a dedication.
//
// To my wife for being there and to my kids...
//
// void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
#define carry1 r14
#define carry2 r15
#define carry3 r34
.global bn_mul_comba8#
.proc bn_mul_comba8#
.align 64
bn_mul_comba8:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii add r17=8,r34 }
.body
{ .mii add r16=24,r33 }
.L_cheat_entry_point8:
{ .mmi ldf8 f32=[r33],32 }{ .mmi{ .mmi{ .mmi{ .mmi
{ .mmi{ .mmi{ .mmi{ .mfi
// ------------\ /------------
// -----------------\ /-----------------
// ----------------------\/----------------------
xma.hu f41=f32,f120,f0 }
{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi
// prevent "wider" IA-64 implementations from achieving the peak
// performance. Well, not really... The catch is that if you
// intend to keep 4 FP units busy by splitting at every fourth
// bundle and thus perform these 16 multiplications in 4 ticks,
// the first bundle *below* would stall because the result from
// the first xma bundle *above* won't be available for another 3
// ticks (if not more; being an optimist, I assume that a "wider"
// implementation will have the same latency:-). This stall will
// hold you back and the performance would be as if every second
// bundle were split *anyway*...
{ .mfi add r33=8,r32 }
{ .mfi xma.hu f52=f33,f121,f51 }
{ .mfi xma.hu f62=f33,f122,f61 }
{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi xma.hu f53=f34,f121,f52 }
{ .mfi xma.hu f63=f34,f122,f62
add r25=r25,r24 }
{ .mfi{ .mfi{ .mfi xma.hu f83=f34,f124,f82
(p6) add carry1=1,carry1 }
{ .mfi{ .mfi{ .mfi{ .mfi{ .mfi add r17=r17,r16 }
{ .mfi xma.hu f54=f35,f121,f53 }
{ .mfi{ .mfi cmp.ltu p7,p0=r17,r16 }
{ .mfi{ .mfi(p7) add carry2=1,carry2 }
{ .mfi add r18=r18,carry1 } xma.hu f84=f35,f124,f83
(p7) add carry2=1,carry2 }
{ .mfi{ .mfi(p7) add carry2=1,carry2 }
{ .mfi{ .mfi{ .mfi add r25=r25,r24 }{ .mfi cmp.ltu p6,p0=r25,r24 }
{ .mfi{ .mfi(p6) add carry1=1,carry1 }
{ .mfi xma.hu f65=f36,f122,f64
cmp.ltu p6,p0=r26,r25 }
{ .mfi{ .mfi(p6) add carry1=1,carry1 }
{ .mfi add r27=r27,carry2 } xma.hu f85=f36,f124,f84
(p6) add carry1=1,carry1 }
{ .mfi{ .mfi(p6) add carry1=1,carry1 }
{ .mfi{ .mfi add r17=r17,r16 } cmp.ltu p7,p0=r17,r16 }
{ .mfi//-------------------------------------------------//
{ .mfi(p7) add carry2=1,carry2 }
{ .mfi add r19=r19,r18 } xma.hu f56=f37,f121,f55 }
{ .mfi xma.hu f66=f37,f122,f65
(p7) add carry2=1,carry2 }
{ .mfi add r20=r20,r19 } xma.hu f76=f37,f123,f75
(p7) add carry2=1,carry2 }
{ .mfi add r20=r20,carry1 } xma.hu f86=f37,f124,f85
(p7) add carry2=1,carry2 }
{ .mfi{ .mfi(p7) add carry2=1,carry2 }
{ .mfi{ .mfi xma.lu f105=f37,f126,f105
add r25=r25,r24 } cmp.ltu p6,p0=r25,r24 }
{ .mfi//-------------------------------------------------//
{ .mfi(p6) add carry1=1,carry1 }
{ .mfi add r27=r27,r26 } xma.hu f57=f38,f121,f56
(p6) add carry1=1,carry1 }
{ .mfi add r28=r28,r27 } xma.hu f67=f38,f122,f66
(p6) add carry1=1,carry1 }
{ .mfi add r29=r29,r28 } xma.hu f77=f38,f123,f76
(p6) add carry1=1,carry1 }
{ .mfi add r29=r29,carry2 } xma.hu f87=f38,f124,f86
(p6) add carry1=1,carry1 }
{ .mfi{ .mfi(p6) add carry1=1,carry1 }
{ .mfi{ .mfi{ .mfi add r17=r17,r16 } cmp.ltu p7,p0=r17,r16 }
{ .mfi//-------------------------------------------------//
{ .mfi(p7) add carry2=1,carry2 }
{ .mfi add r19=r19,r18 } xma.hu f58=f39,f121,f57
(p7) add carry2=1,carry2 }
{ .mfi add r20=r20,r19 } xma.hu f68=f39,f122,f67
(p7) add carry2=1,carry2 }
{ .mfi add r21=r21,r20 } xma.hu f78=f39,f123,f77
(p7) add carry2=1,carry2 }
{ .mfi add r22=r22,r21 } xma.hu f88=f39,f124,f87
(p7) add carry2=1,carry2 }
{ .mfi add r22=r22,carry1 } xma.hu f98=f39,f125,f97
(p7) add carry2=1,carry2 }
{ .mfi{ .mfi(p7) add carry2=1,carry2 }
{ .mfi{ .mfi{ .mfi
// Leaving multiplier's heaven... Quite a ride, huh?
{ .mii mov carry1=0 } cmp.ltu p6,p0=r25,r24
add r26=r26,r25 }{ .mii cmp.ltu p6,p0=r26,r25
add r27=r27,r26 }{ .mii cmp.ltu p6,p0=r27,r26
add r28=r28,r27 } add r17=r17,r16
mov carry3=0 }
{ .mii cmp.ltu p6,p0=r28,r27
add r29=r29,r28 } cmp.ltu p7,p0=r17,r16 }
{ .mii cmp.ltu p6,p0=r29,r28
add r30=r30,r29 } add r18=r18,r17 }
{ .mii cmp.ltu p6,p0=r30,r29
add r31=r31,r30 }{ .mii add r19=r19,r18 }
{ .mfb(p6) add carry1=1,carry1
cmp.ltu p6,p0=r31,r30
add r31=r31,carry2 }{ .mii add r20=r20,r19 }
{ .mfb(p6) add carry1=1,carry1
cmp.ltu p6,p0=r31,carry2 }{ .mii add r21=r21,r20 }
{ .mii{ .mfb{ .mfb cmp.ltu p7,p0=r21,r20
add r22=r22,r21 }{ .mii add r23=r23,r22 }{ .mii add r23=r23,carry1 }{ .mii{ .mii mov carry1=0 }
{ .mii (p8) add carry2=0,carry3 }{ .mfb cmp.ltu p6,p0=r25,r24
add r26=r26,r25 }{ .mii cmp.ltu p6,p0=r26,r25
add r27=r27,r26 }{ .mii cmp.ltu p6,p0=r27,r26
add r28=r28,r27 }{ .mii cmp.ltu p6,p0=r28,r27
add r29=r29,r28 }{ .mii cmp.ltu p6,p0=r29,r28
add r30=r30,r29 } add r17=r17,r16
mov carry3=0 }
{ .mii cmp.ltu p6,p0=r30,r29
add r30=r30,carry2 } cmp.ltu p7,p0=r17,r16
add r18=r18,r17 }
{ .mii cmp.ltu p6,p0=r30,carry2 }{ .mii
{ .mfb cmp.ltu p7,p0=r18,r17
add r19=r19,r18 }{ .mii add r20=r20,r19 }{ .mii add r21=r21,r20 }{ .mii add r21=r21,carry1 } add r25=r25,r24 }
{ .mib{ .mii (p8) add carry2=0,carry3 }
{ .mii add r26=r26,r25 }{ .mii cmp.ltu p6,p0=r26,r25
add r27=r27,r26 }{ .mii cmp.ltu p6,p0=r27,r26
add r28=r28,r27 }{ .mii cmp.ltu p6,p0=r28,r27
add r28=r28,carry2 } add r17=r17,r16 }
{ .mib cmp.ltu p6,p0=r28,carry2 }(p6) add carry1=1,carry1 }
{ .mii add r18=r18,r17 }{ .mii add r19=r19,r18 }{ .mii add r19=r19,carry1 }{ .mii{ .mii
{ .mfb{ .mfb cmp.ltu p6,p0=r25,r24
add r26=r26,r25 }{ .mii cmp.ltu p6,p0=r26,r25
add r26=r26,carry2 }{ .mii cmp.ltu p6,p0=r26,carry2 }(p6) add carry1=1,carry1 }
{ .mfb{ .mii add r17=r17,carry1 } cmp.ltu p7,p0=r17,carry1} (p7) add carry2=1,carry2 }{ .mib{ .mib
.endp bn_mul_comba8#
#undef carry3
#undef carry2
#undef carry1
#endif
#if 1
// It's possible to make it faster (see the comment to bn_sqr_comba8),
// but I reckon it isn't worth the effort. Basically because the
// routine (actually both of them) is practically never called... So I
// just play the same trick as with bn_sqr_comba8.
//
// void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
//
.global bn_sqr_comba4#
.proc bn_sqr_comba4#
.align 64
bn_sqr_comba4:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii add r14=8,r33 }{ .mii add r18=16,r34 }
{ .mfb
.endp bn_sqr_comba4#
#endif
#if 1
// Runs in ~115 cycles and is ~4.5 times faster than C. Well, whatever...
//
// void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
//
#define carry1 r14
#define carry2 r15
.global bn_mul_comba4#
.proc bn_mul_comba4#
.align 64
bn_mul_comba4:
.prologue
.fframe 0
.save ar.pfs,r2
{ .mii add r17=8,r34 }
.body
{ .mii add r16=24,r33 }{ .mmi ldf8 f32=[r33] }
{ .mmi{ .mmi
{ .mmi{ .mfi xma.hu f41=f32,f120,f0 }
{ .mfi{ .mfi{ .mfi{ .mfi
// first xma is not available for another 3 ticks.
{ .mfi add r33=8,r32 }
{ .mfi xma.hu f52=f33,f121,f51 }
{ .mfi xma.hu f62=f33,f122,f61 }
{ .mfi{ .mfi{ .mfi{ .mfi xma.hu f53=f34,f121,f52 }
{ .mfi xma.hu f63=f34,f122,f62
add r25=r25,r24 }
{ .mfi{ .mfi cmp.ltu p6,p0=r25,r24 }
{ .mfi{ .mfi(p6) add carry1=1,carry1 }
{ .mfi mov carry2=0 } xma.hu f54=f35,f121,f53
cmp.ltu p7,p0=r17,r16 }
{ .mfi xma.hu f64=f35,f122,f63
add r18=r18,r17 }
{ .mfi{ .mfi cmp.ltu p7,p0=r18,r17 }
{ .mfi//-------------------------------------------------//
{ .mii cmp.ltu p7,p0=r18,carry1 }{ .mfi{ .mii mov carry1=0 } cmp.ltu p6,p0=r25,r24
add r26=r26,r25 }(p6) add carry1=1,carry1
cmp.ltu p6,p0=r26,r25
add r27=r27,r26 }(p6) add carry1=1,carry1
cmp.ltu p6,p0=r27,r26
add r27=r27,carry2 }(p6) add carry1=1,carry1
cmp.ltu p6,p0=r27,carry2 }(p6) add carry1=1,carry1 }
{ .mii mov carry2=0 } cmp.ltu p7,p0=r17,r16
add r18=r18,r17 } cmp.ltu p7,p0=r18,r17
add r19=r19,r18 } cmp.ltu p7,p0=r19,r18
add r19=r19,carry1 } (p7) add carry2=1,carry2
cmp.ltu p7,p0=r19,carry1} (p7) add carry2=1,carry2 }
{ .mii mov carry1=0 } cmp.ltu p6,p0=r25,r24
add r26=r26,r25 }(p6) add carry1=1,carry1
cmp.ltu p6,p0=r26,r25
add r26=r26,carry2 }(p6) add carry1=1,carry1
cmp.ltu p6,p0=r26,carry2 }(p6) add carry1=1,carry1 }
{ .mii mov carry2=0 } add r17=r17,carry1 }{ .mii{ .mii
{ .mii
{ .mib
.endp bn_mul_comba4#
#undef carry2
#undef carry1
#endif
#if 1
//
// BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
//
// In a nutshell, it's a port of my MIPS III/IV implementation.
//
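// The contract, as a minimal C sketch (assuming a 64-bit BN_ULONG, a
// compiler with a 128-bit integer type, and h < d so that the quotient
// fits in a single word, which is how the BN library calls it):
//
//     BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
//     {
//     unsigned __int128 n = ((unsigned __int128)h<<64)|l;
//     return (BN_ULONG)(n/d);   /* floor((h:l)/d) */
//     }
//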
#define AT r14
#define H r16
#define HH r20
#define L r17
#define D r18
#define DH r22
#define I r21
#if 0
// Some preprocessors (most notably HP-UX) appear to be allergic to
// macros enclosed in parentheses, as these three would be.
#define cont p16
#define break p0 // p20
#define equ p24
#else
cont=p16
break=p0
equ=p24
#endif
.global abort#
.global bn_div_words#
.proc bn_div_words#
.align 64
bn_div_words:
.prologue
.fframe 0
.save ar.pfs,r2
.save b0,r3
{ .mii mov r10=pr } mov r8=-1
(p6) br.ret.spnt.many b0 }
.body
{ .mii mov pr.rot=0 }
{ .mii
.L_divw_shift: // -vv- note signed comparison
{ .mfi{ .mfb(p16) br.wtop.dpnt .L_divw_shift }{ .mii sub r35=64,r36 } shr.u AT=H,r35
mov I=r36 } shl H=H,r36
(p6) br.call.spnt.clr b0=abort }{ .mfi{ .mii
{ .mii(p6) sub H=H,D }
{ .mlx
///////////////////////////////////////////////////////////
{ .mii cmp.eq p6,p7=HH,DH }(p6) setf.sig f8=AT
(p7) fcvt.xuf.s1 f6=f6
(p7) br.call.sptk b6=.L_udiv64_32_b6 }{ .mfi{ .mfi
{ .mmi
.L_divw_1st_iter:
{ .mii{ .mii (equ) cmp.leu break,cont=r35,H } (p8) add r31=-1,r31
(cont) br.wtop.spnt .L_divw_1st_iter }{ .mii shl L=L,32 }{ .mii cmp.eq p6,p7=HH,DH }(p6) setf.sig f8=AT
(p7) fcvt.xuf.s1 f6=f6
(p7) br.call.sptk b6=.L_udiv64_32_b6 }{ .mfi{ .mfi
{ .mmi
.L_divw_2nd_iter:
{ .mii{ .mii (equ) cmp.leu break,cont=r35,H } (p8) add r31=-1,r31
(cont) br.wtop.spnt .L_divw_2nd_iter }{ .mii mov ar.pfs=r2 } mov pr=r10,-1 }
{ .mfb
// Unsigned 64 by 32 (well, by 64 for the moment) bit integer division
// procedure.
//
// inputs:     f6 = (double)a, f7 = (double)b
// output:     f8 = (int)(a/b)
// clobbered:  f8,f9,f10,f11,pred
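// The body below is the usual Newton-Raphson reciprocal refinement:
// frcpa supplies an initial approximation y0 ~ 1/b, each step computes
// e = 1 - b*y and y' = y + e*y (roughly doubling the correct bits),
// and the final q = a*y is truncated to an integer.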
pred=p15
// This procedure is essentially Intel code and therefore is
// copyrighted to Intel Corporation (I suppose...). It's slightly
// modified for specific needs.
.align 32
.skip 16
.L_udiv64_32_b6:
 frcpa.s1 f8,pred=f6,f7
(pred) fnma.s1 f9=f7,f8,f1 // [5] e0 = 1 - b * y0
(pred) fmpy.s1 f10=f6,f8
(pred) fma.s1 f10=f9,f10,f10
(pred) fma.s1 f9=f11,f10,f10
(pred) fnma.s1 f10=f7,f9,f6
 fcvt.fxu.trunc.s1 f8=f8 // [30] q = trunc(q3)
 br.ret.sptk.many b6
#endif