# First command-line argument: forwarded as-is to the translator script.
$output=shift;

# Directory part of this script's own path (including the trailing
# separator).  Use an empty prefix when $0 has no directory component
# instead of reading $1 after a failed match.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# Look for the translator next to this script first, then in the shared
# perlasm directory two levels up.
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The interpreter and script paths are quoted so that directories containing
# spaces survive the shell, and a failure to start the pipe is fatal rather
# than silently discarding the generated code.
$xlate_cmd = "| \"$^X\" \"$xlate\"";
$xlate_cmd .= " \"$output\"" if defined $output;
open STDOUT,$xlate_cmd or die "can't call $xlate: $!";
# Register allocation used by the assembly template below.
#
# The first group names the six incoming arguments; these are the integer
# argument registers of the System V AMD64 calling convention, in order:
#   $rp  - destination vector (stored to in the .Lsub/.Lcopy loops)
#   $ap  - first source vector
#   $bp  - second source vector (as passed in; rebound below)
#   $np  - vector multiplied by the per-row factor m1 (presumably the modulus)
#   $n0  - pointer; its first word is loaded and used to derive m1
#   $num - number of 64-bit words in each vector
($rp, $ap, $bp, $np, $n0, $num) =
    ("%rdi", "%rsi", "%rdx", "%rcx", "%r8", "%r9");

# Scratch registers holding the running low/high product words.
($lo0, $hi0) = ("%r10", "%r11");

# $bp is deliberately rebound to %r12: the template copies %rdx into it in
# the prologue, because mulq overwrites %rdx with the high half of every
# product.
$bp = "%r12";

# Second carry word, outer/inner loop counters, and the two multipliers.
($hi1, $i, $j, $m0, $m1) = ("%r13", "%r14", "%r15", "%rbx", "%rbp");
# Assembly template for bn_mul_mont (word-serial Montgomery multiplication,
# per the .asciz banner at the end).  The text is AT&T syntax and is printed
# to the x86_64-xlate.pl pipe by the code that follows.
#
# Layout of the routine:
#   prologue  - save registers, carve a (num+2)-word temporary tp[] out of
#               the stack, remember the original %rsp in tp[num+1]
#   .L1st     - first row:  tp = ap[]*bp[0] + np[]*m1, shifted down one word
#   .Louter / .Linner
#             - rows 1..num-1: tp = (tp + ap[]*bp[i] + np[]*m1) >> 64
#   .Lsub     - rp = tp - np, keeping the final borrow
#   .Lcopy    - pick tp (if the subtraction borrowed) or rp, copy it to rp
#               and overwrite tp
#   epilogue  - restore %rsp and saved registers, return 1
#
# Every assembler statement must sit on its own line; labels are kept on
# separate lines as well.  Comments inside the heredoc must not contain
# Perl sigils, because the heredoc interpolates.
$code=<<___;
.text

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	push	%rbx			# save every register this routine overwrites
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	${num}d,${num}d		# 32-bit move zero-extends the word count
	lea	2($num),%rax
	mov	%rsp,%rbp		# remember the incoming stack pointer
	neg	%rax
	lea	(%rsp,%rax,8),%rsp	# reserve num+2 qwords for tp[]
	and	\$-1024,%rsp		# round tp down to a 1024-byte boundary

	mov	%rbp,8(%rsp,$num,8)	# tp[num+1] = original stack pointer
	mov	%rdx,$bp		# move bp out of rdx, which mulq overwrites

	mov	($n0),$n0		# load the word n0 points at

	xor	$i,$i			# i = 0
	xor	$j,$j			# j = 0

	mov	($bp),$m0		# m0 = bp[0]
	mov	($ap),%rax
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# low word times n0 ...
	mov	%rax,$m1		# ... gives the reduction multiplier m1

	mulq	($np)			# np[0]*m1
	add	$lo0,%rax		# low word is discarded, only the carry matters
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j = 1
.L1st:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[0]
	add	$hi0,%rax
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	($np,$j,8),%rax
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++ (lea leaves the carry flag alone)
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1 + ap[j]*bp[0]
	adc	\$0,%rdx
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1], with j already incremented
	cmp	$num,$j
	mov	%rdx,$hi1
	jl	.L1st

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,(%rsp,$num,8)	# tp[num] = topmost carry bit

	lea	1($i),$i		# i = 1
.align	4
.Louter:
	xor	$j,$j			# j = 0

	mov	($bp,$i,8),$m0		# m0 = bp[i]
	mov	($ap),%rax
	mulq	$m0			# ap[0]*bp[i]
	add	(%rsp),%rax		# + tp[0]
	adc	\$0,%rdx
	mov	%rax,$lo0
	mov	%rdx,$hi0

	imulq	$n0,%rax		# low word times n0 ...
	mov	%rax,$m1		# ... gives m1 for this row

	mulq	($np,$j,8)		# np[0]*m1
	add	$lo0,%rax		# low word is discarded, only the carry matters
	mov	8(%rsp),$lo0		# preload tp[1]
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j = 1
.align	4
.Linner:
	mov	($ap,$j,8),%rax
	mulq	$m0			# ap[j]*bp[i]
	add	$hi0,%rax
	adc	\$0,%rdx
	add	%rax,$lo0		# ap[j]*bp[i] + tp[j]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi0

	mulq	$m1			# np[j]*m1
	add	$hi1,%rax
	lea	1($j),$j		# j++ (lea leaves the carry flag alone)
	adc	\$0,%rdx
	add	$lo0,%rax		# np[j]*m1 + ap[j]*bp[i] + tp[j]
	adc	\$0,%rdx
	mov	(%rsp,$j,8),$lo0	# preload the next tp word
	cmp	$num,$j
	mov	%rax,-16(%rsp,$j,8)	# tp[j-1], with j already incremented
	mov	%rdx,$hi1
	jl	.Linner

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# fold in the previous topmost carry, tp[num]
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)	# tp[num-1]
	mov	%rdx,(%rsp,$num,8)	# tp[num] = topmost carry bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jl	.Louter

	lea	(%rsp),$ap		# reuse the ap register as a pointer to tp
	lea	-1($num),$j		# j = num-1, loop counter for the subtraction

	mov	($ap),%rax		# tp[0]
	xor	$i,$i			# i = 0, and the carry flag is cleared
	jmp	.Lsub
.align	16
.Lsub:
	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i] = tp[i] - np[i] - borrow
	dec	$j			# dec does not modify the carry flag
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	jge	.Lsub			# branches on the flags left by dec

	sbb	\$0,%rax		# tp[num] - borrow: all ones if tp < np, else zero
	and	%rax,$ap		# tp pointer if borrowed, otherwise zero
	not	%rax
	mov	$rp,$np
	and	%rax,$np		# rp pointer if not borrowed, otherwise zero
	lea	-1($num),$j
	or	$np,$ap			# source = borrow ? tp : rp
.align	16
.Lcopy:
	mov	($ap,$j,8),%rax
	mov	%rax,($rp,$j,8)		# rp[j] = source[j]
	mov	$i,(%rsp,$j,8)		# overwrite tp[j] so the temporary is not left behind
	dec	$j
	jge	.Lcopy

	mov	8(%rsp,$num,8),%rsp	# restore the stack pointer saved in tp[num+1]
	mov	\$1,%rax		# return value 1
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	bn_mul_mont,.-bn_mul_mont
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
# Send the generated assembly down the STDOUT pipe.  Output is buffered, so
# a write failure or a non-zero exit from the child process may only become
# visible when the pipe is closed; both calls are checked so that a broken
# translation is reported instead of producing a truncated file.
print $code or die "error writing generated code: $!";
close STDOUT or die "error closing STDOUT: " . ($! ? $! : "child exit status $?");