/*
This file provides x86_64/i386 hand implementation of the following function
void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks) which is a C function in sha2.c (from xnu).
The code 1st probes cpu_capabilities to detect whether ssse3 is supported. If not, it branches to
SHA256_Transform_nossse3 (in a separate source file sha256nossse3.s) that was cloned from this file
with all ssse3 instructions replaced with sse3 or below instructions.
sha256 algorithm per block description:
1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
2. load 8 digests a-h from ctx->state
3. for r = 0:15
T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r] h = T1 + Sigma0(a) + Maj(a,b,c)
permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
4. for r = 16:63
W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]) d += T1 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
In the assembly implementation:
- a circular window of message schedule W(r:r+15) is updated and stored in xmm0-xmm3
- its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
- the 8 digests (a-h) will be stored in GPR or m32 (all in GPR for x86_64, and some in m32 for i386)
the implementation per block looks like
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K(0:15) in stack
load digests a-h from ctx->state for (r=0 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
for (r=48 }
ctx->states += digests a-h ----------------------------------------------------------------------------
our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
into the last 16 rounds of its previous block:
----------------------------------------------------------------------------
load W(0:15) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K(0:15) in stack
L_loop:
load digests a-h from ctx->state for (r=0 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
}
num_block--;
if (num_block==0) jmp L_last_block for (r=48 load W([r:r+3]%16) (big-endian per 4 bytes) into xmm0:xmm3
pre_calculate and store W+K([r:r+3]%16) in stack
}
ctx->states += digests a-h jmp L_loopL_last_block:
for (r=48 }
ctx->states += digests a-h ------------------------------------------------------------------------
Apple CoreOS vector & numerics
cclee 8-3-10
*/
#if defined KERNEL
#include <i386/cpu_capabilities.h>
#else
#include <System/i386/cpu_capabilities.h>
#endif
// associate variables with registers or memory
#if defined (__x86_64__)
#define sp %rsp
#define ctx %rdi
#define data %rsi
#define num_blocks %rdx
#define a %r8d
#define b %r9d
#define c %r10d
#define d %r11d
#define e %r12d
#define f %r13d
#define g %r14d
#define h %r15d
#define K %rbx
#define stack_size (8+16*8+16+64) // 8 (align) + xmm0:xmm7 + L_aligned_bswap + WK(0:15)
#define L_aligned_bswap 64(sp) // bswap : big-endian loading of 4-byte words
#define xmm_save 80(sp) // starting address for xmm save/restore
#else
#define sp %esp
#define stack_size (12+16*8+16+16+64) // 12 (align) + xmm0:xmm7 + 16 (c,f,h,K) + L_aligned_bswap + WK(0:15)
#define ctx_addr 20+stack_size(sp) // ret_addr + 4 registers = 20, 1st caller argument
#define data_addr 24+stack_size(sp) // 2nd caller argument
#define num_blocks 28+stack_size(sp) // 3rd caller argument
#define a %ebx
#define b %edx
#define c 64(sp)
#define d %ebp
#define e %esi
#define f 68(sp)
#define g %edi
#define h 72(sp)
#define K 76(sp) // pointer to K256[] table
#define L_aligned_bswap 80(sp) // bswap : big-endian loading of 4-byte words
#define xmm_save 96(sp) // starting address for xmm save/restore
#endif
// 2 local variables
#define t %eax
#define s %ecx
// a window (16 words) of message scheule
#define W0 %xmm0
#define W1 %xmm1
#define W2 %xmm2
#define W3 %xmm3
// circular buffer for WK[(r:r+15)%16]
#define WK(x) (x&15)*4(sp)
// #define Ch(x,y,z) (((x) & (y)) ^ ((~(x)) & (z)))
.macro Ch
mov $0, t // x
mov $0, s // x
not t // ~x
and $1, s // x & y
and $2, t // ~x & z
xor s, t // t = ((x) & (y)) ^ ((~(x)) & (z))
// #define Maj(x,y,z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.macro Maj
mov $0, t // x
mov $1, s // y
and s, t // x&y
and $2, s // y&z
xor s, t // (x&y) ^ (y&z)
mov $2, s // z
and $0, s // (x&z)
xor s, t // t = (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
.endm
/* Shift-right (used in SHA-256, SHA-384, and SHA-512): */
// #define R(b,x) ((x) >> (b))
/* 32-bit Rotate-right (used in SHA-256): */
// #define S32(b,x) (((x) >> (b)) | ((x) << (32 - (b))))
// #define sigma0_256(x) (S32(7, (x)) ^ S32(18, (x)) ^ R(3 , (x)))
// performs sigma0_256 on 4 words on an xmm registers
// use xmm6/xmm7 as intermediate registers
.macro sigma0
movdqa $0, %xmm6
movdqa $0, %xmm7
psrld $$3, $0 // SHR3(x)
psrld $$7, %xmm6 // part of ROTR7
pslld $$14, %xmm7 // part of ROTR18
pxor %xmm6, $0
pxor %xmm7, $0
psrld $$11, %xmm6 // part of ROTR18
pslld $$11, %xmm7 // part of ROTR7
pxor %xmm6, $0
pxor %xmm7, $0
.endm
// #define sigma1_256(x) (S32(17, (x)) ^ S32(19, (x)) ^ R(10, (x)))
// performs sigma1_256 on 4 words on an xmm registers
// use xmm6/xmm7 as intermediate registers
.macro sigma1
movdqa $0, %xmm6
movdqa $0, %xmm7
psrld $$10, $0 // SHR10(x)
psrld $$17, %xmm6 // part of ROTR17
pxor %xmm6, $0
pslld $$13, %xmm7 // part of ROTR19
pxor %xmm7, $0
psrld $$2, %xmm6 // part of ROTR19
pxor %xmm6, $0
pslld $$2, %xmm7 // part of ROTR17
pxor %xmm7, $0
.endm
// #define Sigma0_256(x) (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.macro Sigma0
mov $0, t // x
mov $0, s // x
ror $$2, t // S32(2, (x))
ror $$13, s // S32(13, (x))
xor s, t // S32(2, (x)) ^ S32(13, (x))
ror $$9, s // S32(22, (x))
xor s, t // t = (S32(2, (x)) ^ S32(13, (x)) ^ S32(22, (x)))
.endm
// #define Sigma1_256(x) (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.macro Sigma1
mov $0, s // x
ror $$6, s // S32(6, (x))
mov s, t // S32(6, (x))
ror $$5, s // S32(11, (x))
xor s, t // S32(6, (x)) ^ S32(11, (x))
ror $$14, s // S32(25, (x))
xor s, t // t = (S32(6, (x)) ^ S32(11, (x)) ^ S32(25, (x)))
.endm
// per round digests update
.macro round
Sigma1 $4 // t = T1
add t, $7 // use h to store h+Sigma1(e)
Ch $4, $5, $6 // t = Ch (e, f, g) add WK($8), t // h = T1
add t, $3 // d += T1 Sigma0 $0 // t = Sigma0(a) Maj $0, $1, $2 // t = Maj(a,b,c)
add t, $7 // h = T1 + Sigma0(a) + Maj(a,b,c)
// per 4 rounds digests update and permutation
// permutation is absorbed by rotating the roles of digests a-h
.macro rounds
round $0, $1, $2, $3, $4, $5, $6, $7, 0+$8
round $7, $0, $1, $2, $3, $4, $5, $6, 1+$8
round $6, $7, $0, $1, $2, $3, $4, $5, 2+$8
round $5, $6, $7, $0, $1, $2, $3, $4, 3+$8
.endm
// update the message schedule W and W+K (4 rounds) 16 rounds ahead in the future
.macro message_schedule
// 4 32-bit K256 words in xmm5
#if defined (__x86_64__)
movdqu (K), %xmm5
#else
mov K, t
movdqu (t), %xmm5
#endif
add $$16, K // K points to next K256 word for next iteration
movdqa $1, %xmm4 // W7:W4
palignr $$4, $0, %xmm4 // W4:W1
sigma0 %xmm4 // sigma0(W4:W1)
movdqa $3, %xmm6 // W15:W12
paddd %xmm4, $0 // $0 = W3:W0 + sigma0(W4:W1)
palignr $$4, $2, %xmm6 // W12:W9
paddd %xmm6, $0 // $0 = W12:W9 + sigma0(W4:W1) + W3:W0
movdqa $3, %xmm4 // W15:W12
psrldq $$8, %xmm4 // 0,0,W15,W14
sigma1 %xmm4 // sigma1(0,0,W15,W14)
paddd %xmm4, $0 // sigma1(0,0,W15,W14) + W12:W9 + sigma0(W4:W1) + W3:W0
movdqa $0, %xmm4 // W19-sigma1(W17), W18-sigma1(W16), W17, W16
pslldq $$8, %xmm4 // W17, W16, 0, 0
sigma1 %xmm4 // sigma1(W17,W16,0,0)
paddd %xmm4, $0 // W19:W16
paddd $0, %xmm5 // WK
movdqa %xmm5, WK($4)
.endm
// this macro is used in the last 16 rounds of a current block
// it reads the next message (16 4-byte words), load it into 4 words W[r:r+3], computes WK[r:r+3]
// and save into stack to prepare for next block
.macro update_W_WK
#if defined (__x86_64__)
movdqu $0*16(data), $1 // read 4 4-byte words
pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
movdqu $0*16(K), %xmm4 // K[r:r+3]
#else
mov data_addr, t
movdqu $0*16(t), $1 // read 4 4-byte words
pshufb L_aligned_bswap, $1 // big-endian of each 4-byte word, W[r:r+3]
mov K, t
movdqu $0*16(t), %xmm4 // K[r:r+3]
#endif
paddd $1, %xmm4 // WK[r:r+3]
movdqa %xmm4, WK($0*4) // save WK[r:r+3] into stack circular buffer
.endm
.text
#if defined (__x86_64__) || defined (__i386__)
.globl _SHA256_Transform
_SHA256_Transform:
// detect SSSE3 and dispatch appropriate code branch
#if defined __x86_64__
movq __cpu_capabilities@GOTPCREL(%rip), %rax // %rax -> __cpu_capabilities
mov (%rax), %eax // %eax = __cpu_capabilities
#else // i386
#if defined KERNEL
leal __cpu_capabilities, %eax // %eax -> __cpu_capabilities
mov (%eax), %eax // %eax = __cpu_capabilities
#else
mov _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
#endif
test $(kHasSupplementalSSE3), %eax
je _SHA256_Transform_nossse3 // branch to no-ssse3 code
// push callee-saved registers
#if defined (__x86_64__)
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
#else
push %ebp
push %ebx
push %esi
push %edi
#endif
// allocate stack space
sub $stack_size, sp
// if kernel code, save used xmm registers
#if KERNEL
movdqa %xmm0, 0*16+xmm_save
movdqa %xmm1, 1*16+xmm_save
movdqa %xmm2, 2*16+xmm_save
movdqa %xmm3, 3*16+xmm_save
movdqa %xmm4, 4*16+xmm_save
movdqa %xmm5, 5*16+xmm_save
movdqa %xmm6, 6*16+xmm_save
movdqa %xmm7, 7*16+xmm_save
#endif
// set up bswap parameters in the aligned stack space and pointer to table K256[]
#if defined (__x86_64__)
lea _K256(%rip), K
lea L_bswap(%rip), %rax
movdqa (%rax), %xmm0
#else
lea _K256, t
mov t, K
lea L_bswap, %eax
movdqa (%eax), %xmm0
#endif
movdqa %xmm0, L_aligned_bswap
// load W[0:15] into xmm0-xmm3
#if defined (__x86_64__)
movdqu 0*16(data), W0
movdqu 1*16(data), W1
movdqu 2*16(data), W2
movdqu 3*16(data), W3
add $64, data
#else
mov data_addr, t
movdqu 0*16(t), W0
movdqu 1*16(t), W1
movdqu 2*16(t), W2
movdqu 3*16(t), W3
add $64, data_addr
#endif
pshufb L_aligned_bswap, W0
pshufb L_aligned_bswap, W1
pshufb L_aligned_bswap, W2
pshufb L_aligned_bswap, W3
// compute WK[0:15] and save in stack
#if defined (__x86_64__)
movdqu 0*16(K), %xmm4
movdqu 1*16(K), %xmm5
movdqu 2*16(K), %xmm6
movdqu 3*16(K), %xmm7
#else
mov K, t
movdqu 0*16(t), %xmm4
movdqu 1*16(t), %xmm5
movdqu 2*16(t), %xmm6
movdqu 3*16(t), %xmm7
#endif
add $64, K
paddd %xmm0, %xmm4
paddd %xmm1, %xmm5
paddd %xmm2, %xmm6
paddd %xmm3, %xmm7
movdqa %xmm4, WK(0)
movdqa %xmm5, WK(4)
movdqa %xmm6, WK(8)
movdqa %xmm7, WK(12)
L_loop:
// digests a-h = ctx->states mov 0*4(ctx), a
mov 1*4(ctx), b
mov 2*4(ctx), c
mov 3*4(ctx), d
mov 4*4(ctx), e
mov 5*4(ctx), f
mov 6*4(ctx), g
mov 7*4(ctx), h
#else
mov ctx_addr, t
mov 0*4(t), a
mov 1*4(t), b
mov 2*4(t), s
mov s, c
mov 3*4(t), d
mov 4*4(t), e
mov 5*4(t), s
mov s, f
mov 6*4(t), g
mov 7*4(t), s
mov s, h
#endif
// rounds 0:47 interleaved with W/WK update for rounds 16:63
rounds a, b, c, d, e, f, g, h, 0
message_schedule W0,W1,W2,W3,16
rounds e, f, g, h, a, b, c, d, 4
message_schedule W1,W2,W3,W0,20
rounds a, b, c, d, e, f, g, h, 8
message_schedule W2,W3,W0,W1,24
rounds e, f, g, h, a, b, c, d, 12
message_schedule W3,W0,W1,W2,28
rounds a, b, c, d, e, f, g, h, 16
message_schedule W0,W1,W2,W3,32
rounds e, f, g, h, a, b, c, d, 20
message_schedule W1,W2,W3,W0,36
rounds a, b, c, d, e, f, g, h, 24
message_schedule W2,W3,W0,W1,40
rounds e, f, g, h, a, b, c, d, 28
message_schedule W3,W0,W1,W2,44
rounds a, b, c, d, e, f, g, h, 32
message_schedule W0,W1,W2,W3,48
rounds e, f, g, h, a, b, c, d, 36
message_schedule W1,W2,W3,W0,52
rounds a, b, c, d, e, f, g, h, 40
message_schedule W2,W3,W0,W1,56
rounds e, f, g, h, a, b, c, d, 44
message_schedule W3,W0,W1,W2,60
// revert K to the beginning of K256[]
#if defined __x86_64__
sub $256, K
#else
subl $256, K
#endif
sub $1, num_blocks // num_blocks--
je L_final_block // if final block, wrap up final rounds
// rounds 48:63 interleaved with W/WK initialization for next block rounds 0:15
rounds a, b, c, d, e, f, g, h, 48
update_W_WK 0, W0
rounds e, f, g, h, a, b, c, d, 52
update_W_WK 1, W1
rounds a, b, c, d, e, f, g, h, 56
update_W_WK 2, W2
rounds e, f, g, h, a, b, c, d, 60
update_W_WK 3, W3
add $64, K
#if defined (__x86_64__)
add $64, data
#else
add $64, data_addr
#endif
// ctx->states += digests a-h
#if defined (__x86_64__)
add a, 0*4(ctx)
add b, 1*4(ctx)
add c, 2*4(ctx)
add d, 3*4(ctx)
add e, 4*4(ctx)
add f, 5*4(ctx)
add g, 6*4(ctx)
add h, 7*4(ctx)
#else
mov ctx_addr, t
add a, 0*4(t)
add b, 1*4(t)
mov c, s
add s, 2*4(t)
add d, 3*4(t)
add e, 4*4(t)
mov f, s
add s, 5*4(t)
add g, 6*4(t)
mov h, s
add s, 7*4(t)
#endif
jmp L_loop // branch for next block
// wrap up digest update round 48:63 for final block
L_final_block:
rounds a, b, c, d, e, f, g, h, 48
rounds e, f, g, h, a, b, c, d, 52
rounds a, b, c, d, e, f, g, h, 56
rounds e, f, g, h, a, b, c, d, 60
// ctx->states += digests a-h
#if defined (__x86_64__)
add a, 0*4(ctx)
add b, 1*4(ctx)
add c, 2*4(ctx)
add d, 3*4(ctx)
add e, 4*4(ctx)
add f, 5*4(ctx)
add g, 6*4(ctx)
add h, 7*4(ctx)
#else
mov ctx_addr, t
add a, 0*4(t)
add b, 1*4(t)
mov c, s
add s, 2*4(t)
add d, 3*4(t)
add e, 4*4(t)
mov f, s
add s, 5*4(t)
add g, 6*4(t)
mov h, s
add s, 7*4(t)
#endif
// if kernel, restore xmm0-xmm7
#if KERNEL
movdqa 0*16+xmm_save, %xmm0
movdqa 1*16+xmm_save, %xmm1
movdqa 2*16+xmm_save, %xmm2
movdqa 3*16+xmm_save, %xmm3
movdqa 4*16+xmm_save, %xmm4
movdqa 5*16+xmm_save, %xmm5
movdqa 6*16+xmm_save, %xmm6
movdqa 7*16+xmm_save, %xmm7
#endif
// free allocated stack memory
add $stack_size, sp
// restore callee-saved registers
#if defined (__x86_64__)
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
#else
pop %edi
pop %esi
pop %ebx
pop %ebp
#endif
// return
ret
.const
.align 4, 0x90
L_bswap:
.long 0x00010203
.long 0x04050607
.long 0x08090a0b
.long 0x0c0d0e0f
#endif // x86_64/i386