Update to latest sse2 code from cpuminer-ng.

Con Kolivas 14 years ago
parent commit 48f07d9219

2 changed files with 310 additions and 243 deletions:
  1. sha256_sse2_amd64.c (+9, -8)
  2. x86_64/sha256_xmm_amd64.asm (+301, -235)

+9 -8   sha256_sse2_amd64.c

@@ -22,9 +22,9 @@
 #include <stdint.h>
 #include <stdio.h>
 
-extern void CalcSha256_x64(__m128i *res, __m128i *data, uint32_t init[8]);
+extern void sha256_sse2_64_new (__m128i *res, __m128i *res1, __m128i *data, const uint32_t init[8]);
 
-static uint32_t g_sha256_k[] = {
+static uint32_t g_sha256_k[]__attribute__((aligned(0x100))) = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
     0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
     0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, /*  8 */
@@ -44,10 +44,11 @@ static uint32_t g_sha256_k[] = {
 };
 
 
-static uint32_t g_sha256_hinit[8] =
+const uint32_t sha256_init[8]__attribute__((aligned(0x100))) =
 {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
 
 __m128i g_4sha256_k[64];
+__m128i sha256_consts_m128i[64]__attribute__((aligned(0x1000)));
 
 int scanhash_sse2_64(int thr_id, const unsigned char *pmidstate,
 	unsigned char *pdata,
@@ -58,7 +59,9 @@ int scanhash_sse2_64(int thr_id, const unsigned char *pmidstate,
 {
     uint32_t *nNonce_p = (uint32_t *)(pdata + 12);
     uint32_t m_midstate[8], m_w[16], m_w1[16];
-    __m128i m_4w[64], m_4hash[64], m_4hash1[64];
+    __m128i m_4w[64] __attribute__ ((aligned (0x100)));
+    __m128i m_4hash[64] __attribute__ ((aligned (0x100)));
+    __m128i m_4hash1[64] __attribute__ ((aligned (0x100)));
     __m128i offset;
     int i;
 
@@ -84,7 +87,7 @@ int scanhash_sse2_64(int thr_id, const unsigned char *pmidstate,
         m_4hash1[i] = _mm_set1_epi32(m_w1[i]);
 
     for (i = 0; i < 64; i++)
-	g_4sha256_k[i] = _mm_set1_epi32(g_sha256_k[i]);
+	sha256_consts_m128i[i] = _mm_set1_epi32(g_sha256_k[i]);
 
     offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0);
 
@@ -94,9 +97,7 @@ int scanhash_sse2_64(int thr_id, const unsigned char *pmidstate,
 
 	m_4w[3] = _mm_add_epi32(offset, _mm_set1_epi32(nonce));
 
-	/* Some optimization can be done here W.R.T. precalculating some hash */
-        CalcSha256_x64(m_4hash1, m_4w, m_midstate);
-	CalcSha256_x64(m_4hash, m_4hash1, g_sha256_hinit);
+	sha256_sse2_64_new (m_4hash, m_4hash1, m_4w, m_midstate);
 
 	for (j = 0; j < 4; j++) {
 	    mi.m = m_4hash[7];
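The C-side change collapses the old two-call flow (CalcSha256_x64 with the
midstate, then CalcSha256_x64 again with g_sha256_hinit) into a single entry
point that runs both SHA-256 compressions internally: the first-pass state is
stored to res1, and of the second pass only the final H word is written to res
(the m_4hash[7] that the loop above inspects). Each call still processes four
nonces at once, one per 32-bit SSE2 lane, which is what the offset vector sets
up. A minimal standalone sketch of that lane packing (illustrative only, not
part of the commit):

    #include <emmintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Each kernel call hashes four consecutive nonces, one per lane:
     * offset = {0,1,2,3} is broadcast-added to the current nonce,
     * exactly as m_4w[3] is built in scanhash_sse2_64 above. */
    int main(void)
    {
        uint32_t nonce  = 0x12345678;
        __m128i  offset = _mm_set_epi32(0x3, 0x2, 0x1, 0x0);
        __m128i  lanes  = _mm_add_epi32(offset, _mm_set1_epi32(nonce));

        uint32_t out[4];
        _mm_storeu_si128((__m128i *)out, lanes);
        for (int i = 0; i < 4; i++)          /* prints nonce+0 .. nonce+3 */
            printf("lane %d: 0x%08x\n", i, out[i]);
        return 0;
    }

On x86-64, SSE2 is part of the base instruction set, so this compiles with a
plain cc -O2 and no extra feature flags.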

+301 -235   x86_64/sha256_xmm_amd64.asm

@@ -1,256 +1,322 @@
-;; SHA-256 for X86-64 for Linux, based off of:
-
-; (c) Ufasoft 2011 http://ufasoft.com mailto:support@ufasoft.com
-; Version 2011
-; This software is Public Domain
-
-; Significant re-write/optimisation and reordering by,
-; Neil Kettle <mu-b@digit-labs.org>
-; ~18% performance improvement
-
-; SHA-256 CPU SSE cruncher for Bitcoin Miner
+;/*
+; * Copyright (C) 2011 - Neil Kettle <neil@digit-labs.org>
+; *
+; * This file is part of cpuminer-ng.
+; *
+; * cpuminer-ng is free software: you can redistribute it and/or modify
+; * it under the terms of the GNU General Public License as published by
+; * the Free Software Foundation, either version 3 of the License, or
+; * (at your option) any later version.
+; *
+; * cpuminer-ng is distributed in the hope that it will be useful,
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+; * GNU General Public License for more details.
+; *
+; * You should have received a copy of the GNU General Public License
+; * along with cpuminer-ng.  If not, see <http://www.gnu.org/licenses/>.
+; */
+
+; %rbp, %rbx, and %r12-%r15 - callee save
 
 ALIGN 32
 BITS 64
 
-%define hash rdi
-%define data rsi
-%define init rdx
+%define hash  rdi
+%define hash1 rsi
+%define data  rdx
+%define init  rcx
 
 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
-%define LAB_CALC_PARA	2
-%define LAB_CALC_UNROLL	8
-
-%define LAB_LOOP_UNROLL 8
-
-extern g_4sha256_k
-
-global CalcSha256_x64
-;	CalcSha256	hash(rdi), data(rsi), init(rdx)
-CalcSha256_x64:
-
-	push	rbx
-
-LAB_NEXT_NONCE:
-
-	mov	rcx, 64*4					; 256 - rcx is # of SHA-2 rounds
-	mov	rax, 16*4					; 64 - rax is where we expand to
-
-LAB_SHA:
-	push	rcx
-	lea	rcx, qword [data+rcx*4]				; + 1024
-	lea	r11, qword [data+rax*4]				; + 256
-
-LAB_CALC:
-%macro	lab_calc_blk 1
-	movdqa	xmm0, [r11-(15-%1)*16]				; xmm0 = W[I-15]
-	movdqa	xmm4, [r11-(15-(%1+1))*16]			; xmm4 = W[I-15+1]
-	movdqa	xmm2, xmm0					; xmm2 = W[I-15]
-	movdqa	xmm6, xmm4					; xmm6 = W[I-15+1]
-	psrld	xmm0, 3						; xmm0 = W[I-15] >> 3
-	psrld	xmm4, 3						; xmm4 = W[I-15+1] >> 3
-	movdqa	xmm1, xmm0					; xmm1 = W[I-15] >> 3
-	movdqa	xmm5, xmm4					; xmm5 = W[I-15+1] >> 3
-	pslld	xmm2, 14					; xmm2 = W[I-15] << 14
-	pslld	xmm6, 14					; xmm6 = W[I-15+1] << 14
-	psrld	xmm1, 4						; xmm1 = W[I-15] >> 7
-	psrld	xmm5, 4						; xmm5 = W[I-15+1] >> 7
-	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
-	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
-	psrld	xmm1, 11					; xmm1 = W[I-15] >> 18
-	psrld	xmm5, 11					; xmm5 = W[I-15+1] >> 18
-	pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
-	pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
-	pslld	xmm2, 11					; xmm2 = W[I-15] << 25
-	pslld	xmm6, 11					; xmm6 = W[I-15+1] << 25
-	pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
-	pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
-	pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
-	pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
-
-	movdqa	xmm3, [r11-(2-%1)*16]				; xmm3 = W[I-2]
-	movdqa	xmm7, [r11-(2-(%1+1))*16]			; xmm7 = W[I-2+1]
-
-	paddd	xmm0, [r11-(16-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16]
-	paddd	xmm4, [r11-(16-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1]
+%define SHA_CALC_W_PARA         2
+%define SHA_CALC_W_UNROLL       8
 
-;;;;;;;;;;;;;;;;;;
+%define SHA_ROUND_LOOP_UNROLL   16
+
+%ifidn __YASM_OBJFMT__, macho64
+extern _sha256_consts_m128i
+extern _sha256_init
+%else
+extern sha256_consts_m128i
+extern sha256_init
+%endif
+
+%ifidn __YASM_OBJFMT__, macho64
+global _sha256_sse2_64_new
+%else
+global sha256_sse2_64_new
+%endif
+
+%define sr1   xmm6
+%define sr2   xmm1
+%define sr3   xmm2
+%define sr4   xmm13
+
+%define rA    xmm7
+%define rB    xmm5
+%define rC    xmm4
+%define rD    xmm3
+%define rE    xmm0
+%define rF    xmm8
+%define rG    xmm9
+%define rH    xmm10
+
+%macro  sha_round_blk 0
+    movdqa    sr1, [data+rax]                   ; T1  =                                             w;
+    ;movdqa    sr1, xmm11
+    movdqa    sr2, rE                           ; sr2 = rE
+
+    pandn     sr2, rG                           ; sr2 = ~rE & rG
+    movdqa    sr3, rF                           ; sr3 = rF
+
+    paddd     sr1, rH                           ; T1  = h                + sha256_consts_m128i[i] + w;
+    movdqa    rH, rG                            ; rH  = rG
+
+    pand      sr3, rE                           ; sr3 = rE & rF
+    movdqa    rG, rF                            ; rG  = rF
+
+%ifidn __YASM_OBJFMT__, macho64
+    paddd     sr1, [rcx+rax]
+%else
+    paddd     sr1, sha256_consts_m128i[rax]     ; T1  =                    sha256_consts_m128i[i] + w;
+%endif
+    pxor      sr2, sr3                          ; sr2 = (rE & rF) ^ (~rE & rG) = Ch (e, f, g)
+
+    movdqa    rF, rE                            ; rF  = rE
+    paddd     sr1, sr2                          ; T1  = h + Ch (e, f, g) + sha256_consts_m128i[i] + w;
+
+    movdqa    sr2, rE                           ; sr2 = rE
+    psrld     rE, 6                 ; e >> 6
+
+    movdqa    sr3, rE               ; e >> 6
+    pslld     sr2, 7                ; e << 7
+
+    psrld     sr3, 5                ; e >> 11
+    pxor      rE, sr2               ; e >> 6 ^ e << 7
+
+    pslld     sr2, 14               ; e << 21
+    pxor      rE, sr3               ; e >> 6 ^ e << 7 ^ e >> 11
+
+    psrld     sr3, 14               ; e >> 25
+    pxor      rE, sr2               ; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21
+
+    pslld     sr2, 5                ; e << 26
+    pxor      rE, sr3               ; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21 ^ e >> 25
+
+    pxor      rE, sr2               ; e >> 6 ^ e << 7 ^ e >> 11 ^ e << 21 ^ e >> 25 ^ e << 26
+    movdqa    sr2, rB                           ; sr2 = rB
+
+    paddd     sr1, rE                           ; sr1 = h + BIGSIGMA1_256(e) + Ch (e, f, g) + sha256_consts_m128i[i] + w;
+    movdqa    rE, rD                            ; rE  = rD
+
+    movdqa    rD, rC                            ; rD  = rC
+    paddd     rE, sr1                           ; rE  = rD + T1
+
+    movdqa    sr3, rC                           ; sr3 = rC
+    pand      rC, rA                            ; rC  = rC & rA
+
+    pand      sr3, rB                           ; sr3 = rB & rC
+    pand      sr2, rA                           ; sr2 = rB & rA
+
+    pxor      sr2, rC                           ; sr2 = (rB & rA) ^ (rC & rA)
+    movdqa    rC, rB                            ; rC  = rB
 
-	movdqa	xmm2, xmm3					; xmm2 = W[I-2]
-	movdqa	xmm6, xmm7					; xmm6 = W[I-2+1]
-	psrld	xmm3, 10					; xmm3 = W[I-2] >> 10
-	psrld	xmm7, 10					; xmm7 = W[I-2+1] >> 10
-	movdqa	xmm1, xmm3					; xmm1 = W[I-2] >> 10
-	movdqa	xmm5, xmm7					; xmm5 = W[I-2+1] >> 10
-
-	paddd	xmm0, [r11-(7-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
-
-	pslld	xmm2, 13					; xmm2 = W[I-2] << 13
-	pslld	xmm6, 13					; xmm6 = W[I-2+1] << 13
-	psrld	xmm1, 7						; xmm1 = W[I-2] >> 17
-	psrld	xmm5, 7						; xmm5 = W[I-2+1] >> 17
-
-	paddd	xmm4, [r11-(7-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
-
-	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
-	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
-	psrld	xmm1, 2						; xmm1 = W[I-2] >> 19
-	psrld	xmm5, 2						; xmm5 = W[I-2+1] >> 19
-	pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
-	pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
-	pslld	xmm2, 2						; xmm2 = W[I-2] << 15
-	pslld	xmm6, 2						; xmm6 = W[I-2+1] << 15
-	pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
-	pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
-	pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
-	pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
-
-	paddd	xmm0, xmm3					; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
-	paddd	xmm4, xmm7					; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]
-	movdqa	[r11+(%1*16)], xmm0
-	movdqa	[r11+((%1+1)*16)], xmm4
+    pxor      sr2, sr3                          ; sr2 = (rB & rA) ^ (rC & rA) ^ (rB & rC)
+    movdqa    rB, rA                            ; rB  = rA
+
+    paddd     sr1, sr2                          ; sr1 = T1 + (rB & rA) ^ (rC & rA) ^ (rB & rC)
+    lea       rax, [rax+16]
+
+    movdqa    sr3, rA                           ; sr3 = rA
+    psrld     rA, 2                 ; a >> 2
+
+    pslld     sr3, 10               ; a << 10
+    movdqa    sr2, rA               ; a >> 2
+
+    pxor      rA, sr3               ; a >> 2 ^ a << 10
+    psrld     sr2, 11               ; a >> 13
+
+    pxor      rA, sr2               ; a >> 2 ^ a << 10 ^ a >> 13
+    pslld     sr3, 9                ; a << 19
+
+    pxor      rA, sr3               ; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19
+    psrld     sr2, 9                ; a >> 22
+
+    pxor      rA, sr2               ; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19 ^ a >> 22
+    pslld     sr3, 11               ; a << 30
+
+    pxor      rA, sr3               ; a >> 2 ^ a << 10 ^ a >> 13 ^ a << 19 ^ a >> 22 ^ a << 30
+    paddd     rA, sr1                           ; T1 + BIGSIGMA0_256(a) + Maj(a, b, c);
 %endmacro
 
-%assign i 0
-%rep    LAB_CALC_UNROLL
-        lab_calc_blk i
-%assign i i+LAB_CALC_PARA
-%endrep
+%macro  sha_calc_w_blk 1
+    movdqa	xmm0, [r11-(15-%1)*16]				; xmm0 = W[I-15]
+    movdqa	xmm4, [r11-(15-(%1+1))*16]			; xmm4 = W[I-15+1]
+    movdqa	xmm2, xmm0					; xmm2 = W[I-15]
+    movdqa	xmm6, xmm4					; xmm6 = W[I-15+1]
+    psrld	xmm0, 3						; xmm0 = W[I-15] >> 3
+    psrld	xmm4, 3						; xmm4 = W[I-15+1] >> 3
+    movdqa	xmm1, xmm0					; xmm1 = W[I-15] >> 3
+    movdqa	xmm5, xmm4					; xmm5 = W[I-15+1] >> 3
+    pslld	xmm2, 14					; xmm2 = W[I-15] << 14
+    pslld	xmm6, 14					; xmm6 = W[I-15+1] << 14
+    psrld	xmm1, 4						; xmm1 = W[I-15] >> 7
+    psrld	xmm5, 4						; xmm5 = W[I-15+1] >> 7
+    pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7)
+    pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7)
+    psrld	xmm1, 11					; xmm1 = W[I-15] >> 18
+    psrld	xmm5, 11					; xmm5 = W[I-15+1] >> 18
+    pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14)
+    pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14)
+    pslld	xmm2, 11					; xmm2 = W[I-15] << 25
+    pslld	xmm6, 11					; xmm6 = W[I-15+1] << 25
+    pxor	xmm0, xmm1					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18)
+    pxor	xmm4, xmm5					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18)
+    pxor	xmm0, xmm2					; xmm0 = (W[I-15] >> 3) ^ (W[I-15] >> 7) ^ (W[I-15] << 14) ^ (W[I-15] >> 18) ^ (W[I-15] << 25)
+    pxor	xmm4, xmm6					; xmm4 = (W[I-15+1] >> 3) ^ (W[I-15+1] >> 7) ^ (W[I-15+1] << 14) ^ (W[I-15+1] >> 18) ^ (W[I-15+1] << 25)
+
+    movdqa	xmm3, [r11-(2-%1)*16]				; xmm3 = W[I-2]
+    movdqa	xmm7, [r11-(2-(%1+1))*16]			; xmm7 = W[I-2+1]
+
+    paddd	xmm0, [r11-(16-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16]
+    paddd	xmm4, [r11-(16-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1]
 
-	add	r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
-	cmp	r11, rcx
-	jb	LAB_CALC
-
-	pop	rcx
-	mov	rax, 0
-
-; Load the init values of the message into the hash.
-
-	movdqa	xmm7, [init]
-	pshufd	xmm5, xmm7, 0x55		; xmm5 == b
-	pshufd	xmm4, xmm7, 0xAA		; xmm4 == c
-	pshufd	xmm3, xmm7, 0xFF		; xmm3 == d
-	pshufd	xmm7, xmm7, 0			; xmm7 == a
-
-	movdqa	xmm0, [init+4*4]
-	pshufd	xmm8, xmm0, 0x55		; xmm8 == f
-	pshufd	xmm9, xmm0, 0xAA		; xmm9 == g
-	pshufd	xmm10, xmm0, 0xFF		; xmm10 == h
-	pshufd	xmm0, xmm0, 0			; xmm0 == e
-
-LAB_LOOP:
-
-;; T t1 = h + (Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)) + ((e & f) ^ AndNot(e, g)) + Expand32<T>(g_sha256_k[j]) + w[j]
-
-%macro	lab_loop_blk 0
-	movdqa	xmm6, [data+rax*4]
-	paddd	xmm6, g_4sha256_k[rax*4]
-	add	rax, 4
-
-	paddd	xmm6, xmm10	; +h
-
-	movdqa	xmm1, xmm0
-	movdqa	xmm2, xmm9
-	pandn	xmm1, xmm2	; ~e & g
-
-	movdqa	xmm10, xmm2	; h = g
-	movdqa	xmm2, xmm8	; f
-	movdqa	xmm9, xmm2	; g = f
-
-	pand	xmm2, xmm0	; e & f
-	pxor	xmm1, xmm2	; (e & f) ^ (~e & g)
-	movdqa	xmm8, xmm0	; f = e
-
-	paddd	xmm6, xmm1	; Ch + h + w[i] + k[i]
-
-	movdqa	xmm1, xmm0
-	psrld	xmm0, 6
-	movdqa	xmm2, xmm0
-	pslld	xmm1, 7
-	psrld	xmm2, 5
-	pxor	xmm0, xmm1
-	pxor	xmm0, xmm2
-	pslld	xmm1, 14
-	psrld	xmm2, 14
-	pxor	xmm0, xmm1
-	pxor	xmm0, xmm2
-	pslld	xmm1, 5
-	pxor	xmm0, xmm1	; Rotr32(e, 6) ^ Rotr32(e, 11) ^ Rotr32(e, 25)
-	paddd	xmm6, xmm0	; xmm6 = t1
-
-	movdqa	xmm0, xmm3	; d
-	paddd	xmm0, xmm6	; e = d+t1
-
-	movdqa	xmm1, xmm5	; =b
-	movdqa	xmm3, xmm4	; d = c
-	movdqa	xmm2, xmm4	; c
-	pand	xmm2, xmm5	; b & c
-	pand	xmm4, xmm7	; a & c
-	pand	xmm1, xmm7	; a & b
-	pxor	xmm1, xmm4
-	movdqa	xmm4, xmm5	; c = b
-	movdqa	xmm5, xmm7	; b = a
-	pxor	xmm1, xmm2	; (a & c) ^ (a & d) ^ (c & d)
-	paddd	xmm6, xmm1	; t1 + ((a & c) ^ (a & d) ^ (c & d))
-
-	movdqa	xmm2, xmm7
-	psrld	xmm7, 2
-	movdqa	xmm1, xmm7
-	pslld	xmm2, 10
-	psrld	xmm1, 11
-	pxor	xmm7, xmm2
-	pxor	xmm7, xmm1
-	pslld	xmm2, 9
-	psrld	xmm1, 9
-	pxor	xmm7, xmm2
-	pxor	xmm7, xmm1
-	pslld	xmm2, 11
-	pxor	xmm7, xmm2
-	paddd	xmm7, xmm6	; a = t1 + (Rotr32(a, 2) ^ Rotr32(a, 13) ^ Rotr32(a, 22)) + ((a & c) ^ (a & d) ^ (c & d));
+;;;;;;;;;;;;;;;;;;
+
+    movdqa	xmm2, xmm3					; xmm2 = W[I-2]
+    movdqa	xmm6, xmm7					; xmm6 = W[I-2+1]
+    psrld	xmm3, 10					; xmm3 = W[I-2] >> 10
+    psrld	xmm7, 10					; xmm7 = W[I-2+1] >> 10
+    movdqa	xmm1, xmm3					; xmm1 = W[I-2] >> 10
+    movdqa	xmm5, xmm7					; xmm5 = W[I-2+1] >> 10
+
+    paddd	xmm0, [r11-(7-%1)*16]				; xmm0 = s0(W[I-15]) + W[I-16] + W[I-7]
+
+    pslld	xmm2, 13					; xmm2 = W[I-2] << 13
+    pslld	xmm6, 13					; xmm6 = W[I-2+1] << 13
+    psrld	xmm1, 7						; xmm1 = W[I-2] >> 17
+    psrld	xmm5, 7						; xmm5 = W[I-2+1] >> 17
+
+    paddd	xmm4, [r11-(7-(%1+1))*16]			; xmm4 = s0(W[I-15+1]) + W[I-16+1] + W[I-7+1]
+
+    pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17)
+    pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17)
+    psrld	xmm1, 2						; xmm1 = W[I-2] >> 19
+    psrld	xmm5, 2						; xmm5 = W[I-2+1] >> 19
+    pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13)
+    pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13)
+    pslld	xmm2, 2						; xmm2 = W[I-2] << 15
+    pslld	xmm6, 2						; xmm6 = W[I-2+1] << 15
+    pxor	xmm3, xmm1					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19)
+    pxor	xmm7, xmm5					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19)
+    pxor	xmm3, xmm2					; xmm3 = (W[I-2] >> 10) ^ (W[I-2] >> 17) ^ (W[I-2] << 13) ^ (W[I-2] >> 19) ^ (W[I-2] << 15)
+    pxor	xmm7, xmm6					; xmm7 = (W[I-2+1] >> 10) ^ (W[I-2+1] >> 17) ^ (W[I-2+1] << 13) ^ (W[I-2+1] >> 19) ^ (W[I-2+1] << 15)
+
+    paddd	xmm0, xmm3					; xmm0 = s0(W[I-15]) + W[I-16] + s1(W[I-2]) + W[I-7]
+    paddd	xmm4, xmm7					; xmm4 = s0(W[I-15+1]) + W[I-16+1] + s1(W[I-2+1]) + W[I-7+1]
+    movdqa	[r11+(%1*16)], xmm0
+    movdqa	[r11+((%1+1)*16)], xmm4
 %endmacro
 
+; _sha256_sse2_64_new hash(rdi), hash1(rsi), data(rdx), init(rcx),
+
+%ifidn __YASM_OBJFMT__, macho64
+_sha256_sse2_64_new:
+%else
+sha256_sse2_64_new:
+%endif
+
+    push        rbx
+
+%macro  SHA_256  0
+    mov         rbx, 64*4   ; rbx is # of SHA-2 rounds
+    mov         rax, 16*4   ; rax is where we expand to
+
+    push        rbx
+    lea         rbx, qword [data+rbx*4]
+    lea         r11, qword [data+rax*4]
+
+%%SHA_CALC_W:
+%assign i 0
+%rep    SHA_CALC_W_UNROLL
+        sha_calc_w_blk i
+%assign i i+SHA_CALC_W_PARA
+%endrep
+    add       r11, SHA_CALC_W_UNROLL*SHA_CALC_W_PARA*16
+    cmp       r11, rbx
+    jb        %%SHA_CALC_W
+
+    pop       rbx
+    mov       rax, 0
+    lea       rbx, [rbx*4]
+
+    movdqa    rA, [init]
+    pshufd    rB, rA, 0x55          ; rB == B
+    pshufd    rC, rA, 0xAA          ; rC == C
+    pshufd    rD, rA, 0xFF          ; rD == D
+    pshufd    rA, rA, 0             ; rA == A
+
+    movdqa    rE, [init+4*4]
+    pshufd    rF, rE, 0x55          ; rF == F
+    pshufd    rG, rE, 0xAA          ; rG == G
+    pshufd    rH, rE, 0xFF          ; rH == H
+    pshufd    rE, rE, 0             ; rE == E
+
+%ifidn __YASM_OBJFMT__, macho64
+    lea       rcx, [_sha256_consts_m128i wrt rip]
+%endif
+
+%%SHAROUND_LOOP:
 %assign i 0
-%rep    LAB_LOOP_UNROLL
-        lab_loop_blk
+%rep    SHA_ROUND_LOOP_UNROLL
+        sha_round_blk
 %assign i i+1
 %endrep
-
-	cmp	rax, rcx
-	jb	LAB_LOOP
+    cmp   rax, rbx
+    jb    %%SHAROUND_LOOP
 
 ; Finished the 64 rounds, calculate hash and save
 
-	movdqa	xmm1, [rdx]
-	pshufd	xmm2, xmm1, 0x55
-	pshufd	xmm6, xmm1, 0xAA
-	pshufd	xmm11, xmm1, 0xFF
-	pshufd	xmm1, xmm1, 0
-
-	paddd	xmm5, xmm2
-	paddd	xmm4, xmm6
-	paddd	xmm3, xmm11
-	paddd	xmm7, xmm1
-
-	movdqa	xmm1, [rdx+4*4]
-	pshufd	xmm2, xmm1, 0x55
-	pshufd	xmm6, xmm1, 0xAA
-	pshufd	xmm11, xmm1, 0xFF
-	pshufd	xmm1, xmm1, 0
-
-	paddd	xmm8, xmm2
-	paddd	xmm9, xmm6
-	paddd	xmm10, xmm11
-	paddd	xmm0, xmm1
-
-	movdqa	[hash+0*16], xmm7
-	movdqa	[hash+1*16], xmm5
-	movdqa	[hash+2*16], xmm4
-	movdqa	[hash+3*16], xmm3
-	movdqa	[hash+4*16], xmm0
-	movdqa	[hash+5*16], xmm8
-	movdqa	[hash+6*16], xmm9
-	movdqa	[hash+7*16], xmm10
+    movdqa    sr1, [init]
+    pshufd    sr2, sr1, 0x55
+    pshufd    sr3, sr1, 0xAA
+    pshufd    sr4, sr1, 0xFF
+    pshufd    sr1, sr1, 0
+
+    paddd     rB, sr2
+    paddd     rC, sr3
+    paddd     rD, sr4
+    paddd     rA, sr1
+
+    movdqa    sr1, [init+4*4]
+    pshufd    sr2, sr1, 0x55
+    pshufd    sr3, sr1, 0xAA
+    pshufd    sr4, sr1, 0xFF
+    pshufd    sr1, sr1, 0
+
+    paddd     rF, sr2
+    paddd     rG, sr3
+    paddd     rH, sr4
+    paddd     rE, sr1
+%endmacro
+
+    SHA_256
+    movdqa    [hash1+0*16], rA
+    movdqa    [hash1+1*16], rB
+    movdqa    [hash1+2*16], rC
+    movdqa    [hash1+3*16], rD
+    movdqa    [hash1+4*16], rE
+    movdqa    [hash1+5*16], rF
+    movdqa    [hash1+6*16], rG
+    movdqa    [hash1+7*16], rH
+
+    mov       data, hash1
+    mov       init, sha256_init
+
+    SHA_256
+
+    movdqa    [hash+7*16], rH
 
 LAB_RET:
-	pop	rbx
-	ret
+    pop       rbx
+    ret
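
SSE2 has no packed-rotate instruction, so the round code above synthesizes
every ROTR from a shift pair and folds the pieces together with PXOR (the
bits of x >> n and x << (32 - n) are disjoint, so OR and XOR agree). A scalar
sanity check of the exact shift sequences used for BIGSIGMA1_256(e) and
BIGSIGMA0_256(a) above (standalone, not part of the commit):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t rotr(uint32_t x, int n)
    {
        return (x >> n) | (x << (32 - n));
    }

    int main(void)
    {
        uint32_t x = 0xdeadbeef;

        /* BIGSIGMA1_256(e) = ROTR(e,6) ^ ROTR(e,11) ^ ROTR(e,25) */
        uint32_t s1_ref = rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25);
        uint32_t s1_shf = (x >> 6) ^ (x << 7) ^ (x >> 11)
                        ^ (x << 21) ^ (x >> 25) ^ (x << 26);

        /* BIGSIGMA0_256(a) = ROTR(a,2) ^ ROTR(a,13) ^ ROTR(a,22) */
        uint32_t s0_ref = rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22);
        uint32_t s0_shf = (x >> 2) ^ (x << 10) ^ (x >> 13)
                        ^ (x << 19) ^ (x >> 22) ^ (x << 30);

        printf("sigma1: 0x%08x 0x%08x %s\n", s1_ref, s1_shf,
               s1_ref == s1_shf ? "match" : "MISMATCH");
        printf("sigma0: 0x%08x 0x%08x %s\n", s0_ref, s0_shf,
               s0_ref == s0_shf ? "match" : "MISMATCH");
        return 0;
    }

The incremental form in the assembly reuses each intermediate: for example
sr2 holds e << 7, is shifted by a further 14 to give e << 21, then by 5 more
to give e << 26, so each new term costs one shift instead of a fresh copy
plus a shift.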