Browse Source

Compile CPU mining for win32 and win64

James Z.M. Gao 13 years ago
parent
commit
c9ae715019
22 changed files with 225 additions and 50 deletions
  1. 2 0
      .gitignore
  2. 3 7
      Makefile.am
  3. 35 0
      autogen-win32.sh
  4. 36 0
      autogen-win64.sh
  5. 11 6
      cgminer.c
  6. 10 2
      compat.h
  7. 35 6
      configure.ac
  8. 5 5
      driver-bitforce.c
  9. 5 3
      driver-cpu.c
  10. 1 1
      driver-cpu.h
  11. 1 1
      driver-icarus.c
  12. 1 1
      driver-ztex.c
  13. 5 0
      elist.h
  14. 3 2
      fpgautils.c
  15. 2 0
      lib/signal.in.h
  16. 2 1
      logging.c
  17. 2 1
      miner.h
  18. 1 1
      x86_32/Makefile.am
  19. 4 4
      x86_32/sha256_xmm.asm
  20. 1 1
      x86_64/Makefile.am
  21. 35 8
      x86_64/sha256_sse4_amd64.asm
  22. 25 0
      x86_64/sha256_xmm_amd64.asm

+ 2 - 0
.gitignore

@@ -41,3 +41,5 @@ lib/string.h
 lib/warn-on-use.h
 
 mkinstalldirs
+
+*.swp

+ 3 - 7
Makefile.am

@@ -19,7 +19,7 @@ INCLUDES	= $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
 
 bin_PROGRAMS	= cgminer
 
-bin_SCRIPTS	= *.cl
+bin_SCRIPTS	= $(top_srcdir)/*.cl
 
 cgminer_LDFLAGS	= $(PTHREAD_FLAGS)
 cgminer_LDADD	= $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \
@@ -27,11 +27,7 @@ cgminer_LDADD	= $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \
 		  @UDEV_LIBS@ @LIBUSB_LIBS@ \
 		  @MATH_LIBS@ lib/libgnu.a ccan/libccan.a
 
-if HAVE_WINDOWS
-cgminer_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib @OPENCL_FLAGS@ @LIBUSB_CFLAGS@
-else
 cgminer_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib @OPENCL_FLAGS@ @LIBUSB_CFLAGS@ @LIBCURL_CFLAGS@
-endif
 
 # common sources
 cgminer_SOURCES := cgminer.c
@@ -101,11 +97,11 @@ endif
 if HAS_MODMINER
 cgminer_SOURCES += driver-modminer.c
 bitstreamsdir = $(bindir)/bitstreams
-dist_bitstreams_DATA = bitstreams/*
+dist_bitstreams_DATA = $(top_srcdir)/bitstreams/*
 endif
 
 if HAS_ZTEX
 cgminer_SOURCES += driver-ztex.c libztex.c libztex.h
 bitstreamsdir = $(bindir)/bitstreams
-dist_bitstreams_DATA = bitstreams/*
+dist_bitstreams_DATA = $(top_srcdir)/bitstreams/*
 endif

+ 35 - 0
autogen-win32.sh

@@ -0,0 +1,35 @@
+#!/bin/bash
+# Regenerate the autotools files, then (unless NOCONFIGURE is set) configure a win32 build.
+bs_dir="$(dirname "$(readlink -f "$0")")"   # directory holding this script (the source tree)
+build_dir="$PWD"                            # directory the script was invoked from
+rm -rf "${bs_dir}"/autom4te.cache
+rm -f "${bs_dir}"/aclocal.m4 "${bs_dir}"/ltmain.sh
+
+echo 'Running autoreconf -ifv...'
+autoreconf -ifv -I "/usr/local/share/aclocal/" "$bs_dir" || exit 1
+
+if test -z "$NOCONFIGURE" ; then
+   echo 'Configuring...'
+
+   if [[ "$bs_dir" != "$(pwd)" ]]; then      # out-of-tree build: expose source-tree headers
+      export CPPFLAGS+=" -I $bs_dir"
+   fi
+
+   if [[ -n "$CGMINER_SDK" ]]; then          # optional SDK prefix with include/, lib/ and ADL_SDK
+      export CPPFLAGS="-I $CGMINER_SDK/include $CPPFLAGS"
+      export LDFLAGS="-L $CGMINER_SDK/lib $LDFLAGS"
+      export PKG_CONFIG_PATH="$CGMINER_SDK/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
+      export ADL_SDK="$CGMINER_SDK/include/ADL_SDK"
+   fi
+
+   CFLAGS="-O3 -msse2" \
+   "$bs_dir"/configure \
+      --prefix="$build_dir"/opt \
+      --enable-cpumining \
+      --enable-scrypt \
+      --enable-bitforce \
+      --enable-icarus \
+      --enable-modminer \
+      --enable-ztex \
+      "$@"   # quoted so extra configure arguments are forwarded without word splitting
+fi

+ 36 - 0
autogen-win64.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+# Regenerate the autotools files, then (unless NOCONFIGURE is set) configure a win64 build.
+bs_dir="$(dirname "$(readlink -f "$0")")"   # directory holding this script (the source tree)
+build_dir="$PWD"                            # directory the script was invoked from
+rm -rf "${bs_dir}"/autom4te.cache
+rm -f "${bs_dir}"/aclocal.m4 "${bs_dir}"/ltmain.sh
+
+echo 'Running autoreconf -ifv...'
+autoreconf -ifv -I "/usr/local/share/aclocal/" "$bs_dir" || exit 1
+
+if test -z "$NOCONFIGURE" ; then
+   echo 'Configuring...'
+
+   if [[ "$bs_dir" != "$(pwd)" ]]; then      # out-of-tree build: expose source-tree headers
+      export CPPFLAGS+=" -I $bs_dir"
+   fi
+
+   if [[ -n "$CGMINER_SDK" ]]; then          # optional SDK prefix with include/, lib64/ and ADL_SDK
+      export CPPFLAGS="-I $CGMINER_SDK/include $CPPFLAGS"
+      export LDFLAGS="-L $CGMINER_SDK/lib64 $LDFLAGS"
+      export PKG_CONFIG_PATH="$CGMINER_SDK/lib64/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
+      export ADL_SDK="$CGMINER_SDK/include/ADL_SDK"
+   fi
+
+   CFLAGS="-O3 -msse4" \
+   "$bs_dir"/configure \
+      --target=x86_64-w64-mingw32 \
+      --prefix="$build_dir"/opt \
+      --enable-cpumining \
+      --enable-scrypt \
+      --enable-bitforce \
+      --enable-icarus \
+      --enable-modminer \
+      --enable-ztex \
+      "$@"   # quoted so extra configure arguments are forwarded without word splitting
+fi

+ 11 - 6
cgminer.c

@@ -318,7 +318,8 @@ static bool should_run(void)
 		return true;
 
 	gettimeofday(&tv, NULL);
-	tm = localtime(&tv.tv_sec);
+	const time_t tmp_time = tv.tv_sec;
+	tm = localtime(&tmp_time);
 	if (schedstart.enable) {
 		if (!schedstop.enable) {
 			if (time_before(tm, &schedstart.tm))
@@ -350,7 +351,8 @@ void get_datestamp(char *f, struct timeval *tv)
 {
 	struct tm *tm;
 
-	tm = localtime(&tv->tv_sec);
+	const time_t tmp_time = tv->tv_sec;
+	tm = localtime(&tmp_time);
 	sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d]",
 		tm->tm_year + 1900,
 		tm->tm_mon + 1,
@@ -364,7 +366,8 @@ void get_timestamp(char *f, struct timeval *tv)
 {
 	struct tm *tm;
 
-	tm = localtime(&tv->tv_sec);
+	const time_t tmp_time = tv->tv_sec;
+	tm = localtime(&tmp_time);
 	sprintf(f, "[%02d:%02d:%02d]",
 		tm->tm_hour,
 		tm->tm_min,
@@ -2584,9 +2587,11 @@ static bool submit_upstream_work(struct work *work, CURL *curl, bool resubmit)
 			double submit_time = tdiff(&tv_submit_reply, &tv_submit);
 			int diffplaces = 3;
 
-			tm = localtime(&(work->tv_getwork.tv_sec));
+			time_t tmp_time = work->tv_getwork.tv_sec;
+			tm = localtime(&tmp_time);
 			memcpy(&tm_getwork, tm, sizeof(struct tm));
-			tm = localtime(&(tv_submit_reply.tv_sec));
+			tmp_time = tv_submit_reply.tv_sec;
+			tm = localtime(&tmp_time);
 			memcpy(&tm_submit_reply, tm, sizeof(struct tm));
 
 			if (work->clone) {
@@ -2957,7 +2962,7 @@ void app_restart(void)
 	}
 #endif
 
-	execv(initial_args[0], initial_args);
+	execv(initial_args[0], (EXECV_2ND_ARG_TYPE)initial_args);
 	applog(LOG_WARNING, "Failed to restart application");
 }
 

+ 10 - 2
compat.h

@@ -2,15 +2,17 @@
 #define __COMPAT_H__
 
 #ifdef WIN32
+#include "config.h"
 #include <errno.h>
 #include <time.h>
 #include <pthread.h>
 #include <sys/time.h>
 
-#include <windows.h>
-
 #include "miner.h"  // for timersub
 
+#include <windows.h>
+
+#ifndef HAVE_LIBWINPTHREAD
 static inline int nanosleep(const struct timespec *req, struct timespec *rem)
 {
 	struct timeval tstart;
@@ -42,6 +44,7 @@ static inline int nanosleep(const struct timespec *req, struct timespec *rem)
 	}
 	return 0;
 }
+#endif
 
 static inline int sleep(unsigned int secs)
 {
@@ -71,7 +74,12 @@ typedef unsigned int uint;
 typedef long suseconds_t;
 #endif
 
+#ifdef HAVE_LIBWINPTHREAD
+#define PTH(thr) ((thr)->pth)
+#else
 #define PTH(thr) ((thr)->pth.p)
+#endif
+
 #else
 #define PTH(thr) ((thr)->pth)
 #endif /* WIN32 */

+ 35 - 6
configure.ac

@@ -81,7 +81,6 @@ esac
 
 case $target in
   *-*-mingw*)
-    have_x86_64=false
     have_win32=true
     PTHREAD_FLAGS=""
     DLOPEN_FLAGS=""
@@ -166,9 +165,18 @@ else
 	OPENCL_LIBS=""
 fi
 
-AC_CHECK_LIB(pthread, pthread_create, ,
-        AC_MSG_ERROR([Could not find pthread library - please install libpthread]))
-PTHREAD_LIBS=-lpthread
+has_winpthread=false
+if test "x$have_win32" = xtrue; then
+        has_winpthread=true
+        AC_CHECK_LIB(winpthread, nanosleep, , has_winpthread=false)
+        PTHREAD_LIBS=-lwinpthread
+fi
+
+if test "x$has_winpthread" != xtrue; then
+        AC_CHECK_LIB(pthread, pthread_create, ,
+                AC_MSG_ERROR([Could not find pthread library - please install libpthread]))
+        PTHREAD_LIBS=-lpthread
+fi
 
 AC_CHECK_LIB(jansson, json_loads, request_jansson=false, request_jansson=true)
 
@@ -181,7 +189,7 @@ scrypt="no"
 
 if test "$found_opencl" = 1; then
 	if test "x$adl" != xno; then
-		AC_CHECK_FILE([ADL_SDK/adl_sdk.h], have_adl=true, have_adl=false,)
+		AC_CHECK_FILE([${ADL_SDK:-ADL_SDK}/adl_sdk.h], have_adl=true, have_adl=false,)
 		if test x$have_adl = xtrue
 		then
 			AC_DEFINE([HAVE_ADL], [1], [Defined if ADL headers were found])
@@ -305,7 +313,7 @@ has_yasm=false
 AC_PATH_PROG([YASM],[yasm],[false])
 if test "x$YASM" != "xfalse" ; then
   AC_MSG_CHECKING([if yasm version is greater than 1.0.1])
-  yasmver=`yasm --version | head -1 | cut -d\  -f2`
+  yasmver=`"$YASM" --version | head -1 | cut -d\  -f2`
   yamajor=`echo $yasmver | cut -d. -f1`
   yaminor=`echo $yasmver | cut -d. -f2`
   yamini=`echo $yasmver | cut -d. -f3`
@@ -332,6 +340,18 @@ if test "x$YASM" != "xfalse" ; then
 fi
 if test "x$has_yasm" = "xfalse" ; then
   AC_MSG_NOTICE([yasm is required for the assembly algorithms. They will be skipped.])
+else
+  if test "x$have_x86_64" = xtrue; then
+    if test "x$have_win32" = xtrue; then
+      YASM_FMT="win64"
+    else
+      YASM_FMT="elf64"
+    fi
+  elif test "x$have_win32" = xtrue; then
+    YASM_FMT="coff"
+  else
+    YASM_FMT="elf32"
+  fi
 fi
 
 AM_CONDITIONAL([HAS_YASM], [test x$has_yasm = xtrue])
@@ -382,6 +402,14 @@ else
 fi
 AC_SUBST(LIBCURL_LIBS)
 
+#check execv signature
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([
+		   #include <process.h>
+		   int execv(const char*, const char*const*);
+		   ])],
+		   AC_DEFINE([EXECV_2ND_ARG_TYPE], [const char* const*], [int execv(const char*, const char*const*);]),
+		   AC_DEFINE([EXECV_2ND_ARG_TYPE], [char* const*], [int execv(const char*, char*const*);]))
+
 dnl CCAN wants to know a lot of vars.
 # All the configuration checks.  Regrettably, the __attribute__ checks will
 # give false positives on old GCCs, since they just cause warnings.  But that's
@@ -438,6 +466,7 @@ AC_SUBST(PDCURSES_LIBS)
 AC_SUBST(WS2_LIBS)
 AC_SUBST(MATH_LIBS)
 AC_SUBST(UDEV_LIBS)
+AC_SUBST(YASM_FMT)
 
 AC_CONFIG_FILES([
 	Makefile

+ 5 - 5
driver-bitforce.c

@@ -9,6 +9,8 @@
  * any later version.  See COPYING for more details.
  */
 
+#include "config.h"
+
 #include <limits.h>
 #include <pthread.h>
 #include <stdint.h>
@@ -17,16 +19,14 @@
 #include <sys/time.h>
 #include <unistd.h>
 
-#include "config.h"
+#include "compat.h"
+#include "miner.h"
+#include "usbutils.h"
 
 #ifdef WIN32
 #include <windows.h>
 #endif /* WIN32 */
 
-#include "compat.h"
-#include "miner.h"
-#include "usbutils.h"
-
 #define BITFORCE_IDENTIFY "ZGX"
 #define BITFORCE_IDENTIFY_LEN (sizeof(BITFORCE_IDENTIFY)-1)
 #define BITFORCE_FLASH "ZMX"

+ 5 - 3
driver-cpu.c

@@ -202,7 +202,9 @@ static const sha256_func sha256_funcs[] = {
 
 
 #ifdef WANT_CPUMINE
-#if defined(WANT_X8664_SSE2) && defined(__SSE2__)
+#if defined(WANT_X8664_SSE4) && defined(__SSE4_1__)
+enum sha256_algos opt_algo = ALGO_SSE4_64;
+#elif defined(WANT_X8664_SSE2) && defined(__SSE2__)
 enum sha256_algos opt_algo = ALGO_SSE2_64;
 #elif defined(WANT_X8632_SSE2) && defined(__SSE2__)
 enum sha256_algos opt_algo = ALGO_SSE2_32;
@@ -720,8 +722,8 @@ static void cpu_detect()
 	// Reckon number of cores in the box
 	#if defined(WIN32)
 	{
-		DWORD system_am;
-		DWORD process_am;
+		DWORD_PTR system_am;
+		DWORD_PTR process_am;
 		BOOL ok = GetProcessAffinityMask(
 			GetCurrentProcess(),
 			&system_am,

+ 1 - 1
driver-cpu.h

@@ -30,7 +30,7 @@
 #define WANT_X8664_SSE2 1
 #endif
 
-#if defined(__x86_64__) && defined(HAS_YASM)
+#if defined(__x86_64__) && defined(HAS_YASM) && defined(__SSE4_1__)
 #define WANT_X8664_SSE4 1
 #endif
 

+ 1 - 1
driver-icarus.c

@@ -30,6 +30,7 @@
  */
 
 #include "config.h"
+#include "miner.h"
 
 #include <limits.h>
 #include <pthread.h>
@@ -51,7 +52,6 @@
 #endif
 
 #include "elist.h"
-#include "miner.h"
 #include "fpgautils.h"
 
 // The serial I/O speed - Linux uses a define 'B115200' in bits/termios.h

+ 1 - 1
driver-ztex.c

@@ -23,9 +23,9 @@
  *   You should have received a copy of the GNU General Public License
  *   along with this program; if not, see http://www.gnu.org/licenses/.
 **/
+#include "miner.h"
 #include <unistd.h>
 #include <sha2.h>
-#include "miner.h"
 #include "libztex.h"
 
 #define GOLDEN_BACKLOG 5

+ 5 - 0
elist.h

@@ -180,8 +180,13 @@ static inline void list_splice_init(struct list_head *list,
  * @type:	the type of the struct this is embedded in.
  * @member:	the name of the list_struct within the struct.
  */
+#ifndef _WIN64
 #define list_entry(ptr, type, member) \
 	((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
+#else
+#define list_entry(ptr, type, member) \
+	((type *)((char *)(ptr)-(unsigned long long)(&((type *)0)->member)))
+#endif
 
 /**
  * list_for_each	-	iterate over a list

+ 3 - 2
fpgautils.c

@@ -14,6 +14,8 @@
 #include <dirent.h>
 #include <string.h>
 
+#include "miner.h"
+
 #ifndef WIN32
 #include <errno.h>
 #include <termios.h>
@@ -34,7 +36,6 @@
 
 #include "elist.h"
 #include "logging.h"
-#include "miner.h"
 #include "fpgautils.h"
 
 #ifdef HAVE_LIBUDEV
@@ -356,7 +357,7 @@ int serial_open(const char *devpath, unsigned long baud, signed short timeout, b
 		PurgeComm(hSerial, PURGE_TXCLEAR);
 	}
 
-	return _open_osfhandle((LONG)hSerial, 0);
+	return _open_osfhandle((intptr_t)hSerial, 0);
 #else
 	int fdDev = open(devpath, O_RDWR | O_CLOEXEC | O_NOCTTY);
 

+ 2 - 0
lib/signal.in.h

@@ -20,6 +20,8 @@
 #endif
 @PRAGMA_COLUMNS@
 
+#include "config.h"
+
 #if defined __need_sig_atomic_t || defined __need_sigset_t
 /* Special invocation convention inside glibc header files.  */
 

+ 2 - 1
logging.c

@@ -85,7 +85,8 @@ static void log_generic(int prio, const char *fmt, va_list ap)
 
 		gettimeofday(&tv, NULL);
 
-		tm = localtime(&tv.tv_sec);
+		const time_t tmp_time = tv.tv_sec;
+		tm = localtime(&tmp_time);
 
 		len = 40 + strlen(fmt) + 22;
 		f = alloca(len);

+ 2 - 1
miner.h

@@ -118,7 +118,8 @@ static inline int fsync (int fd)
   #include "usbutils.h"
 #endif
 
-#if !defined(WIN32) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+#if (!defined(WIN32) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))) \
+    || (defined(WIN32) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)))
 #define bswap_16 __builtin_bswap16
 #define bswap_32 __builtin_bswap32
 #define bswap_64 __builtin_bswap64

+ 1 - 1
x86_32/Makefile.am

@@ -5,4 +5,4 @@ SUFFIXES = .asm
 libx8632_a_SOURCES	= sha256_xmm.asm
 
 .asm.o:
-	$(YASM) -f elf32 $<
+	$(YASM) -f $(YASM_FMT) -o $@ $<

+ 4 - 4
x86_32/sha256_xmm.asm

@@ -19,11 +19,11 @@ BITS 32
 
 %define LAB_LOOP_UNROLL 64
 
-extern sha256_consts_m128i
+extern _sha256_consts_m128i
 
-global CalcSha256_x86
+global $@CalcSha256_x86@12
 ;	CalcSha256	hash(ecx), data(edx), init([esp+4])
-CalcSha256_x86:
+@CalcSha256_x86@12:
 	push	esi
 	push	edi
 	mov	init, [esp+12]
@@ -134,7 +134,7 @@ LAB_LOOP:
 
 %macro	lab_loop_blk 1
 	movdqa	xmm6, [data+%1]
-	paddd	xmm6, sha256_consts_m128i[%1]
+	paddd	xmm6, _sha256_consts_m128i[%1]
 
 	paddd	xmm6, [hash+2*16]		; +h
 

+ 1 - 1
x86_64/Makefile.am

@@ -5,4 +5,4 @@ SUFFIXES = .asm
 libx8664_a_SOURCES	= sha256_xmm_amd64.asm sha256_sse4_amd64.asm
 
 .asm.o:
-	$(YASM) -f elf64 $<
+	$(YASM) -f $(YASM_FMT) -o $@ $<

+ 35 - 8
x86_64/sha256_sse4_amd64.asm

@@ -13,9 +13,17 @@
 ALIGN 32
 BITS 64
 
+%ifidn __OUTPUT_FORMAT__,win64
+%define hash rcx
+%define data rdx
+%define init r8
+%define temp r9
+%else
 %define hash rdi
 %define data rsi
 %define init rdx
+%define temp rcx
+%endif
 
 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
 %define LAB_CALC_PARA	2
@@ -27,18 +35,28 @@ extern g_4sha256_k
 
 global CalcSha256_x64_sse4
 ;	CalcSha256	hash(rdi), data(rsi), init(rdx)
+;	CalcSha256	hash(rcx), data(rdx), init(r8)	[win64 calling convention]
 CalcSha256_x64_sse4:
 
 	push	rbx
+%ifidn __OUTPUT_FORMAT__,win64
+	sub	rsp, 16 * 6
+	movdqa	[rsp + 16*0], xmm6
+	movdqa	[rsp + 16*1], xmm7
+	movdqa	[rsp + 16*2], xmm8
+	movdqa	[rsp + 16*3], xmm9
+	movdqa	[rsp + 16*4], xmm10
+	movdqa	[rsp + 16*5], xmm11
+%endif
 
 LAB_NEXT_NONCE:
 
-	mov	rcx, 64*4					; 256 - rcx is # of SHA-2 rounds
+	mov	temp, 64*4					; 256 - temp is # of SHA-2 rounds
 	mov	rax, 16*4					; 64 - rax is where we expand to
 
 LAB_SHA:
-	push	rcx
-	lea	rcx, qword [data+rcx*4]				; + 1024
+	push	temp
+	lea	temp, qword [data+temp*4]			; + 1024
 	lea	r11, qword [data+rax*4]				; + 256
 
 LAB_CALC:
@@ -122,10 +140,10 @@ LAB_CALC:
 %endrep
 
 	add	r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
-	cmp	r11, rcx
+	cmp	r11, temp
 	jb	LAB_CALC
 
-	pop	rcx
+	pop	temp
 	mov	rax, 0
 
 ; Load the init values of the message into the hash.
@@ -219,12 +237,12 @@ LAB_LOOP:
 %assign i i+1
 %endrep
 
-	cmp	rax, rcx
+	cmp	rax, temp
 	jb	LAB_LOOP
 
 ; Finished the 64 rounds, calculate hash and save
 
-	movntdqa	xmm1, [rdx]
+	movntdqa	xmm1, [init]
 	pshufd	xmm2, xmm1, 0x55
 	paddd	xmm5, xmm2
 	pshufd	xmm6, xmm1, 0xAA
@@ -234,7 +252,7 @@ LAB_LOOP:
 	pshufd	xmm1, xmm1, 0
 	paddd	xmm7, xmm1
 
-	movntdqa	xmm1, [rdx+4*4]
+	movntdqa	xmm1, [init+4*4]
 	pshufd	xmm2, xmm1, 0x55
 	paddd	xmm8, xmm2
 	pshufd	xmm6, xmm1, 0xAA
@@ -254,6 +272,15 @@ LAB_LOOP:
 	movdqa	[hash+7*16], xmm10
 
 LAB_RET:
+%ifidn __OUTPUT_FORMAT__,win64
+	movdqa	xmm6, [rsp + 16*0]
+	movdqa	xmm7, [rsp + 16*1]
+	movdqa	xmm8, [rsp + 16*2]
+	movdqa	xmm9, [rsp + 16*3]
+	movdqa	xmm10, [rsp + 16*4]
+	movdqa	xmm11, [rsp + 16*5]
+	add	rsp, 16 * 6
+%endif
 	pop	rbx
 	ret
 

+ 25 - 0
x86_64/sha256_xmm_amd64.asm

@@ -22,10 +22,17 @@
 ALIGN 32
 BITS 64
 
+%ifidn __OUTPUT_FORMAT__,win64
+%define hash  rcx
+%define hash1 rdx
+%define data  r8
+%define init  r9
+%else
 %define hash  rdi
 %define hash1 rsi
 %define data  rdx
 %define init  rcx
+%endif
 
 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
 %define SHA_CALC_W_PARA         2
@@ -227,6 +234,15 @@ sha256_sse2_64_new:
 %endif
 
     push        rbx
+%ifidn __OUTPUT_FORMAT__,win64
+    sub         rsp, 16 * 6
+    movdqa      [rsp + 16*0], xmm6
+    movdqa      [rsp + 16*1], xmm7
+    movdqa      [rsp + 16*2], xmm8
+    movdqa      [rsp + 16*3], xmm9
+    movdqa      [rsp + 16*4], xmm10
+    movdqa      [rsp + 16*5], xmm13
+%endif
 
 %macro  SHA_256  0
     mov         rbx, 64*4   ; rbx is # of SHA-2 rounds
@@ -318,6 +334,15 @@ sha256_sse2_64_new:
     movdqa    [hash+7*16], rH
 
 LAB_RET:
+%ifidn __OUTPUT_FORMAT__,win64
+    movdqa    xmm6, [rsp + 16*0]
+    movdqa    xmm7, [rsp + 16*1]
+    movdqa    xmm8, [rsp + 16*2]
+    movdqa    xmm9, [rsp + 16*3]
+    movdqa    xmm10, [rsp + 16*4]
+    movdqa    xmm13, [rsp + 16*5]
+    add       rsp, 16 * 6
+%endif
     pop       rbx
     ret