Browse Source

Merge branch 'cg_merges_20130523a' into bfgminer

Luke Dashjr 12 years ago
parent
commit
64160ea69c
21 changed files with 282 additions and 59 deletions
  1. 1 0
      .gitignore
  2. 4 4
      Makefile.am
  3. 7 5
      README
  4. 79 6
      README.GPU
  5. 71 3
      README.scrypt
  6. 4 3
      compat.h
  7. 21 6
      configure.ac
  8. 5 3
      driver-cpu.c
  9. 1 1
      driver-cpu.h
  10. 1 1
      driver-icarus.c
  11. 1 1
      driver-ztex.c
  12. 2 1
      fpgautils.c
  13. 2 0
      lib/signal.in.h
  14. 11 9
      miner.c
  15. 2 1
      miner.h
  16. 2 2
      sha256_sse2_i386.c
  17. 6 3
      util.c
  18. 1 1
      x86_32/Makefile.am
  19. 1 1
      x86_64/Makefile.am
  20. 35 8
      x86_64/sha256_sse4_amd64.asm
  21. 25 0
      x86_64/sha256_xmm_amd64.asm

+ 1 - 0
.gitignore

@@ -34,6 +34,7 @@ mingw32-config.cache
 *~
 *.orig
 *.rej
+*.swp
 *.kate-swp
 
 ext_deps

+ 4 - 4
Makefile.am

@@ -14,7 +14,7 @@ INCLUDES	= -fno-strict-aliasing
 
 bin_PROGRAMS	= bfgminer
 
-bin_SCRIPTS	= *.cl
+bin_SCRIPTS	= $(top_srcdir)/*.cl
 
 bfgminer_LDFLAGS	= $(PTHREAD_FLAGS)
 bfgminer_LDADD	= $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \
@@ -124,19 +124,19 @@ endif
 if HAS_MODMINER
 bfgminer_SOURCES += driver-modminer.c
 bitstreamsdir = $(bindir)/bitstreams
-dist_bitstreams_DATA = bitstreams/*
+dist_bitstreams_DATA = $(top_srcdir)/bitstreams/*
 endif
 
 if HAS_X6500
 bfgminer_SOURCES += driver-x6500.c ft232r.c ft232r.h jtag.c jtag.h
 bitstreamsdir = $(bindir)/bitstreams
-dist_bitstreams_DATA = bitstreams/*
+dist_bitstreams_DATA = $(top_srcdir)/bitstreams/*
 endif
 
 if HAS_ZTEX
 bfgminer_SOURCES += driver-ztex.c libztex.c libztex.h
 bitstreamsdir = $(bindir)/bitstreams
-dist_bitstreams_DATA = bitstreams/*
+dist_bitstreams_DATA = $(top_srcdir)/bitstreams/*
 endif
 
 bin_PROGRAMS += bfgminer-rpc

+ 7 - 5
README

@@ -251,6 +251,7 @@ GPU only options:
 scrypt only options:
 
 --lookup-gap <arg>  Set GPU lookup gap for scrypt mining, comma separated
+--shaders <arg>     GPU shaders per card for tuning scrypt, comma separated
 --thread-concurrency <arg> Set GPU thread concurrency for scrypt mining, comma separated
 
 See README.scrypt for more information regarding (non-bitcoin) scrypt mining.
@@ -353,7 +354,7 @@ Q quits the application.
 
 G gives you something like:
 
-GPU 0: [124.2 / 191.3 Mh/s] [Q:212  A:77  R:33  HW:0  E:36%  U:1.73/m]
+GPU 0: [124.2 / 191.3 Mh/s] [A:77  R:33  HW:0  U:1.73/m  WU 1.73/m]
 Temp: 67.0 C
 Fan Speed: 35% (2500 RPM)
 Engine Clock: 960 MHz
@@ -413,11 +414,9 @@ The number of hardware errors
 The utility is defined as the number of shares / minute
 
 The BFGMiner status line shows:
- ST: 1  DW: 0  GW: 301  LW: 8  GF: 1  NB: 1  AS: 0  RF: 1  E: 2.42
+ ST: 1  LW: 8  GF: 1  NB: 1  AS: 0  RF: 1  E: 2.42
 
 ST is STaged work items (ready to use).
-DW is Discarded Work items (work from block no longer valid to work on)
-GW is GetWork requested (work items from pools)
 LW is Locally generated Work items
 GF is Getwork Fail Occasions (server slow to provide work)
 NB is New Blocks detected on the network
@@ -652,7 +651,7 @@ with -g 1. It is also recommended to use --failover-only since the work is
 effectively like a different block chain. If mining with a Mini Rig, it is worth
 adding the --bfl-range option.
 
-Q: Are kernels from other mining software useable in BFGMiner?
+Q: Are OpenCL kernels from other mining software useable in BFGMiner?
 A: No, the APIs are slightly different between the different software and they
 will not work.
 
@@ -697,6 +696,9 @@ To permanently give your account the 'dialout' group:
  sudo usermod -G dialout -a `whoami`
 Then logout and back in again
 
+Q: Can I mine scrypt with FPGAs or ASICs?
+A: No.
+
 Q: What is stratum and how do I use it?
 A: Stratum is a protocol designed to reduce resources for mining pools at the
 cost of keeping the miner in the dark and blindly transferring his mining

+ 79 - 6
README.GPU

@@ -49,6 +49,10 @@ Install AMD APP sdk, ideal version (see FAQ!) - put it into a system location.
 Download the correct version for either 32 bit or 64 bit from here:
 	http://developer.amd.com/tools/heterogeneous-computing/amd-accelerated-parallel-processing-app-sdk/downloads/
 
+The best version for Radeon 5xxx and 6xxx is v2.5, while 7xxx cards need v2.6 or
+later; v2.7 seems to be the best.
+
+For versions 2.4 or earlier you will need to manually install them:
 This will give you a file with a name like:
  AMD-APP-SDK-v2.4-lnx64.tgz (64-bit)
 or
@@ -80,8 +84,9 @@ device may be very slow to return responses, or produce errors.
 
 NOTE: Running intensities above 9 with current hardware is likely to only
 diminish return performance even if the hash rate might appear better. A good
-starting baseline intensity to try on dedicated miners is 9. Higher values are
-there to cope with future improvements in hardware.
+starting baseline intensity to try on dedicated miners is 9. 11 is the upper
+limit for intensity while Bitcoin mining, if the GPU_USE_SYNC_OBJECTS variable
+is set (see FAQ). The upper limit for SHA256d mining is 14 and 20 for scrypt.
 
 
 ---
@@ -342,10 +347,17 @@ binaries will be built. It is known that the 2.6 ATI SDK has a huge hashrate
 penalty on generating new binaries. It is recommended to not use this SDK at
 this time unless you are using an ATI 7xxx card that needs it.
 
-Q: Which ATI SDK is the best for BFGMiner?
-A: At the moment, versions 2.4 and 2.5 work the best. If you are forced to use
-the 2.6 SDK, the phatk kernel will perform poorly, while the diablo or my
-custom modified poclbm kernel are optimised for it.
+Q: Which AMD SDK is the best for BFGMiner?
+A: At the moment, versions 2.4 and 2.5 work the best for 5xxx and 6xxx GPUs. SDK
+2.6 or 2.7 works best for 7xxx. SDK 2.8 is known to have many problems. If you
+need to use the 2.6+ SDK (7xxx and later), the phatk kernel will perform poorly,
+while the diablo or (modified) poclbm kernel are optimised for it.
+
+Q: Which AMD driver is the best?
+A: Unfortunately AMD has a history of having quite a few releases with issues
+when it comes to mining, either in terms of breaking mining, increasing CPU
+usage or very low hashrates. Only experimentation can tell you for sure, but
+some good releases were 11.6, 11.12, 12.4 and 12.8.
 
 Q: I have multiple SDKs installed, can I choose which one it uses?
 A: Run bfgminer with the -n option and it will list all the platforms currently
@@ -355,6 +367,67 @@ Q: BFGMiner reports no devices or only one device on startup on Linux although
 I have multiple devices and drivers+SDK installed properly?
 A: Try "export DISPLAY=:0" before running BFGMiner.
 
+Q: BFGMiner crashes immediately on startup.
+A: One of the common reasons for this is that you have mixed files on your
+machine for the driver or SDK. Windows has a nasty history of not cleanly
+uninstalling files so you may have to use third party tools like driversweeper
+to remove old versions. The other common reason for this is Windows antivirus
+software is disabling one of the DLLs from working. If BFGMiner starts with the
+-T option but never starts without it, this is a sure-fire sign you have this
+problem and will have to disable your antivirus or make exceptions.
+
+Q: Is it faster to mine on Windows or Linux?
+A: It makes no difference. It comes down to choice of operating system for their
+various features. Linux offers much better long term stability and remote
+monitoring and security, while Windows offers you overclocking tools that can
+achieve much more than BFGMiner can do on Linux.
+
+Q: BFGMiner cannot see any of my GPUs even though I have configured them all to
+be enabled and installed OpenCL (+/- Xorg is running and the DISPLAY variable is
+exported on Linux)?
+A: Check the output of 'bfgminer -d?', it will list what OpenCL devices your
+installed SDK recognises. If it lists none, you have a problem with your version
+or installation of the SDK.
+
+Q: BFGMiner is mining on the wrong GPU, I want it on the AMD but it's mining on
+my on board GPU?
+A: Make sure the AMD OpenCL SDK is installed, check the output of 'bfgminer -d?'
+and use the appropriate parameter with --gpu-platform.
+
+Q: I'm getting much lower hashrates than I should be for my GPU?
+A: Look at your driver/SDK combination and disable power saving options for your
+GPU. Specifically look to disable ULPS. Make sure not to set intensity above 11
+for Bitcoin mining.
+
+Q: Can I mine with AMD while running Nvidia or Intel GPUs at the same time?
+A: If you can install both drivers successfully (easier on Windows) then yes,
+using the --gpu-platform option.
+
+Q: Can I mine with Nvidia or Intel GPUs?
+A: Yes, but their hashrate is very poor and likely you'll be using much more
+energy than you'll be earning in coins.
+
+Q: Can I mine on Linux without running Xorg?
+A: With Nvidia you can, but with AMD you cannot.
+
+Q: I'm trying to mine a scrypt cryptocurrency but BFGMiner shows MH values
+instead of kH and submits no shares?
+A: Add the --scrypt parameter.
+
+Q: I can't get anywhere near enough hashrate for scrypt compared to other
+people?
+A: You may not have enough system RAM as this is also required.
+
+Q: My scrypt hashrate is high but the pool reports only a tiny proportion of my
+hashrate?
+A: You are generating garbage hashes due to your choice of settings. Try
+decreasing your intensity, do not increase the number of gpu-threads, and
+consider adding system RAM to match your GPU RAM.
+
+Q: Scrypt fails to initialise the kernel every time?
+A: Your parameters are too high. Don't add GPU threads, don't set intensity too
+high, decrease thread concurrency. See the README.scrypt for a lot more help.
+
 Q: Should I use crossfire/SLI?
 A: It does not benefit mining at all and depending on the GPU may actually
 worsen performance.

+ 71 - 3
README.scrypt

@@ -51,9 +51,9 @@ disastrous with scrypt because it CAN run out of ram. High intensities
 start writing over the same ram and it is highly dependent on the GPU, but they
 can start actually DECREASING your hashrate, or even worse, start producing
 garbage with HW errors skyrocketing. Note that if you do NOT specify an
-intensity, cgminer uses dynamic mode which is designed to minimise the harm
+intensity, BFGMiner uses dynamic mode which is designed to minimise the harm
 to a running desktop and performance WILL be poor. The lower limit to intensity
-with scrypt is usually 8 and cgminer will prevent it going too low.
+with scrypt is usually 8 and BFGMiner will prevent it going too low.
 SUMMARY: Setting this for reasonable hashrates is mandatory.
 
 --shaders XXX
@@ -62,7 +62,7 @@ is a new option where you tell BFGMiner how many shaders your GPU has. This
 helps BFGMiner try to choose some meaningful baseline parameters. Use this table
 below to determine how many shaders your GPU has, and note that there are some
 variants of these cards, and Nvidia shaders are much much lower and virtually
-pointless trying to mine on. If this is not set, cgminer will query the
+pointless trying to mine on. If this is not set, BFGMiner will query the
 device for how much memory it supports and will try to set a value based on
 that.
 SUMMARY: This will get you started but fine tuning for optimal performance is
@@ -162,7 +162,75 @@ For example, a 7970 running with the following settings:
 was using 305W!
 
 ---
+TUNING AN AMD RADEON 7970
+Example tuning a 7970 for Scrypt mining:
 
+On Linux run this command:
+export GPU_MAX_ALLOC_PERCENT=100
+or on Windows this:
+setx GPU_MAX_ALLOC_PERCENT 100
+in the same console/bash/dos prompt/bat file/whatever you want to call it,
+before running BFGMiner.
+
+First, find the highest thread concurrency that you can start it at. They should
+all start at 8192 but some will go up to 3 times that. Don't go too high on the
+intensity while testing and don't change gpu threads. If you cannot go above
+8192, don't fret as you can still get a high hashrate.
+
+Delete any .bin files so you're starting from scratch and see what bins get
+generated.
+
+First try without any thread concurrency or even shaders, as BFGMiner will try to
+find an optimal value:
+bfgminer -I 13
+
+If that starts mining, see what bin was generated, it is likely the largest
+meaningful TC you can set.
+Starting it on mine I get:
+scrypt130302Tahitiglg2tc22392w64l8.bin
+
+See tc22392? That's telling you what thread concurrency it was. It should start
+without TC parameters, but you never know. So if it doesn't, start with
+--thread-concurrency 8192 and add 2048 to it at a time till you find the highest
+value it will start successfully at.
+
+If you wish to get a little extra from your hardware, you may also try
+overclocking. Do note that this will damage your GPUs and void your warranty,
+so unless you are willing to take that risk, skip the --gpu-engine and
+--gpu-memclock sections!
+
+Then start overclocking the eyeballs off your memory, as 7970s are exquisitely
+sensitive to memory speed and amazingly overclockable but please make sure it
+keeps adequately cooled with --auto-fan! Do it while it's running from the GPU
+menu. Go up by 25 at a time every 30 seconds or so until your GPU crashes. Then
+reboot and start it 25 lower as a rough start. One example runs stable at 1900
+memory without overvolting.
+
+Then once you find the maximum memory clock speed, you need to find the sweet
+spot engine clock speed that matches it. It's a fine line where one more MHz
+will make the hashrate drop by 20%. It's somewhere in the 0.57 - 0.6 ratio range.
+Start your engine clock speed at half your memory clock speed and then increase
+it by 5 at a time. The hashrate should climb a little each rise in engine speed
+and then suddenly drop above a certain value. Decrease it by 1 then until you
+find it climbs dramatically. If your engine clock speed cannot get that high
+without crashing the GPU, you will have to use a lower memclock.
+
+Then, and only then, bother trying to increase intensity further.
+
+My final settings were:
+--gpu-engine 1157  --gpu-memclock 1900 -I 20
+for a hashrate of 725kH.
+
+Note I did not bother setting a thread concurrency. Once you have the magic
+endpoint, look at what tc was chosen by the bin file generated and then hard
+code that in next time (eg --thread-concurrency 22392) as slight changes in
+thread concurrency will happen every time if you don't specify one, and the tc
+to clock ratios are critical!
+
+Your numbers will be your numbers depending on your hardware combination and OS,
+so don't expect to get exactly the same results!
+
+---
 If you wish to donate to the author of scrypt support, Con Kolivas, please send
 your donations to:
 

+ 4 - 3
compat.h

@@ -76,7 +76,9 @@ struct tm *localtime_convert(time_t t)
 {
 	return localtime(&t);
 }
+#endif
 
+#ifndef HAVE_NANOSLEEP
 static inline int nanosleep(const struct timespec *req, struct timespec *rem)
 {
 	struct timeval tstart;
@@ -108,7 +110,9 @@ static inline int nanosleep(const struct timespec *req, struct timespec *rem)
 	}
 	return 0;
 }
+#endif
 
+#ifdef WIN32
 static inline int sleep(unsigned int secs)
 {
 	struct timespec req, rem;
@@ -136,9 +140,6 @@ typedef unsigned int uint;
 typedef long suseconds_t;
 #endif
 
-#define PTH(thr) ((thr)->pth.p)
-#else
-#define PTH(thr) ((thr)->pth)
 #endif /* WIN32 */
 
 #ifndef HAVE_PTHREAD_CANCEL

+ 21 - 6
configure.ac

@@ -85,7 +85,6 @@ esac
 
 case $target in
   *-*-mingw*)
-    have_x86_64=false
     have_win32=true
     DLOPEN_FLAGS=""
     WS2_LIBS="-lws2_32"
@@ -126,7 +125,7 @@ m4_define([BFG_PREPROC_IFELSE],
 cpumining="no"
 
 AC_ARG_ENABLE([cpumining],
-	[AC_HELP_STRING([--enable-cpumining],[Build with cpu mining support(default disabled)])],
+	[AC_HELP_STRING([--enable-cpumining],[Build with CPU mining support (default disabled)])],
 	[cpumining=$enableval]
 	)
 if test "x$cpumining" = xyes; then
@@ -150,7 +149,7 @@ AM_CONDITIONAL([HAVE_OPENCL], [test x$opencl = xyes])
 m4_define([BFG_PTHREAD_FLAG_CHECK],
 	AC_MSG_CHECKING([for $1])
 	for cflag in ' -pthread' ''; do
-		for lib in ' -lpthread' ''; do
+		for lib in ' -lpthread' ' -lwinpthread' ''; do
 			CFLAGS="${save_CFLAGS}${cflag}"
 			LIBS="${save_LIBS}${lib}"
 			AC_LINK_IFELSE([
@@ -173,6 +172,9 @@ m4_define([BFG_PTHREAD_FLAG_CHECK],
 			],[])
 		done
 	done
+	if test "x${found_pthread}" = "xfalse"; then
+		AC_MSG_RESULT([no])
+	fi
 )
 
 save_CFLAGS="${CFLAGS}"
@@ -182,13 +184,13 @@ BFG_PTHREAD_FLAG_CHECK([pthread_cancel],[
 	AC_DEFINE([HAVE_PTHREAD_CANCEL], [1], [Define if you have a native pthread_cancel])
 ])
 if test "x${found_pthread}" = "xfalse"; then
-	AC_MSG_RESULT([no])
 	BFG_PTHREAD_FLAG_CHECK([pthread_create])
 	if test "x${found_pthread}" = "xfalse"; then
-		AC_MSG_RESULT([no])
 		AC_MSG_ERROR([Could not find pthread library - please install libpthread])
 	fi
 fi
+# check for nanosleep here, since it is provided by winpthread
+AC_CHECK_FUNCS([nanosleep])
 CFLAGS="${save_CFLAGS}"
 LIBS="${save_LIBS}"
 
@@ -501,7 +503,7 @@ if test "x$have_x86_32$have_x86_64" != "xfalsefalse"; then
 AC_PATH_PROG([YASM],[yasm],[false])
 if test "x$YASM" != "xfalse" ; then
   AC_MSG_CHECKING([if yasm version is greater than 1.0.1])
-  yasmver=`yasm --version | head -1 | cut -d\  -f2`
+  yasmver=`"$YASM" --version | head -1 | cut -d\  -f2`
   yamajor=`echo $yasmver | cut -d. -f1`
   yaminor=`echo $yasmver | cut -d. -f2`
   yamini=`echo $yasmver | cut -d. -f3`
@@ -528,6 +530,18 @@ if test "x$YASM" != "xfalse" ; then
 fi
 if test "x$has_yasm" = "xfalse" ; then
   AC_MSG_NOTICE([yasm is required for the assembly algorithms. They will be skipped.])
+else
+  if test "x$have_x86_64" = xtrue; then
+    if test "x$have_win32" = xtrue; then
+      YASM_FMT="win64"
+    else
+      YASM_FMT="elf64"
+    fi
+  elif test "x$have_win32" = xtrue; then
+    YASM_FMT="coff"
+  else
+    YASM_FMT="elf32"
+  fi
 fi
 fi
 
@@ -740,6 +754,7 @@ AC_SUBST(PDCURSES_LIBS)
 AC_SUBST(WS2_LIBS)
 AC_SUBST(MATH_LIBS)
 AC_SUBST(UDEV_LIBS)
+AC_SUBST(YASM_FMT)
 
 AC_CONFIG_FILES([
 	Makefile

+ 5 - 3
driver-cpu.c

@@ -202,7 +202,9 @@ static const sha256_func sha256_funcs[] = {
 
 
 #ifdef WANT_CPUMINE
-#if defined(WANT_X8664_SSE2) && defined(__SSE2__)
+#if defined(WANT_X8664_SSE4) && defined(__SSE4_1__)
+enum sha256_algos opt_algo = ALGO_SSE4_64;
+#elif defined(WANT_X8664_SSE2) && defined(__SSE2__)
 enum sha256_algos opt_algo = ALGO_SSE2_64;
 #elif defined(WANT_X8632_SSE2) && defined(__SSE2__)
 enum sha256_algos opt_algo = ALGO_SSE2_32;
@@ -726,8 +728,8 @@ static void cpu_detect()
 	// Reckon number of cores in the box
 	#if defined(WIN32)
 	{
-		DWORD system_am;
-		DWORD process_am;
+		DWORD_PTR system_am;
+		DWORD_PTR process_am;
 		BOOL ok = GetProcessAffinityMask(
 			GetCurrentProcess(),
 			&system_am,

+ 1 - 1
driver-cpu.h

@@ -30,7 +30,7 @@
 #define WANT_X8664_SSE2 1
 #endif
 
-#if defined(__x86_64__) && defined(HAS_YASM)
+#if defined(__x86_64__) && defined(HAS_YASM) && defined(__SSE4_1__)
 #define WANT_X8664_SSE4 1
 #endif
 

+ 1 - 1
driver-icarus.c

@@ -30,6 +30,7 @@
  */
 
 #include "config.h"
+#include "miner.h"
 
 #ifdef WIN32
 #include <winsock2.h>
@@ -62,7 +63,6 @@
 #include "dynclock.h"
 #include "elist.h"
 #include "icarus-common.h"
-#include "miner.h"
 #include "fpgautils.h"
 
 // The serial I/O speed - Linux uses a define 'B115200' in bits/termios.h

+ 1 - 1
driver-ztex.c

@@ -25,13 +25,13 @@
 
 #include "config.h"
 
+#include "miner.h"
 #include <unistd.h>
 #include <sha2.h>
 
 #include "deviceapi.h"
 #include "dynclock.h"
 #include "fpgautils.h"
-#include "miner.h"
 #include "libztex.h"
 
 #define GOLDEN_BACKLOG 5

+ 2 - 1
fpgautils.c

@@ -22,6 +22,8 @@
 #include <dirent.h>
 #include <string.h>
 
+#include "miner.h"
+
 #ifndef WIN32
 #include <errno.h>
 #include <termios.h>
@@ -59,7 +61,6 @@ enum {
 
 #include "elist.h"
 #include "logging.h"
-#include "miner.h"
 #include "fpgautils.h"
 
 #define SEARCH_NEEDLES_BEGIN()  {  \

+ 2 - 0
lib/signal.in.h

@@ -20,6 +20,8 @@
 #endif
 @PRAGMA_COLUMNS@
 
+#include "config.h"
+
 #if defined __need_sig_atomic_t || defined __need_sigset_t
 /* Special invocation convention inside glibc header files.  */
 

+ 11 - 9
miner.c

@@ -2469,9 +2469,8 @@ static void curses_print_status(void)
 	mvwhline(statuswin, 1, 0, '-', 80);
 	mvwprintw(statuswin, 2, 0, " %s", statusline);
 	wclrtoeol(statuswin);
-	mvwprintw(statuswin, 3, 0, " ST: %d  DW: %d  GW: %d  LW: %d  GF: %d  NB: %d  AS: %d  RF: %d  E: %.2f",
-		total_staged(), total_discarded,
-		total_getworks,
+	mvwprintw(statuswin, 3, 0, " ST: %d  LW: %d  GF: %d  NB: %d  AS: %d  RF: %d  E: %.2f",
+		total_staged(),
 		local_work,
 		total_go,
 		new_blocks,
@@ -5765,7 +5764,7 @@ static void *api_thread(void *userdata)
 
 	api(api_thr_id);
 
-	PTH(mythr) = 0L;
+	mythr->has_pth = false;
 
 	return NULL;
 }
@@ -5857,8 +5856,8 @@ static void hashmeter(int thr_id, struct timeval *diff,
 
 	total_mhashes_done += local_mhashes;
 	local_mhashes_done += local_mhashes;
+	/* Only update with opt_log_interval */
 	if (total_diff.tv_sec < opt_log_interval)
-		/* Only update the total every opt_log_interval seconds */
 		goto out_unlock;
 	showlog = true;
 	gettimeofday(&total_tv_end, NULL);
@@ -7811,7 +7810,6 @@ void print_summary(void)
 	applog(LOG_WARNING, "Average hashrate: %.1f Megahash/s", total_mhashes_done / total_secs);
 	applog(LOG_WARNING, "Solved blocks: %d", found_blocks);
 	applog(LOG_WARNING, "Best share difficulty: %s", best_share);
-	applog(LOG_WARNING, "Queued work requests: %d", total_getworks);
 	applog(LOG_WARNING, "Share submissions: %d", total_accepted + total_rejected);
 	applog(LOG_WARNING, "Accepted shares: %d", total_accepted);
 	applog(LOG_WARNING, "Rejected shares: %d", total_rejected);
@@ -7823,7 +7821,6 @@ void print_summary(void)
 	applog(LOG_WARNING, "Efficiency (accepted shares * difficulty / 2 KB): %.2f", efficiency);
 	applog(LOG_WARNING, "Utility (accepted shares / min): %.2f/min\n", utility);
 
-	applog(LOG_WARNING, "Discarded work due to new blocks: %d", total_discarded);
 	applog(LOG_WARNING, "Stale submissions discarded due to new blocks: %d", total_stale);
 	applog(LOG_WARNING, "Unable to get work from server occasions: %d", total_go);
 	applog(LOG_WARNING, "Work items generated locally: %d", local_work);
@@ -7837,7 +7834,6 @@ void print_summary(void)
 			applog(LOG_WARNING, "Pool: %s", pool->rpc_url);
 			if (pool->solved)
 				applog(LOG_WARNING, "SOLVED %d BLOCK%s!", pool->solved, pool->solved > 1 ? "S" : "");
-			applog(LOG_WARNING, " Queued work requests: %d", pool->getwork_requested);
 			applog(LOG_WARNING, " Share submissions: %d", pool->accepted + pool->rejected);
 			applog(LOG_WARNING, " Accepted shares: %d", pool->accepted);
 			applog(LOG_WARNING, " Rejected shares: %d", pool->rejected);
@@ -7849,7 +7845,6 @@ void print_summary(void)
 			efficiency = pool_bytes_xfer ? pool->diff_accepted * 2048. / pool_bytes_xfer : 0.0;
 			applog(LOG_WARNING, " Efficiency (accepted * difficulty / 2 KB): %.2f", efficiency);
 
-			applog(LOG_WARNING, " Discarded work due to new blocks: %d", pool->discarded_work);
 			applog(LOG_WARNING, " Stale submissions discarded due to new blocks: %d", pool->stale_shares);
 			applog(LOG_WARNING, " Unable to get work from server occasions: %d", pool->getfail_occasions);
 			applog(LOG_WARNING, " Submitting work remotely delay occasions: %d\n", pool->remotefail_occasions);
@@ -8608,8 +8603,15 @@ int main(int argc, char *argv[])
 			register_device(devices[i]);
 	}
 
+#ifdef USE_USBUTILS
+	if (!total_devices) {
+		applog(LOG_WARNING, "No devices detected!");
+		applog(LOG_WARNING, "Waiting for USB hotplug devices or press q to quit");
+	}
+#else
 	if (!total_devices)
 		quit(1, "All devices disabled, cannot mine!");
+#endif
 
 	load_temp_config();
 

+ 2 - 1
miner.h

@@ -560,6 +560,7 @@ struct thr_info {
 	int		device_thread;
 	bool		primary_thread;
 
+	bool		has_pth;
 	pthread_t	pth;
 	struct thread_q	*q;
 	struct cgpu_info *cgpu;
@@ -1218,7 +1219,7 @@ extern enum test_nonce2_result _test_nonce2(struct work *, uint32_t nonce, bool
 extern void submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce);
 extern struct work *get_queued(struct cgpu_info *cgpu);
 extern struct work *__find_work_bymidstate(struct work *que, char *midstate, size_t midstatelen, char *data, int offset, size_t datalen);
-struct work *find_queued_work_bymidstate(struct cgpu_info *cgpu, char *midstate, size_t midstatelen, char *data, int offset, size_t datalen);
+extern struct work *find_queued_work_bymidstate(struct cgpu_info *cgpu, char *midstate, size_t midstatelen, char *data, int offset, size_t datalen);
 extern void work_completed(struct cgpu_info *cgpu, struct work *work);
 extern bool abandon_work(struct work *, struct timeval *work_runtime, uint64_t hashes);
 extern void hash_queued_work(struct thr_info *mythr);

+ 2 - 2
sha256_sse2_i386.c

@@ -21,7 +21,7 @@
 #include <stdint.h>
 #include <stdio.h>
 
-extern void CalcSha256_x86 (__m128i *res, __m128i *data, const uint32_t init[8])__attribute__((fastcall));
+extern void CalcSha256_x86 (__m128i *res, __m128i *data, const uint32_t init[8])__asm__("CalcSha256_x86")__attribute__((fastcall));
 
 static uint32_t g_sha256_k[]__attribute__((aligned(0x100))) = {
     0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, /*  0 */
@@ -47,7 +47,7 @@ const uint32_t sha256_32init[8]__attribute__((aligned(0x100))) =
 {0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19};
 
 __m128i g_4sha256_k[64];
-__m128i sha256_consts_m128i[64]__attribute__((aligned(0x1000)));
+__m128i sha256_consts_m128i[64]__asm__("sha256_consts_m128i")__attribute__((aligned(0x1000)));
 
 bool scanhash_sse2_32(struct thr_info*thr, const unsigned char *pmidstate,
 	unsigned char *pdata,

+ 6 - 3
util.c

@@ -909,7 +909,10 @@ out:
 
 int thr_info_create(struct thr_info *thr, pthread_attr_t *attr, void *(*start) (void *), void *arg)
 {
-	return pthread_create(&thr->pth, attr, start, arg);
+	int rv = pthread_create(&thr->pth, attr, start, arg);
+	if (likely(!rv))
+		thr->has_pth = true;
+	return rv;
 }
 
 void thr_info_freeze(struct thr_info *thr)
@@ -938,9 +941,9 @@ void thr_info_cancel(struct thr_info *thr)
 	if (!thr)
 		return;
 
-	if (PTH(thr) != 0L) {
+	if (thr->has_pth) {
 		pthread_cancel(thr->pth);
-		PTH(thr) = 0L;
+		thr->has_pth = false;
 	}
 }
 

+ 1 - 1
x86_32/Makefile.am

@@ -5,4 +5,4 @@ SUFFIXES = .asm
 libx8632_a_SOURCES	= sha256_xmm.asm
 
 .asm.o:
-	$(YASM) -f elf32 $<
+	$(YASM) -f $(YASM_FMT) $<

+ 1 - 1
x86_64/Makefile.am

@@ -5,4 +5,4 @@ SUFFIXES = .asm
 libx8664_a_SOURCES	= sha256_xmm_amd64.asm sha256_sse4_amd64.asm
 
 .asm.o:
-	$(YASM) -f elf64 $<
+	$(YASM) -f $(YASM_FMT) -o $@ $<

+ 35 - 8
x86_64/sha256_sse4_amd64.asm

@@ -13,9 +13,17 @@
 ALIGN 32
 BITS 64
 
+%ifidn __OUTPUT_FORMAT__,win64
+%define hash rcx
+%define data rdx
+%define init r8
+%define temp r9
+%else
 %define hash rdi
 %define data rsi
 %define init rdx
+%define temp rcx
+%endif
 
 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
 %define LAB_CALC_PARA	2
@@ -27,18 +35,28 @@ extern g_4sha256_k
 
 global CalcSha256_x64_sse4
 ;	CalcSha256	hash(rdi), data(rsi), init(rdx)	-- non-win64 output formats
+;	CalcSha256	hash(rcx), data(rdx), init(r8)	-- win64 output format
 CalcSha256_x64_sse4:
 
 	push	rbx
+%ifidn __OUTPUT_FORMAT__,win64
+	sub	rsp, 16 * 6
+	movdqa	[rsp + 16*0], xmm6
+	movdqa	[rsp + 16*1], xmm7
+	movdqa	[rsp + 16*2], xmm8
+	movdqa	[rsp + 16*3], xmm9
+	movdqa	[rsp + 16*4], xmm10
+	movdqa	[rsp + 16*5], xmm11
+%endif
 
 LAB_NEXT_NONCE:
 
-	mov	rcx, 64*4					; 256 - rcx is # of SHA-2 rounds
+	mov	temp, 64*4					; 256 - temp is # of SHA-2 rounds
 	mov	rax, 16*4					; 64 - rax is where we expand to
 
 LAB_SHA:
-	push	rcx
-	lea	rcx, qword [data+rcx*4]				; + 1024
+	push	temp
+	lea	temp, qword [data+temp*4]			; + 1024
 	lea	r11, qword [data+rax*4]				; + 256
 
 LAB_CALC:
@@ -122,10 +140,10 @@ LAB_CALC:
 %endrep
 
 	add	r11, LAB_CALC_UNROLL*LAB_CALC_PARA*16
-	cmp	r11, rcx
+	cmp	r11, temp
 	jb	LAB_CALC
 
-	pop	rcx
+	pop	temp
 	mov	rax, 0
 
 ; Load the init values of the message into the hash.
@@ -219,12 +237,12 @@ LAB_LOOP:
 %assign i i+1
 %endrep
 
-	cmp	rax, rcx
+	cmp	rax, temp
 	jb	LAB_LOOP
 
 ; Finished the 64 rounds, calculate hash and save
 
-	movntdqa	xmm1, [rdx]
+	movntdqa	xmm1, [init]
 	pshufd	xmm2, xmm1, 0x55
 	paddd	xmm5, xmm2
 	pshufd	xmm6, xmm1, 0xAA
@@ -234,7 +252,7 @@ LAB_LOOP:
 	pshufd	xmm1, xmm1, 0
 	paddd	xmm7, xmm1
 
-	movntdqa	xmm1, [rdx+4*4]
+	movntdqa	xmm1, [init+4*4]
 	pshufd	xmm2, xmm1, 0x55
 	paddd	xmm8, xmm2
 	pshufd	xmm6, xmm1, 0xAA
@@ -254,6 +272,15 @@ LAB_LOOP:
 	movdqa	[hash+7*16], xmm10
 
 LAB_RET:
+%ifidn __OUTPUT_FORMAT__,win64
+	movdqa	xmm6, [rsp + 16*0]
+	movdqa	xmm7, [rsp + 16*1]
+	movdqa	xmm8, [rsp + 16*2]
+	movdqa	xmm9, [rsp + 16*3]
+	movdqa	xmm10, [rsp + 16*4]
+	movdqa	xmm11, [rsp + 16*5]
+	add	rsp, 16 * 6
+%endif
 	pop	rbx
 	ret
 

+ 25 - 0
x86_64/sha256_xmm_amd64.asm

@@ -22,10 +22,17 @@
 ALIGN 32
 BITS 64
 
+%ifidn __OUTPUT_FORMAT__,win64
+%define hash  rcx
+%define hash1 rdx
+%define data  r8
+%define init  r9
+%else
 %define hash  rdi
 %define hash1 rsi
 %define data  rdx
 %define init  rcx
+%endif
 
 ; 0 = (1024 - 256) (mod (LAB_CALC_UNROLL*LAB_CALC_PARA*16))
 %define SHA_CALC_W_PARA         2
@@ -227,6 +234,15 @@ sha256_sse2_64_new:
 %endif
 
     push        rbx
+%ifidn __OUTPUT_FORMAT__,win64
+    sub         rsp, 16 * 6
+    movdqa      [rsp + 16*0], xmm6
+    movdqa      [rsp + 16*1], xmm7
+    movdqa      [rsp + 16*2], xmm8
+    movdqa      [rsp + 16*3], xmm9
+    movdqa      [rsp + 16*4], xmm10
+    movdqa      [rsp + 16*5], xmm13
+%endif
 
 %macro  SHA_256  0
     mov         rbx, 64*4   ; rbx is # of SHA-2 rounds
@@ -318,6 +334,15 @@ sha256_sse2_64_new:
     movdqa    [hash+7*16], rH
 
 LAB_RET:
+%ifidn __OUTPUT_FORMAT__,win64
+    movdqa    xmm6, [rsp + 16*0]
+    movdqa    xmm7, [rsp + 16*1]
+    movdqa    xmm8, [rsp + 16*2]
+    movdqa    xmm9, [rsp + 16*3]
+    movdqa    xmm10, [rsp + 16*4]
+    movdqa    xmm13, [rsp + 16*5]
+    add       rsp, 16 * 6
+%endif
     pop       rbx
     ret