Browse Source

Merge branch 'scrypt' into bfgminer

Luke Dashjr 13 years ago
parent
commit
8b557bd1d1
24 changed files with 1916 additions and 100 deletions
  1. 5 1
      Makefile.am
  2. 75 0
      NEWS
  3. 9 0
      README
  4. 131 0
      SCRYPT-README
  5. 15 10
      autogen.sh
  6. 25 6
      configure.ac
  7. 2 2
      diablo120724.cl
  8. 2 2
      diakgcn120724.cl
  9. 21 2
      driver-cpu.c
  10. 6 0
      driver-cpu.h
  11. 161 24
      driver-opencl.c
  12. 5 0
      driver-opencl.h
  13. 13 3
      findnonce.c
  14. 3 3
      findnonce.h
  15. 1 1
      make-release
  16. 71 7
      miner.c
  17. 21 0
      miner.h
  18. 2 2
      mkinstalldirs
  19. 128 33
      ocl.c
  20. 6 0
      ocl.h
  21. 2 2
      phatk120724.cl
  22. 2 2
      poclbm120724.cl
  23. 453 0
      scrypt.c
  24. 757 0
      scrypt120724.cl

+ 5 - 1
Makefile.am

@@ -10,7 +10,7 @@ endif
 EXTRA_DIST	= example.conf m4/gnulib-cache.m4 linux-usb-bfgminer \
 EXTRA_DIST	= example.conf m4/gnulib-cache.m4 linux-usb-bfgminer \
 		  ADL_SDK/readme.txt api-example.php miner.php	\
 		  ADL_SDK/readme.txt api-example.php miner.php	\
 		  API.class API.java api-example.c windows-build.txt \
 		  API.class API.java api-example.c windows-build.txt \
-		  bitstreams/* API-README FPGA-README
+		  bitstreams/* API-README FPGA-README SCRYPT-README
 
 
 SUBDIRS		= lib compat ccan
 SUBDIRS		= lib compat ccan
 
 
@@ -56,6 +56,10 @@ bfgminer_SOURCES	+= \
 # the CPU portion extracted from original main.c
 # the CPU portion extracted from original main.c
 bfgminer_SOURCES += driver-cpu.h driver-cpu.c
 bfgminer_SOURCES += driver-cpu.h driver-cpu.c
 
 
+if HAS_SCRYPT
+bfgminer_SOURCES += scrypt.c
+endif
+
 if HAS_YASM
 if HAS_YASM
 AM_CFLAGS	= -DHAS_YASM
 AM_CFLAGS	= -DHAS_YASM
 if HAVE_x86_64
 if HAVE_x86_64

+ 75 - 0
NEWS

@@ -1,3 +1,76 @@
+Version 2.6.0 - July 29, 2012
+
+- Add scrypt documentation in the form of a separate readme.
+- Fix build error without scrypt enabled.
+- Limit thread concurrency for scrypt to 5xshaders if shaders is specified.
+- Simplify repeated use of gpus[gpu]. in ocl.c
+- Find the nearest power of 2 maximum alloc size for the scrypt buffer that can
+successfully be allocated and is large enough to accomodate the thread
+concurrency chosen, thus mapping it to an intensity.
+- Don't make opt_scrypt mandatory blocking with opencl code.
+- Update kernel versions reflecting changes in the API.
+- Make the thread concurrency and lookup gap options hidden on the command line
+and autotune parameters with a newly parsed --shaders option.
+- Fix target testing with scrypt kernel as it would have been missing shares
+below target.
+- Always create the largest possible padbuffer for scrypt kernels even if not
+needed for thread_concurrency, giving us some headroom for intensity levels.
+- Use the detected maximum allocable memory on a GPU to determine the optimal
+scrypt settings when lookup_gap and thread_concurrency parameters are not given.
+- Check the maximum allocable memory size per opencl device.
+- Add debugging output if buffer allocation fails for scrypt and round up
+bufsize to a multiple of 256.
+- Nonce testing for btc got screwed up, leading to no accepted shares. Fix it.
+- Display size of scrypt buffer used in debug.
+- Allow intensities up to 20 if scrypt is compiled in.
+- Add name to scrypt kernel copyright.
+- Allow lookup gap and thread concurrency to be passed per device and store
+details in kernel binary filename.
+- Ignore negative intensities for scrypt.
+- Change the scale of intensity for scrypt kernel and fix a build warning.
+- Correct target value passed to scrypt kernel.
+- Use 256 output slots for kernels to allow 1 for each worksize.
+- Test the target in the actual scrypt kernel itself saving further
+calculations.
+- Reinstate GPU only opencl device detection.
+- Decrease lookup gap to 1. Does not seem to help in any way being 2.
+- Fix build.
+- Make pad0 and pad1 local variable in scrypt kernel.
+- Constify input variable in scrypt kernel.
+- Send correct values to scrypt kernel to get it finally working.
+- Create command queue before compiling program in opencl.
+- Fix external scrypt algo missing.
+- Limit scrypt to 1 vector.
+- Handle KL_SCRYPT in config write.
+- Get rid of stuff.
+- Don't enqueuewrite buffer at all for pad8 and pass work details around for
+scrypt in dev_blk.
+- Set the correct data for cldata and prepare for pad8 fixes.
+- Get rid of spaces in arrays in scrypt kernel.
+- Start with smaller amount of hashes in cpu mining to enable scrypt to return
+today sometime.
+- Free the scratchbuf memory allocated in scrypt and don't check if CPUs are
+sick since they can't be. Prepare for khash hash rates in display.
+- Add cpumining capability for scrypt.
+- Set scrypt settings and buffer size in ocl.c code to be future modifiable.
+- Cope with when we cannot set intensity low enough to meet dynamic interval by
+inducing a forced sleep.
+- Make dynamic and scrypt opencl calls blocking.
+- Fix nonce submission code for scrypt.
+- Make sure goffset is set for scrypt and drop padbuffer8 to something
+manageable for now.
+- Set up buffer8 for scrypt.
+- Build fix for opt scrypt.
+- Don't check postcalc nonce with sha256 in scrypt.
+- Don't test nonce with sha and various fixes for scrypt.
+- Make scrypt buffers and midstate compatible.
+- Use specific output array entries in scrypt kernel.
+- Provide initial support for the scrypt kernel to compile with and mine scrypt
+with the --scrypt option.
+- Enable completely compiling scrypt out.
+- Begin import of scrypt opencl kernel from reaper.
+
+
 BFGMiner Version 2.5.2 - July 29, 2012
 BFGMiner Version 2.5.2 - July 29, 2012
 
 
 - Limit total number of curls recruited per pool to the number of mining threads
 - Limit total number of curls recruited per pool to the number of mining threads
@@ -153,6 +226,8 @@ mt_disable function.
 - Use standard cfsetispeed/cfsetospeed to set baud rate on *nix
 - Use standard cfsetispeed/cfsetospeed to set baud rate on *nix
 - miner.php split() flagged deprecated in PHP 5.3.0
 - miner.php split() flagged deprecated in PHP 5.3.0
 - Bugfix: Use nmsleep instead of restart_wait, so we always wait the full time
 - Bugfix: Use nmsleep instead of restart_wait, so we always wait the full time
+- Make long timeout 10seconds on bitforce for when usleep or nanosleep just
+can't be accurate...
 
 
 
 
 BFGMiner Version 2.5.0 - July 7, 2012
 BFGMiner Version 2.5.0 - July 7, 2012

+ 9 - 0
README

@@ -152,6 +152,7 @@ Options for both config file and command line:
 --scan-time|-s <arg> Upper bound on time spent scanning current work, in seconds (default: 60)
 --scan-time|-s <arg> Upper bound on time spent scanning current work, in seconds (default: 60)
 --sched-start <arg> Set a time of day in HH:MM to start mining (a once off without a stop time)
 --sched-start <arg> Set a time of day in HH:MM to start mining (a once off without a stop time)
 --sched-stop <arg>  Set a time of day in HH:MM to stop mining (will quit without a start time)
 --sched-stop <arg>  Set a time of day in HH:MM to stop mining (will quit without a start time)
+--scrypt            Use the scrypt algorithm for mining (non-bitcoin)
 --sharelog <arg>    Append share log to file
 --sharelog <arg>    Append share log to file
 --shares <arg>      Quit after mining N shares (default: unlimited)
 --shares <arg>      Quit after mining N shares (default: unlimited)
 --socks-proxy <arg> Set socks4 proxy (host:port)
 --socks-proxy <arg> Set socks4 proxy (host:port)
@@ -198,6 +199,14 @@ GPU only options:
 --worksize|-w <arg> Override detected optimal worksize - one value or comma separated list
 --worksize|-w <arg> Override detected optimal worksize - one value or comma separated list
 
 
 
 
+SCRYPT only options:
+
+--lookup-gap <arg>  Set GPU lookup gap for scrypt mining, comma separated
+--thread-concurrency <arg> Set GPU thread concurrency for scrypt mining, comma separated
+
+See SCRYPT-README for more information regarding (non-bitcoin) scrypt mining.
+
+
 FPGA mining boards(BitForce, Icarus, ModMiner, Ztex) only options:
 FPGA mining boards(BitForce, Icarus, ModMiner, Ztex) only options:
 
 
 --scan-serial|-S <arg> Serial port to probe for FPGA mining device
 --scan-serial|-S <arg> Serial port to probe for FPGA mining device

+ 131 - 0
SCRYPT-README

@@ -0,0 +1,131 @@
+If you wish to donate to the author of scrypt support, Con Kolivas, please send
+your donations to:
+
+Bitcoin : 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ
+Litecoin: Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH
+
+---
+
+
+Scrypt mining for GPU is completely different to sha256 used for bitcoin
+mining. It has very different requirements to bitcoin mining and is a
+lot more complicated to get working well. Note that it is a ram dependent
+workload, and requires you to have enough system ram as well as fast enough
+GPU ram.
+
+There are 5 main parameters to tuning scrypt, 2 of which you MUST set, and
+the others are optional for further fine tuning. When you start scrypt mining
+with the --scrypt option, BFGMiner will fail IN RANDOM WAYS. They are all due
+to parameters being outside what the GPU can cope with. Not giving BFGMiner a
+hint as to your GPU type, it will hardly ever perform well.
+
+
+Step 1 on linux:
+export GPU_MAX_ALLOC_PERCENT=100
+If you do not do this, you may find it impossible to scrypt mine. You may find
+a value of 40 is enough and increasing this further has little effect.
+
+export GPU_USE_SYNC_OBJECTS=1
+may help CPU usage a little as well.
+
+--shaders XXX
+
+is a new option where you tell BFGMiner how many shaders your GPU has. This
+helps BFGMiner try to choose some meaningful baseline parameters. Use this table
+below to determine how many shaders your GPU has, and note that there are some
+variants of these cards, and nvidia shaders are much much lower and virtually
+pointless trying to mine on.
+
+GPU  Shaders
+7750 512
+7770 640
+7850 1024
+7870 1280
+7950 1792
+7970 2048
+
+6850 960
+6870 1120
+6950 1408
+6970 1536
+6990 (6970x2)
+
+6570 480
+6670 480
+6790 800
+
+6450 160
+
+5670 400
+5750 720
+5770 800
+5830 1120
+5850 1440
+5870 1600
+5970 (5870x2)
+
+These are only used as a rough guide for BFGMiner, and it is rare that this is
+all you will need to set.
+
+
+--intensity XX
+
+Just like in bitcoin mining, scrypt mining takes an intensity, however the
+scale goes from 0 to 20 to mimic the "Aggression" used in mtrlt's reaper. The
+reason this is crucial is that too high an intensity can actually be
+disastrous with scrypt because it CAN run out of ram. Intensities over 13
+start writing over the same ram and it is highly dependent on the GPU, but they
+can start actually DECREASING your hashrate, or even worse, start producing
+garbage with rejects skyrocketing.
+
+
+Optional parameters to tune:
+-g, --thread-concurrency, --lookup-gap
+
+-g:
+Once you have found the optimal shaders and intensity, you can start increasing
+the -g value till BFGMiner fails to start. Rarely will you be able to go over
+about -g 4 and each increase in -g only increases hashrate slightly.
+
+--thread-concurrency:
+This tunes the optimal size of work that scrypt can do. It is internally tuned
+by BFGMiner to be the highest reasonable multiple of shaders that it can
+allocate on your GPU. Ideally it should be a multiple of your shader count.
+vliw5 architecture (R5XXX) would be best at 5x shaders, while VLIW4 (R6xxx and
+R7xxx) are best at 4x. Setting thread concurrency overrides anything you put
+into --shaders.
+
+--lookup-gap
+This tunes a compromise between ram usage and performance. Performance peaks
+at a gap of 2, but increasing the gap can save you some GPU ram, but almost
+always at the cost of significant loss of hashrate. Setting lookup gap
+overrides the default of 2, but BFGMiner will use the --shaders value to choose
+a thread-concurrency if you haven't chosen one.
+
+
+Overclocking for scrypt mining:
+First of all, do not underclock your memory initially. Scrypt mining requires
+memory speed and on most, but not all, GPUs, lowering memory speed lowers
+mining performance.
+
+Second, absolute engine clock speeds do NOT correlate with hashrate. The ratio
+of engine clock speed to memory matters, so if you set your memory to the
+default value, and then start overclocking as you are running it, you should
+find a sweet spot where the hashrate peaks and then it might actually drop if
+you increase the engine clock speed further. Unless you wish to run with a
+dynamic intensity, do not go over 13 without testing it while it's running to
+see that it increases hashrate AND utility WITHOUT increasing your rejects.
+
+
+Suggested values for 7970 for example:
+export GPU_MAX_ALLOC_PERCENT=100
+--shaders 2048 -g 5 --gpu-engine 1135 --gpu-memclock 1375
+
+
+---
+
+If you wish to donate to the author of scrypt support, Con Kolivas, please send
+your donations to:
+
+Bitcoin : 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ
+Litecoin: Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH

+ 15 - 10
autogen.sh

@@ -1,12 +1,17 @@
 #!/bin/sh
 #!/bin/sh
+cwd="$PWD"
+bs_dir="$(dirname $(readlink -f $0))"
+rm -rf "${bs_dir}"/autom4te.cache
+rm -f "${bs_dir}"/aclocal.m4 "${bs_dir}"/ltmain.sh
 
 
-# You need autoconf 2.5x, preferably 2.57 or later
-# You need automake 1.7 or later. 1.6 might work.
-
-set -e
-
-aclocal -I m4
-autoheader
-automake --add-missing --copy
-autoconf
-
+echo 'Running autoreconf -if...'
+autoreconf -if || exit 1
+if test -z "$NOCONFIGURE" ; then
+	echo 'Configuring...'
+	cd "${bs_dir}" &> /dev/null
+	test "$?" = "0" || e=1
+	test "$cwd" != "$bs_dir" && cd "$bs_dir" &> /dev/null
+	./configure $@
+	test "$e" = "1" && exit 1
+	cd "$cwd"
+fi

+ 25 - 6
configure.ac

@@ -1,8 +1,8 @@
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 m4_define([v_maj], [2])
 m4_define([v_maj], [2])
-m4_define([v_min], [5])
-m4_define([v_mic], [2])
+m4_define([v_min], [6])
+m4_define([v_mic], [0])
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 m4_define([v_ver], [v_maj.v_min.v_mic])
 m4_define([v_ver], [v_maj.v_min.v_mic])
 m4_define([lt_rev], m4_eval(v_maj + v_min))
 m4_define([lt_rev], m4_eval(v_maj + v_min))
@@ -197,6 +197,18 @@ if test "x$ztex" = xyes; then
 fi
 fi
 AM_CONDITIONAL([HAS_ZTEX], [test x$ztex = xyes])
 AM_CONDITIONAL([HAS_ZTEX], [test x$ztex = xyes])
 
 
+
+scrypt="no"
+
+AC_ARG_ENABLE([scrypt],
+	[AC_HELP_STRING([--enable-scrypt],[Compile support for scrypt mining (default disabled)])],
+	[scrypt=$enableval]
+	)
+if test "x$scrypt" = xyes; then
+	AC_DEFINE([USE_SCRYPT], [1], [Defined to 1 if scrypt support is wanted])
+fi
+
+
 curses="auto"
 curses="auto"
 
 
 AC_ARG_WITH([curses],
 AC_ARG_WITH([curses],
@@ -223,6 +235,7 @@ fi
 
 
 
 
 AM_CONDITIONAL([NEED_FPGAUTILS], [test x$icarus$bitforce$modminer$ztex != xnononono])
 AM_CONDITIONAL([NEED_FPGAUTILS], [test x$icarus$bitforce$modminer$ztex != xnononono])
+AM_CONDITIONAL([HAS_SCRYPT], [test x$scrypt = xyes])
 AM_CONDITIONAL([HAVE_CURSES], [test x$curses = xyes])
 AM_CONDITIONAL([HAVE_CURSES], [test x$curses = xyes])
 AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
 AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
 AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
 AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
@@ -343,10 +356,11 @@ fi
 
 
 AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to bfgminer install])
 AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to bfgminer install])
 
 
-AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel])
-AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120327"], [Filename for poclbm kernel])
-AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120427"], [Filename for diakgcn kernel])
-AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120328"], [Filename for diablo kernel])
+AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120724"], [Filename for phatk kernel])
+AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120724"], [Filename for poclbm kernel])
+AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120724"], [Filename for diakgcn kernel])
+AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120724"], [Filename for diablo kernel])
+AC_DEFINE_UNQUOTED([SCRYPT_KERNNAME], ["scrypt120724"], [Filename for scrypt kernel])
 
 
 
 
 AC_SUBST(JANSSON_LIBS)
 AC_SUBST(JANSSON_LIBS)
@@ -385,6 +399,11 @@ echo "Configuration Options Summary:"
 echo
 echo
 
 
 echo "  curses.TUI...........: $cursesmsg"
 echo "  curses.TUI...........: $cursesmsg"
+if test "x$scrypt" != xno; then
+	echo "  scrypt...............: Enabled"
+else
+	echo "  scrypt...............: Disabled"
+fi
 
 
 echo
 echo
 
 

+ 2 - 2
diablo120328.cl → diablo120724.cl

@@ -1242,8 +1242,8 @@ void search(
     
     
     ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]);
     ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]);
     
     
-#define FOUND (0x80)
-#define NFLAG (0x7F)
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
 
 
 #if defined(VECTORS4)
 #if defined(VECTORS4)
 	bool result = any(ZA[924] == 0x136032EDU);
 	bool result = any(ZA[924] == 0x136032EDU);

+ 2 - 2
diakgcn120427.cl → diakgcn120724.cl

@@ -571,8 +571,8 @@ __kernel
 
 
 	V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
 	V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
 
 
-#define FOUND (0x80)
-#define NFLAG (0x7F)
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
 
 
 #ifdef VECTORS4
 #ifdef VECTORS4
 	if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU))
 	if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU))

+ 21 - 2
driver-cpu.c

@@ -131,6 +131,9 @@ extern bool scanhash_sse2_32(struct thr_info*, const unsigned char *pmidstate, u
 	uint32_t max_nonce, uint32_t *last_nonce,
 	uint32_t max_nonce, uint32_t *last_nonce,
 	uint32_t nonce);
 	uint32_t nonce);
 
 
+extern bool scanhash_scrypt(struct thr_info *thr, int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done);
 
 
 
 
 
 
@@ -161,6 +164,9 @@ const char *algo_names[] = {
 #ifdef WANT_ALTIVEC_4WAY
 #ifdef WANT_ALTIVEC_4WAY
     [ALGO_ALTIVEC_4WAY] = "altivec_4way",
     [ALGO_ALTIVEC_4WAY] = "altivec_4way",
 #endif
 #endif
+#ifdef WANT_SCRYPT
+    [ALGO_SCRYPT] = "scrypt",
+#endif
 };
 };
 
 
 static const sha256_func sha256_funcs[] = {
 static const sha256_func sha256_funcs[] = {
@@ -185,7 +191,10 @@ static const sha256_func sha256_funcs[] = {
 	[ALGO_SSE2_64]		= (sha256_func)scanhash_sse2_64,
 	[ALGO_SSE2_64]		= (sha256_func)scanhash_sse2_64,
 #endif
 #endif
 #ifdef WANT_X8664_SSE4
 #ifdef WANT_X8664_SSE4
-	[ALGO_SSE4_64]		= (sha256_func)scanhash_sse4_64
+	[ALGO_SSE4_64]		= (sha256_func)scanhash_sse4_64,
+#endif
+#ifdef WANT_SCRYPT
+	[ALGO_SCRYPT]		= (sha256_func)scanhash_scrypt
 #endif
 #endif
 };
 };
 #endif
 #endif
@@ -662,6 +671,9 @@ char *set_algo(const char *arg, enum sha256_algos *algo)
 {
 {
 	enum sha256_algos i;
 	enum sha256_algos i;
 
 
+	if (opt_scrypt)
+		return "Can only use scrypt algorithm";
+
 	if (!strcmp(arg, "auto")) {
 	if (!strcmp(arg, "auto")) {
 		*algo = pick_fastest_algo();
 		*algo = pick_fastest_algo();
 		return NULL;
 		return NULL;
@@ -676,6 +688,13 @@ char *set_algo(const char *arg, enum sha256_algos *algo)
 	return "Unknown algorithm";
 	return "Unknown algorithm";
 }
 }
 
 
+#ifdef WANT_SCRYPT
+void set_scrypt_algo(enum sha256_algos *algo)
+{
+	*algo = ALGO_SCRYPT;
+}
+#endif
+
 void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo)
 void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo)
 {
 {
 	strncpy(buf, algo_names[*algo], OPT_SHOW_LEN);
 	strncpy(buf, algo_names[*algo], OPT_SHOW_LEN);
@@ -765,7 +784,7 @@ static bool cpu_thread_prepare(struct thr_info *thr)
 
 
 static uint64_t cpu_can_limit_work(__maybe_unused struct thr_info *thr)
 static uint64_t cpu_can_limit_work(__maybe_unused struct thr_info *thr)
 {
 {
-	return 0xfffff;
+	return 0xffff;
 }
 }
 
 
 static bool cpu_thread_init(struct thr_info *thr)
 static bool cpu_thread_init(struct thr_info *thr)

+ 6 - 0
driver-cpu.h

@@ -34,6 +34,10 @@
 #define WANT_X8664_SSE4 1
 #define WANT_X8664_SSE4 1
 #endif
 #endif
 
 
+#ifdef USE_SCRYPT
+#define WANT_SCRYPT
+#endif
+
 enum sha256_algos {
 enum sha256_algos {
 	ALGO_C,			/* plain C */
 	ALGO_C,			/* plain C */
 	ALGO_4WAY,		/* parallel SSE2 */
 	ALGO_4WAY,		/* parallel SSE2 */
@@ -44,6 +48,7 @@ enum sha256_algos {
 	ALGO_SSE2_64,		/* SSE2 for x86_64 */
 	ALGO_SSE2_64,		/* SSE2 for x86_64 */
 	ALGO_SSE4_64,		/* SSE4 for x86_64 */
 	ALGO_SSE4_64,		/* SSE4 for x86_64 */
 	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
 	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
+	ALGO_SCRYPT,		/* scrypt */
 };
 };
 
 
 extern const char *algo_names[];
 extern const char *algo_names[];
@@ -55,5 +60,6 @@ extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo);
 extern char *force_nthreads_int(const char *arg, int *i);
 extern char *force_nthreads_int(const char *arg, int *i);
 extern void init_max_name_len();
 extern void init_max_name_len();
 extern double bench_algo_stage3(enum sha256_algos algo);
 extern double bench_algo_stage3(enum sha256_algos algo);
+extern void set_scrypt_algo(enum sha256_algos *algo);
 
 
 #endif /* __DEVICE_CPU_H__ */
 #endif /* __DEVICE_CPU_H__ */

+ 161 - 24
driver-opencl.c

@@ -344,6 +344,83 @@ char *set_worksize(char *arg)
 	return NULL;
 	return NULL;
 }
 }
 
 
+#ifdef USE_SCRYPT
+char *set_shaders(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set lookup gap";
+	val = atoi(nextptr);
+
+	gpus[device++].shaders = val;
+
+	while ((nextptr = strtok(NULL, ",")) != NULL) {
+		val = atoi(nextptr);
+
+		gpus[device++].shaders = val;
+	}
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].shaders = gpus[0].shaders;
+	}
+
+	return NULL;
+}
+
+char *set_lookup_gap(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set lookup gap";
+	val = atoi(nextptr);
+
+	gpus[device++].lookup_gap = val;
+
+	while ((nextptr = strtok(NULL, ",")) != NULL) {
+		val = atoi(nextptr);
+
+		gpus[device++].lookup_gap = val;
+	}
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].lookup_gap = gpus[0].lookup_gap;
+	}
+
+	return NULL;
+}
+
+char *set_thread_concurrency(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set thread concurrency";
+	val = atoi(nextptr);
+
+	gpus[device++].thread_concurrency = val;
+
+	while ((nextptr = strtok(NULL, ",")) != NULL) {
+		val = atoi(nextptr);
+
+		gpus[device++].thread_concurrency = val;
+	}
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].thread_concurrency = gpus[0].thread_concurrency;
+	}
+
+	return NULL;
+}
+#endif
+
 static enum cl_kernels select_kernel(char *arg)
 static enum cl_kernels select_kernel(char *arg)
 {
 {
 	if (!strcmp(arg, "diablo"))
 	if (!strcmp(arg, "diablo"))
@@ -354,6 +431,10 @@ static enum cl_kernels select_kernel(char *arg)
 		return KL_POCLBM;
 		return KL_POCLBM;
 	if (!strcmp(arg, "phatk"))
 	if (!strcmp(arg, "phatk"))
 		return KL_PHATK;
 		return KL_PHATK;
+#ifdef USE_SCRYPT
+	if (!strcmp(arg, "scrypt"))
+		return KL_SCRYPT;
+#endif
 	return KL_NONE;
 	return KL_NONE;
 }
 }
 
 
@@ -363,6 +444,8 @@ char *set_kernel(char *arg)
 	int i, device = 0;
 	int i, device = 0;
 	char *nextptr;
 	char *nextptr;
 
 
+	if (opt_scrypt)
+		return "Cannot use sha256 kernel with scrypt";
 	nextptr = strtok(arg, ",");
 	nextptr = strtok(arg, ",");
 	if (nextptr == NULL)
 	if (nextptr == NULL)
 		return "Invalid parameters for set kernel";
 		return "Invalid parameters for set kernel";
@@ -1203,11 +1286,40 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t
 	return status;
 	return status;
 }
 }
 
 
+#ifdef USE_SCRYPT
+static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+	unsigned char *midstate = blk->work->midstate;
+	cl_kernel *kernel = &clState->kernel;
+	unsigned int num = 0;
+	cl_uint le_target;
+	cl_int status = 0;
+
+	le_target = *(cl_uint *)(blk->work->target + 28);
+	clState->cldata = blk->work->data;
+	status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
+
+	CL_SET_ARG(clState->CLbuffer0);
+	CL_SET_ARG(clState->outputBuffer);
+	CL_SET_ARG(clState->padbuffer8);
+	CL_SET_VARG(4, &midstate[0]);
+	CL_SET_VARG(4, &midstate[16]);
+	CL_SET_ARG(le_target);
+
+	return status;
+}
+#endif
+
 static void set_threads_hashes(unsigned int vectors, unsigned int *threads,
 static void set_threads_hashes(unsigned int vectors, unsigned int *threads,
 			       int64_t *hashes, size_t *globalThreads,
 			       int64_t *hashes, size_t *globalThreads,
 			       unsigned int minthreads, int intensity)
 			       unsigned int minthreads, int intensity)
 {
 {
-	*threads = 1 << (15 + intensity);
+	if (opt_scrypt) {
+		if (intensity < 0)
+			intensity = 0;
+		*threads = 1 << intensity;
+	} else
+		*threads = 1 << (15 + intensity);
 	if (*threads < minthreads)
 	if (*threads < minthreads)
 		*threads = minthreads;
 		*threads = minthreads;
 	*globalThreads = *threads;
 	*globalThreads = *threads;
@@ -1490,19 +1602,25 @@ static bool opencl_thread_prepare(struct thr_info *thr)
 	if (!cgpu->kname)
 	if (!cgpu->kname)
 	{
 	{
 		switch (clStates[i]->chosen_kernel) {
 		switch (clStates[i]->chosen_kernel) {
-		case KL_DIABLO:
-			cgpu->kname = "diablo";
-			break;
-		case KL_DIAKGCN:
-			cgpu->kname = "diakgcn";
-			break;
-		case KL_PHATK:
-			cgpu->kname = "phatk";
-			break;
-		case KL_POCLBM:
-			cgpu->kname = "poclbm";
-		default:
-			break;
+			case KL_DIABLO:
+				cgpu->kname = "diablo";
+				break;
+			case KL_DIAKGCN:
+				cgpu->kname = "diakgcn";
+				break;
+			case KL_PHATK:
+				cgpu->kname = "phatk";
+				break;
+#ifdef USE_SCRYPT
+			case KL_SCRYPT:
+				cgpu->kname = "scrypt";
+				break;
+#endif
+			case KL_POCLBM:
+				cgpu->kname = "poclbm";
+				break;
+			default:
+				break;
 		}
 		}
 	}
 	}
 	applog(LOG_INFO, "initCl() finished. Found %s", name);
 	applog(LOG_INFO, "initCl() finished. Found %s", name);
@@ -1520,7 +1638,7 @@ static bool opencl_thread_init(struct thr_info *thr)
 	struct cgpu_info *gpu = thr->cgpu;
 	struct cgpu_info *gpu = thr->cgpu;
 	struct opencl_thread_data *thrdata;
 	struct opencl_thread_data *thrdata;
 	_clState *clState = clStates[thr_id];
 	_clState *clState = clStates[thr_id];
-	cl_int status;
+	cl_int status = 0;
 	thrdata = calloc(1, sizeof(*thrdata));
 	thrdata = calloc(1, sizeof(*thrdata));
 	thr->cgpu_data = thrdata;
 	thr->cgpu_data = thrdata;
 
 
@@ -1539,6 +1657,11 @@ static bool opencl_thread_init(struct thr_info *thr)
 		case KL_DIAKGCN:
 		case KL_DIAKGCN:
 			thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
 			thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
 			break;
 			break;
+#ifdef USE_SCRYPT
+		case KL_SCRYPT:
+			thrdata->queue_kernel_parameters = &queue_scrypt_kernel;
+			break;
+#endif
 		default:
 		default:
 		case KL_DIABLO:
 		case KL_DIABLO:
 			thrdata->queue_kernel_parameters = &queue_diablo_kernel;
 			thrdata->queue_kernel_parameters = &queue_diablo_kernel;
@@ -1553,7 +1676,7 @@ static bool opencl_thread_init(struct thr_info *thr)
 		return false;
 		return false;
 	}
 	}
 
 
-	status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
+	status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
 			BUFFERSIZE, blank_res, 0, NULL, NULL);
 			BUFFERSIZE, blank_res, 0, NULL, NULL);
 	if (unlikely(status != CL_SUCCESS)) {
 	if (unlikely(status != CL_SUCCESS)) {
 		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
 		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
@@ -1582,7 +1705,12 @@ static void opencl_free_work(struct thr_info *thr, struct work *work)
 
 
 static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work)
 static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work)
 {
 {
-	precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64));
+#ifdef USE_SCRYPT
+	if (opt_scrypt)
+		work->blk.work = work;
+	else
+#endif
+		precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64));
 	return true;
 	return true;
 }
 }
 
 
@@ -1597,6 +1725,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 	_clState *clState = clStates[thr_id];
 	_clState *clState = clStates[thr_id];
 	const cl_kernel *kernel = &clState->kernel;
 	const cl_kernel *kernel = &clState->kernel;
 	const int dynamic_us = opt_dynamic_interval * 1000;
 	const int dynamic_us = opt_dynamic_interval * 1000;
+	cl_bool blocking;
 
 
 	cl_int status;
 	cl_int status;
 	size_t globalThreads[1];
 	size_t globalThreads[1];
@@ -1604,14 +1733,20 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 	unsigned int threads;
 	unsigned int threads;
 	int64_t hashes;
 	int64_t hashes;
 
 
+	if (gpu->dynamic)
+		blocking = CL_TRUE;
+	else
+		blocking = CL_FALSE;
+
 	/* This finish flushes the readbuffer set with CL_FALSE later */
 	/* This finish flushes the readbuffer set with CL_FALSE later */
-	clFinish(clState->commandQueue);
-	gettimeofday(&gpu->tv_gpuend, NULL);
+	if (!blocking)
+		clFinish(clState->commandQueue);
 
 
 	if (gpu->dynamic) {
 	if (gpu->dynamic) {
 		struct timeval diff;
 		struct timeval diff;
 		suseconds_t gpu_us;
 		suseconds_t gpu_us;
 
 
+		gettimeofday(&gpu->tv_gpuend, NULL);
 		timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff);
 		timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff);
 		gpu_us = diff.tv_sec * 1000000 + diff.tv_usec;
 		gpu_us = diff.tv_sec * 1000000 + diff.tv_usec;
 		if (likely(gpu_us >= 0)) {
 		if (likely(gpu_us >= 0)) {
@@ -1633,6 +1768,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 			   localThreads[0], gpu->intensity);
 			   localThreads[0], gpu->intensity);
 	if (hashes > gpu->max_hashes)
 	if (hashes > gpu->max_hashes)
 		gpu->max_hashes = hashes;
 		gpu->max_hashes = hashes;
+
 	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
 	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
 	if (unlikely(status != CL_SUCCESS)) {
 	if (unlikely(status != CL_SUCCESS)) {
 		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
 		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
@@ -1642,7 +1778,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 	/* MAXBUFFERS entry is used as a flag to say nonces exist */
 	/* MAXBUFFERS entry is used as a flag to say nonces exist */
 	if (thrdata->res[FOUND]) {
 	if (thrdata->res[FOUND]) {
 		/* Clear the buffer again */
 		/* Clear the buffer again */
-		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
+		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0,
 				BUFFERSIZE, blank_res, 0, NULL, NULL);
 				BUFFERSIZE, blank_res, 0, NULL, NULL);
 		if (unlikely(status != CL_SUCCESS)) {
 		if (unlikely(status != CL_SUCCESS)) {
 			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
 			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
@@ -1657,7 +1793,8 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 			postcalc_hash_async(thr, work, thrdata->res);
 			postcalc_hash_async(thr, work, thrdata->res);
 		}
 		}
 		memset(thrdata->res, 0, BUFFERSIZE);
 		memset(thrdata->res, 0, BUFFERSIZE);
-		clFinish(clState->commandQueue);
+		if (!blocking)
+			clFinish(clState->commandQueue);
 	}
 	}
 
 
 	gettimeofday(&gpu->tv_gpustart, NULL);
 	gettimeofday(&gpu->tv_gpustart, NULL);
@@ -1672,14 +1809,14 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
 		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
 						globalThreads, localThreads, 0,  NULL, NULL);
 						globalThreads, localThreads, 0,  NULL, NULL);
 	if (unlikely(status != CL_SUCCESS)) {
 	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)");
+		applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
 		return -1;
 		return -1;
 	}
 	}
 
 
-	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
+	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0,
 			BUFFERSIZE, thrdata->res, 0, NULL, NULL);
 			BUFFERSIZE, thrdata->res, 0, NULL, NULL);
 	if (unlikely(status != CL_SUCCESS)) {
 	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)");
+		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status);
 		return -1;
 		return -1;
 	}
 	}
 
 

+ 5 - 0
driver-opencl.h

@@ -18,6 +18,11 @@ extern char *set_temp_target(char *arg);
 extern char *set_intensity(char *arg);
 extern char *set_intensity(char *arg);
 extern char *set_vector(char *arg);
 extern char *set_vector(char *arg);
 extern char *set_worksize(char *arg);
 extern char *set_worksize(char *arg);
+#ifdef USE_SCRYPT
+extern char *set_shaders(char *arg);
+extern char *set_lookup_gap(char *arg);
+extern char *set_thread_concurrency(char *arg);
+#endif
 extern char *set_kernel(char *arg);
 extern char *set_kernel(char *arg);
 void manage_gpu(void);
 void manage_gpu(void);
 extern void pause_dynamic_threads(int gpu);
 extern void pause_dynamic_threads(int gpu);

+ 13 - 3
findnonce.c

@@ -45,7 +45,8 @@ const uint32_t SHA256_K[64] = {
 	d = d + h; \
 	d = d + h; \
 	h = h + (rotate(a, 30) ^ rotate(a, 19) ^ rotate(a, 10)) + ((a & b) | (c & (a | b)))
 	h = h + (rotate(a, 30) ^ rotate(a, 19) ^ rotate(a, 10)) + ((a & b) | (c & (a | b)))
 
 
-void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
+void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data)
+{
 	cl_uint A, B, C, D, E, F, G, H;
 	cl_uint A, B, C, D, E, F, G, H;
 
 
 	A = state[0];
 	A = state[0];
@@ -228,9 +229,18 @@ static void *postcalc_hash(void *userdata)
 	pthread_detach(pthread_self());
 	pthread_detach(pthread_self());
 
 
 	for (entry = 0; entry < FOUND; entry++) {
 	for (entry = 0; entry < FOUND; entry++) {
-		if (pcd->res[entry])
-			send_nonce(pcd, pcd->res[entry]);
+		uint32_t nonce = pcd->res[entry];
+
+		if (nonce) {
+			applog(LOG_DEBUG, "OCL NONCE %u", nonce);
+#ifdef USE_SCRYPT
+			if (opt_scrypt)
+				submit_nonce(thr, pcd->work, nonce);
+			else
+#endif
+				send_nonce(pcd, nonce);
 		nonces++;
 		nonces++;
+		}
 	}
 	}
 
 
 	free(pcd);
 	free(pcd);

+ 3 - 3
findnonce.h

@@ -4,10 +4,10 @@
 #include "config.h"
 #include "config.h"
 
 
 #define MAXTHREADS (0xFFFFFFFEULL)
 #define MAXTHREADS (0xFFFFFFFEULL)
-#define MAXBUFFERS (0xFF)
+#define MAXBUFFERS (0xFFF)
 #define BUFFERSIZE (sizeof(uint32_t) * MAXBUFFERS)
 #define BUFFERSIZE (sizeof(uint32_t) * MAXBUFFERS)
-#define FOUND (0x80)
-/* #define NFLAG (0x7F) Just for reference */
+#define FOUND (0x800)
+/* #define NFLAG (0x7FF) Just for reference */
 
 
 #ifdef HAVE_OPENCL
 #ifdef HAVE_OPENCL
 extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data);
 extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data);

+ 1 - 1
make-release

@@ -31,7 +31,7 @@ cd ..
 zip -r "$IDIR/${sw}.zip" "$sw"
 zip -r "$IDIR/${sw}.zip" "$sw"
 tar cjvpf "$IDIR/${sw}.tbz2" "$sw"
 tar cjvpf "$IDIR/${sw}.tbz2" "$sw"
 SRCDIR="$TMPDIR/$sw"
 SRCDIR="$TMPDIR/$sw"
-for txt in AUTHORS COPYING NEWS README API-README FPGA-README; do
+for txt in AUTHORS COPYING NEWS README API-README FPGA-README SCRYPT-README; do
 	cp -v "$txt" "$OUTDIR/${txt}.txt"
 	cp -v "$txt" "$OUTDIR/${txt}.txt"
 done
 done
 cp -av "bitstreams" "$OUTDIR/"
 cp -av "bitstreams" "$OUTDIR/"

+ 71 - 7
miner.c

@@ -109,6 +109,12 @@ int nDevs;
 int opt_g_threads = 2;
 int opt_g_threads = 2;
 int gpu_threads;
 int gpu_threads;
 #endif
 #endif
+#ifdef USE_SCRYPT
+static char detect_algo = 1;
+bool opt_scrypt;
+#else
+static char detect_algo;
+#endif
 bool opt_restart = true;
 bool opt_restart = true;
 static bool opt_nogpu;
 static bool opt_nogpu;
 
 
@@ -852,6 +858,11 @@ static struct opt_table opt_config_table[] = {
 	OPT_WITH_ARG("--gpu-vddc",
 	OPT_WITH_ARG("--gpu-vddc",
 		     set_gpu_vddc, NULL, NULL,
 		     set_gpu_vddc, NULL, NULL,
 		     "Set the GPU voltage in Volts - one value for all or separate by commas for per card"),
 		     "Set the GPU voltage in Volts - one value for all or separate by commas for per card"),
+#endif
+#ifdef USE_SCRYPT
+	OPT_WITH_ARG("--lookup-gap",
+		     set_lookup_gap, NULL, NULL,
+		     "Set GPU lookup gap for scrypt mining, comma separated"),
 #endif
 #endif
 	OPT_WITH_ARG("--intensity|-I",
 	OPT_WITH_ARG("--intensity|-I",
 		     set_intensity, NULL, NULL,
 		     set_intensity, NULL, NULL,
@@ -865,7 +876,7 @@ static struct opt_table opt_config_table[] = {
 #ifdef HAVE_OPENCL
 #ifdef HAVE_OPENCL
 	OPT_WITH_ARG("--kernel|-k",
 	OPT_WITH_ARG("--kernel|-k",
 		     set_kernel, NULL, NULL,
 		     set_kernel, NULL, NULL,
-		     "Override kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"),
+		     "Override sha256 kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"),
 #endif
 #endif
 #ifdef USE_ICARUS
 #ifdef USE_ICARUS
 	OPT_WITH_ARG("--icarus-timing",
 	OPT_WITH_ARG("--icarus-timing",
@@ -958,6 +969,16 @@ static struct opt_table opt_config_table[] = {
 	OPT_WITH_ARG("--sched-stop",
 	OPT_WITH_ARG("--sched-stop",
 		     set_schedtime, NULL, &schedstop,
 		     set_schedtime, NULL, &schedstop,
 		     "Set a time of day in HH:MM to stop mining (will quit without a start time)"),
 		     "Set a time of day in HH:MM to stop mining (will quit without a start time)"),
+#ifdef USE_SCRYPT
+	OPT_WITHOUT_ARG("--scrypt",
+			opt_set_bool, &opt_scrypt,
+			"Use the scrypt algorithm for mining (non-bitcoin)"),
+#ifdef HAVE_OPENCL
+	OPT_WITH_ARG("--shaders",
+		     set_shaders, NULL, NULL,
+		     "GPU shaders per card for tuning scrypt, comma separated"),
+#endif
+#endif
 	OPT_WITH_ARG("--sharelog",
 	OPT_WITH_ARG("--sharelog",
 		     set_sharelog, NULL, NULL,
 		     set_sharelog, NULL, NULL,
 		     "Append share log to file"),
 		     "Append share log to file"),
@@ -999,6 +1020,11 @@ static struct opt_table opt_config_table[] = {
 			opt_hidden
 			opt_hidden
 #endif
 #endif
 	),
 	),
+#if defined(USE_SCRYPT) && defined(HAVE_OPENCL)
+	OPT_WITH_ARG("--thread-concurrency",
+		     set_thread_concurrency, NULL, NULL,
+		     "Set GPU thread concurrency for scrypt mining, comma separated"),
+#endif
 	OPT_WITH_ARG("--url|-o",
 	OPT_WITH_ARG("--url|-o",
 		     set_url, NULL, NULL,
 		     set_url, NULL, NULL,
 		     "URL for bitcoin JSON-RPC server"),
 		     "URL for bitcoin JSON-RPC server"),
@@ -1280,6 +1306,13 @@ static bool work_decode(const json_t *val, struct work *work)
 {
 {
 	unsigned char bits = 0, i;
 	unsigned char bits = 0, i;
 	
 	
+	if (unlikely(detect_algo == 1)) {
+		json_t *tmp = json_object_get(val, "algorithm");
+		const char *v = tmp ? json_string_value(tmp) : "";
+		if (strncasecmp(v, "scrypt", 6))
+			detect_algo = 2;
+	}
+	
 	if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data), true))) {
 	if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data), true))) {
 		applog(LOG_ERR, "JSON inval data");
 		applog(LOG_ERR, "JSON inval data");
 		goto err_out;
 		goto err_out;
@@ -1595,9 +1628,6 @@ static void curses_print_devstatus(int thr_id)
 	char logline[255];
 	char logline[255];
 	int ypos;
 	int ypos;
 
 
-	cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60;
-	cgpu->utility_diff1 = cgpu->accepted_weighed / ( total_secs ?: 1 ) * 60;
-
 	/* Check this isn't out of the window size */
 	/* Check this isn't out of the window size */
 	ypos = cgpu->cgminer_id;
 	ypos = cgpu->cgminer_id;
 	ypos += devsummaryYOffset;
 	ypos += devsummaryYOffset;
@@ -1606,6 +1636,10 @@ static void curses_print_devstatus(int thr_id)
 	ypos += devcursor;
 	ypos += devcursor;
 	if (ypos >= statusy - 1)
 	if (ypos >= statusy - 1)
 		return;
 		return;
+
+	cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60;
+	cgpu->utility_diff1 = cgpu->accepted_weighed / ( total_secs ?: 1 ) * 60;
+
 	if (wmove(statuswin, ypos, 0) == ERR)
 	if (wmove(statuswin, ypos, 0) == ERR)
 		return;
 		return;
 	wprintw(statuswin, " %s %*d: ", cgpu->api->name, dev_width, cgpu->device_id);
 	wprintw(statuswin, " %s %*d: ", cgpu->api->name, dev_width, cgpu->device_id);
@@ -1899,8 +1933,13 @@ static bool submit_upstream_work(const struct work *work, CURL *curl)
 
 
 	if (!QUIET) {
 	if (!QUIET) {
 		hash32 = (uint32_t *)(work->hash);
 		hash32 = (uint32_t *)(work->hash);
-		sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]),
-			work->block? " BLOCK!" : "");
+		if (opt_scrypt) {
+			sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[7]), (unsigned long)(hash32[6]),
+				work->block? " BLOCK!" : "");
+		} else {
+			sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]),
+				work->block? " BLOCK!" : "");
+		}
 	}
 	}
 
 
 	/* Theoretically threads could race when modifying accepted and
 	/* Theoretically threads could race when modifying accepted and
@@ -3149,6 +3188,11 @@ void write_config(FILE *fcfg)
 				case KL_DIABLO:
 				case KL_DIABLO:
 					fprintf(fcfg, "diablo");
 					fprintf(fcfg, "diablo");
 					break;
 					break;
+#ifdef USE_SCRYPT
+				case KL_SCRYPT:
+					fprintf(fcfg, "scrypt");
+					break;
+#endif
 			}
 			}
 		}
 		}
 #ifdef HAVE_ADL
 #ifdef HAVE_ADL
@@ -4326,6 +4370,13 @@ bool hashtest(const struct work *work, bool checktarget)
 
 
 bool test_nonce(struct work *work, uint32_t nonce, bool checktarget)
 bool test_nonce(struct work *work, uint32_t nonce, bool checktarget)
 {
 {
+	if (opt_scrypt) {
+		uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12);
+
+		*work_nonce = nonce;
+		return true;
+	}
+
 	work->data[64 + 12 + 0] = (nonce >> 0) & 0xff;
 	work->data[64 + 12 + 0] = (nonce >> 0) & 0xff;
 	work->data[64 + 12 + 1] = (nonce >> 8) & 0xff;
 	work->data[64 + 12 + 1] = (nonce >> 8) & 0xff;
 	work->data[64 + 12 + 2] = (nonce >> 16) & 0xff;
 	work->data[64 + 12 + 2] = (nonce >> 16) & 0xff;
@@ -4339,7 +4390,7 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce)
 	/* Do one last check before attempting to submit the work */
 	/* Do one last check before attempting to submit the work */
 	/* Side effect: sets work->data for us */
 	/* Side effect: sets work->data for us */
 	if (!test_nonce(work, nonce, true)) {
 	if (!test_nonce(work, nonce, true)) {
-		applog(LOG_INFO, "Share below target");
+		applog(LOG_INFO, "Pool %d share below target", work->pool->pool_no);
 		return true;
 		return true;
 	}
 	}
 	return submit_work_sync(thr, work);
 	return submit_work_sync(thr, work);
@@ -5607,6 +5658,11 @@ int main(int argc, char *argv[])
 		opt_log_output = true;
 		opt_log_output = true;
 
 
 #ifdef WANT_CPUMINE
 #ifdef WANT_CPUMINE
+#ifdef USE_SCRYPT
+	if (opt_scrypt)
+		set_scrypt_algo(&opt_algo);
+	else
+#endif
 	if (0 <= opt_bench_algo) {
 	if (0 <= opt_bench_algo) {
 		double rate = bench_algo_stage3(opt_bench_algo);
 		double rate = bench_algo_stage3(opt_bench_algo);
 
 
@@ -5856,6 +5912,14 @@ int main(int argc, char *argv[])
 		}
 		}
 	} while (!pools_active);
 	} while (!pools_active);
 
 
+#ifdef USE_SCRYPT
+	if (detect_algo == 1 && !opt_scrypt) {
+		applog(LOG_NOTICE, "Detected scrypt algorithm");
+		opt_scrypt = true;
+	}
+#endif
+	detect_algo = 0;
+
 begin_bench:
 begin_bench:
 	total_mhashes_done = 0;
 	total_mhashes_done = 0;
 	for (i = 0; i < total_devices; i++) {
 	for (i = 0; i < total_devices; i++) {

+ 21 - 0
miner.h

@@ -261,6 +261,7 @@ enum cl_kernels {
 	KL_PHATK,
 	KL_PHATK,
 	KL_DIAKGCN,
 	KL_DIAKGCN,
 	KL_DIABLO,
 	KL_DIABLO,
+	KL_SCRYPT,
 };
 };
 
 
 enum dev_reason {
 enum dev_reason {
@@ -357,10 +358,17 @@ struct cgpu_info {
 	int virtual_adl;
 	int virtual_adl;
 	int intensity;
 	int intensity;
 	bool dynamic;
 	bool dynamic;
+
 	cl_uint vwidth;
 	cl_uint vwidth;
 	size_t work_size;
 	size_t work_size;
 	enum cl_kernels kernel;
 	enum cl_kernels kernel;
+	cl_ulong max_alloc;
 
 
+#ifdef USE_SCRYPT
+	int lookup_gap;
+	int thread_concurrency;
+	int shaders;
+#endif
 	struct timeval tv_gpustart;;
 	struct timeval tv_gpustart;;
 	struct timeval tv_gpuend;
 	struct timeval tv_gpuend;
 	double gpu_us_average;
 	double gpu_us_average;
@@ -612,8 +620,13 @@ extern void add_pool_details(bool live, char *url, char *user, char *pass);
 
 
 #define MIN_INTENSITY -10
 #define MIN_INTENSITY -10
 #define _MIN_INTENSITY_STR "-10"
 #define _MIN_INTENSITY_STR "-10"
+#ifdef USE_SCRYPT
+#define MAX_INTENSITY 20
+#define _MAX_INTENSITY_STR "20"
+#else
 #define MAX_INTENSITY 14
 #define MAX_INTENSITY 14
 #define _MAX_INTENSITY_STR "14"
 #define _MAX_INTENSITY_STR "14"
+#endif
 
 
 extern struct list_head scan_devices;
 extern struct list_head scan_devices;
 extern int nDevs;
 extern int nDevs;
@@ -625,6 +638,11 @@ extern bool opt_quiet;
 extern struct thr_info *thr_info;
 extern struct thr_info *thr_info;
 extern struct cgpu_info gpus[MAX_GPUDEVICES];
 extern struct cgpu_info gpus[MAX_GPUDEVICES];
 extern int gpu_threads;
 extern int gpu_threads;
+#ifdef USE_SCRYPT
+extern bool opt_scrypt;
+#else
+#define opt_scrypt (0)
+#endif
 extern double total_secs;
 extern double total_secs;
 extern int mining_threads;
 extern int mining_threads;
 extern struct cgpu_info *cpus;
 extern struct cgpu_info *cpus;
@@ -672,6 +690,9 @@ typedef struct {
 	cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17;
 	cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17;
 	cl_uint zeroA, zeroB;
 	cl_uint zeroA, zeroB;
 	cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA;
 	cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA;
+#ifdef USE_SCRYPT
+	struct work *work;
+#endif
 } dev_blk_ctx;
 } dev_blk_ctx;
 #else
 #else
 typedef struct {
 typedef struct {

+ 2 - 2
mkinstalldirs

@@ -81,9 +81,9 @@ case $dirmode in
       echo "mkdir -p -- $*"
       echo "mkdir -p -- $*"
       exec mkdir -p -- "$@"
       exec mkdir -p -- "$@"
     else
     else
-      # On NextStep and OpenStep, the `mkdir' command does not
+      # On NextStep and OpenStep, the 'mkdir' command does not
       # recognize any option.  It will interpret all options as
       # recognize any option.  It will interpret all options as
-      # directories to create, and then abort because `.' already
+      # directories to create, and then abort because '.' already
       # exists.
       # exists.
       test -d ./-p && rmdir ./-p
       test -d ./-p && rmdir ./-p
       test -d ./--version && rmdir ./--version
       test -d ./--version && rmdir ./--version

+ 128 - 33
ocl.c

@@ -385,6 +385,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 {
 {
 	_clState *clState = calloc(1, sizeof(_clState));
 	_clState *clState = calloc(1, sizeof(_clState));
 	bool patchbfi = false, prog_built = false;
 	bool patchbfi = false, prog_built = false;
+	struct cgpu_info *cgpu = &gpus[gpu];
 	cl_platform_id platform = NULL;
 	cl_platform_id platform = NULL;
 	char pbuff[256], vbuff[255];
 	char pbuff[256], vbuff[255];
 	cl_platform_id* platforms;
 	cl_platform_id* platforms;
@@ -486,6 +487,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 		return NULL;
 		return NULL;
 	}
 	}
 
 
+	/////////////////////////////////////////////////////////////////
+	// Create an OpenCL command queue
+	/////////////////////////////////////////////////////////////////
+	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
+						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
+	if (status != CL_SUCCESS) /* Try again without OOE enable */
+		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
+	if (status != CL_SUCCESS) {
+		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
+		return NULL;
+	}
+
 	/* Check for BFI INT support. Hopefully people don't mix devices with
 	/* Check for BFI INT support. Hopefully people don't mix devices with
 	 * and without it! */
 	 * and without it! */
 	char * extensions = malloc(1024);
 	char * extensions = malloc(1024);
@@ -528,18 +541,30 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 	}
 	}
 	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);
 	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);
 
 
+	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
+	if (status != CL_SUCCESS) {
+		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
+		return NULL;
+	}
+	applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc);
+
 	/* Create binary filename based on parameters passed to opencl
 	/* Create binary filename based on parameters passed to opencl
 	 * compiler to ensure we only load a binary that matches what would
 	 * compiler to ensure we only load a binary that matches what would
 	 * have otherwise created. The filename is:
 	 * have otherwise created. The filename is:
 	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
 	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
+	 * For scrypt the filename is:
+	 * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin
 	 */
 	 */
 	char binaryfilename[255];
 	char binaryfilename[255];
 	char filename[255];
 	char filename[255];
 	char numbuf[10];
 	char numbuf[10];
 
 
-	if (gpus[gpu].kernel == KL_NONE) {
-		/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
-		if (!strstr(name, "Tahiti") &&
+	if (cgpu->kernel == KL_NONE) {
+		if (opt_scrypt) {
+			applog(LOG_INFO, "Selecting scrypt kernel");
+			clState->chosen_kernel = KL_SCRYPT;
+		} else if (!strstr(name, "Tahiti") &&
+			/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
 			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
 			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
 			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
 			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
 			 strstr(vbuff, "831.4") ||
 			 strstr(vbuff, "831.4") ||
@@ -556,9 +581,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 			applog(LOG_INFO, "Selecting phatk kernel");
 			applog(LOG_INFO, "Selecting phatk kernel");
 			clState->chosen_kernel = KL_PHATK;
 			clState->chosen_kernel = KL_PHATK;
 		}
 		}
-		gpus[gpu].kernel = clState->chosen_kernel;
+		cgpu->kernel = clState->chosen_kernel;
 	} else {
 	} else {
-		clState->chosen_kernel = gpus[gpu].kernel;
+		clState->chosen_kernel = cgpu->kernel;
 		if (clState->chosen_kernel == KL_PHATK &&
 		if (clState->chosen_kernel == KL_PHATK &&
 		    (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") ||
 		    (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") ||
 		     strstr(vbuff, "831.4") || strstr(vbuff, "898.1") ||
 		     strstr(vbuff, "831.4") || strstr(vbuff, "898.1") ||
@@ -591,6 +616,12 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 			strcpy(filename, DIAKGCN_KERNNAME".cl");
 			strcpy(filename, DIAKGCN_KERNNAME".cl");
 			strcpy(binaryfilename, DIAKGCN_KERNNAME);
 			strcpy(binaryfilename, DIAKGCN_KERNNAME);
 			break;
 			break;
+		case KL_SCRYPT:
+			strcpy(filename, SCRYPT_KERNNAME".cl");
+			strcpy(binaryfilename, SCRYPT_KERNNAME);
+			/* Scrypt only supports vector 1 */
+			cgpu->vwidth = 1;
+			break;
 		case KL_NONE: /* Shouldn't happen */
 		case KL_NONE: /* Shouldn't happen */
 		case KL_DIABLO:
 		case KL_DIABLO:
 			strcpy(filename, DIABLO_KERNNAME".cl");
 			strcpy(filename, DIABLO_KERNNAME".cl");
@@ -598,24 +629,61 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 			break;
 			break;
 	}
 	}
 
 
-	if (gpus[gpu].vwidth)
-		clState->vwidth = gpus[gpu].vwidth;
+	if (cgpu->vwidth)
+		clState->vwidth = cgpu->vwidth;
 	else {
 	else {
 		clState->vwidth = preferred_vwidth;
 		clState->vwidth = preferred_vwidth;
-		gpus[gpu].vwidth = preferred_vwidth;
+		cgpu->vwidth = preferred_vwidth;
 	}
 	}
 
 
-	if ((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
-		clState->vwidth == 1 && clState->hasOpenCL11plus)
+	if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
+		clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt)
 			clState->goffset = true;
 			clState->goffset = true;
 
 
-	if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size)
-		clState->wsize = gpus[gpu].work_size;
+	if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
+		clState->wsize = cgpu->work_size;
 	else if (strstr(name, "Tahiti"))
 	else if (strstr(name, "Tahiti"))
 		clState->wsize = 64;
 		clState->wsize = 64;
 	else
 	else
 		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
 		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
-	gpus[gpu].work_size = clState->wsize;
+	cgpu->work_size = clState->wsize;
+
+#ifdef USE_SCRYPT
+	if (opt_scrypt) {
+		cl_ulong ma = cgpu->max_alloc, mt;
+		int pow2 = 0;
+
+		if (!cgpu->lookup_gap) {
+			applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
+			cgpu->lookup_gap = 2;
+		}
+		if (!cgpu->thread_concurrency) {
+			cgpu->thread_concurrency = ma / 32768 / cgpu->lookup_gap;
+			if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
+				cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
+				if (cgpu->thread_concurrency > cgpu->shaders * 5)
+					cgpu->thread_concurrency = cgpu->shaders * 5;
+			}
+				
+			applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu,  cgpu->thread_concurrency);
+		}
+
+		/* If we have memory to spare, try to find a power of 2 value
+		 * >= required amount to map nicely to an intensity */
+		mt = cgpu->thread_concurrency * 32768 * cgpu->lookup_gap;
+		if (ma > mt) {
+			while (ma >>= 1)
+				pow2++;
+			ma = 1;
+			while (--pow2 && ma < mt)
+				ma <<= 1;
+			if (ma >= mt) {
+				cgpu->max_alloc = ma;
+				applog(LOG_DEBUG, "Max alloc decreased to %lu", cgpu->max_alloc);
+			}
+		}
+	}
+#endif
 
 
 	FILE *binaryfile;
 	FILE *binaryfile;
 	size_t *binary_sizes;
 	size_t *binary_sizes;
@@ -644,14 +712,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 	strcat(binaryfilename, name);
 	strcat(binaryfilename, name);
 	if (clState->goffset)
 	if (clState->goffset)
 		strcat(binaryfilename, "g");
 		strcat(binaryfilename, "g");
-	strcat(binaryfilename, "v");
-	sprintf(numbuf, "%d", clState->vwidth);
-	strcat(binaryfilename, numbuf);
-	strcat(binaryfilename, "w");
-	sprintf(numbuf, "%d", (int)clState->wsize);
+	if (opt_scrypt) {
+#ifdef USE_SCRYPT
+		sprintf(numbuf, "lg%dtc%d", cgpu->lookup_gap, cgpu->thread_concurrency);
+		strcat(binaryfilename, numbuf);
+#endif
+	} else {
+		sprintf(numbuf, "v%d", clState->vwidth);
+		strcat(binaryfilename, numbuf);
+	}
+	sprintf(numbuf, "w%d", (int)clState->wsize);
 	strcat(binaryfilename, numbuf);
 	strcat(binaryfilename, numbuf);
-	strcat(binaryfilename, "l");
-	sprintf(numbuf, "%d", (int)sizeof(long));
+	sprintf(numbuf, "l%d", (int)sizeof(long));
 	strcat(binaryfilename, numbuf);
 	strcat(binaryfilename, numbuf);
 	strcat(binaryfilename, ".bin");
 	strcat(binaryfilename, ".bin");
 
 
@@ -712,8 +784,16 @@ build:
 	/* create a cl program executable for all the devices specified */
 	/* create a cl program executable for all the devices specified */
 	char *CompilerOptions = calloc(1, 256);
 	char *CompilerOptions = calloc(1, 256);
 
 
-	sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
-		(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
+#ifdef USE_SCRYPT
+	if (opt_scrypt)
+		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
+			cgpu->lookup_gap, cgpu->thread_concurrency, (int)clState->wsize);
+	else
+#endif
+	{
+		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
+			(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
+	}
 	applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize);
 	applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize);
 	if (clState->vwidth > 1)
 	if (clState->vwidth > 1)
 		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
 		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
@@ -892,18 +972,33 @@ built:
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	/////////////////////////////////////////////////////////////////
-	// Create an OpenCL command queue
-	/////////////////////////////////////////////////////////////////
-	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
-						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
-	if (status != CL_SUCCESS) /* Try again without OOE enable */
-		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
-	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
-		return NULL;
-	}
+#ifdef USE_SCRYPT
+	if (opt_scrypt) {
+		size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0));
+		size_t bufsize = 128 * ipt * cgpu->thread_concurrency;
+
+		/* Use the max alloc value which has been rounded to a power of
+		 * 2 greater >= required amount earlier */
+		if (bufsize > cgpu->max_alloc) {
+			applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u",
+			       gpu, cgpu->max_alloc, bufsize);
+		} else
+			bufsize = cgpu->max_alloc;
+		applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize);
+		clState->padbufsize = bufsize;
+		clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
+		if (status != CL_SUCCESS) {
+			applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease CT or increase LG", status);
+			return NULL;
+		}
 
 
+		clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
+		if (status != CL_SUCCESS) {
+			applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
+			return NULL;
+		}
+	}
+#endif
 	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
 	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
 	if (status != CL_SUCCESS) {
 	if (status != CL_SUCCESS) {
 		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
 		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);

+ 6 - 0
ocl.h

@@ -15,6 +15,12 @@ typedef struct {
 	cl_command_queue commandQueue;
 	cl_command_queue commandQueue;
 	cl_program program;
 	cl_program program;
 	cl_mem outputBuffer;
 	cl_mem outputBuffer;
+#ifdef USE_SCRYPT
+	cl_mem CLbuffer0;
+	cl_mem padbuffer8;
+	size_t padbufsize;
+	void * cldata;
+#endif
 	bool hasBitAlign;
 	bool hasBitAlign;
 	bool hasOpenCL11plus;
 	bool hasOpenCL11plus;
 	bool goffset;
 	bool goffset;

+ 2 - 2
phatk120223.cl → phatk120724.cl

@@ -387,8 +387,8 @@ void search(	const uint state0, const uint state1, const uint state2, const uint
 	W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) -
 	W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) -
 		(-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64)));
 		(-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64)));
 
 
-#define FOUND (0x80)
-#define NFLAG (0x7F)
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
 
 
 #ifdef VECTORS4
 #ifdef VECTORS4
 	bool result = W[117].x & W[117].y & W[117].z & W[117].w;
 	bool result = W[117].x & W[117].y & W[117].z & W[117].w;

+ 2 - 2
poclbm120327.cl → poclbm120724.cl

@@ -1311,8 +1311,8 @@ Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
 Vals[1]+=K[59];
 Vals[1]+=K[59];
 Vals[1]+=Vals[5];
 Vals[1]+=Vals[5];
 
 
-#define FOUND (0x80)
-#define NFLAG (0x7F)
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
 
 
 #if defined(VECTORS2) || defined(VECTORS4)
 #if defined(VECTORS2) || defined(VECTORS4)
 	Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
 	Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);

+ 453 - 0
scrypt.c

@@ -0,0 +1,453 @@
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */
+
+#include "config.h"
+#include "miner.h"
+
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+
+#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
+
+typedef struct SHA256Context {
+	uint32_t state[8];
+	uint32_t buf[16];
+} SHA256_CTX;
+
+/*
+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
+ */
+static inline void
+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
+{
+	uint32_t i;
+
+	for (i = 0; i < len; i++)
+		dst[i] = byteswap(src[i]);
+}
+
+/* Elementary functions used by SHA256 */
+#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define SHR(x, n)	(x >> n)
+#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
+#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/* SHA256 round function */
+#define RND(a, b, c, d, e, f, g, h, k)			\
+	t0 = h + S1(e) + Ch(e, f, g) + k;		\
+	t1 = S0(a) + Maj(a, b, c);			\
+	d += t0;					\
+	h  = t0 + t1;
+
+/* Adjusted round function for rotating state */
+#define RNDr(S, W, i, k)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i] + k)
+
+/*
+ * SHA256 block compression function.  The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
+{
+	uint32_t W[64];
+	uint32_t S[8];
+	uint32_t t0, t1;
+	int i;
+
+	/* 1. Prepare message schedule W. */
+	if(swap)
+		for (i = 0; i < 16; i++)
+			W[i] = byteswap(block[i]);
+	else
+		memcpy(W, block, 64);
+	for (i = 16; i < 64; i += 2) {
+		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
+		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
+	}
+
+	/* 2. Initialize working variables. */
+	memcpy(S, state, 32);
+
+	/* 3. Mix. */
+	RNDr(S, W, 0, 0x428a2f98);
+	RNDr(S, W, 1, 0x71374491);
+	RNDr(S, W, 2, 0xb5c0fbcf);
+	RNDr(S, W, 3, 0xe9b5dba5);
+	RNDr(S, W, 4, 0x3956c25b);
+	RNDr(S, W, 5, 0x59f111f1);
+	RNDr(S, W, 6, 0x923f82a4);
+	RNDr(S, W, 7, 0xab1c5ed5);
+	RNDr(S, W, 8, 0xd807aa98);
+	RNDr(S, W, 9, 0x12835b01);
+	RNDr(S, W, 10, 0x243185be);
+	RNDr(S, W, 11, 0x550c7dc3);
+	RNDr(S, W, 12, 0x72be5d74);
+	RNDr(S, W, 13, 0x80deb1fe);
+	RNDr(S, W, 14, 0x9bdc06a7);
+	RNDr(S, W, 15, 0xc19bf174);
+	RNDr(S, W, 16, 0xe49b69c1);
+	RNDr(S, W, 17, 0xefbe4786);
+	RNDr(S, W, 18, 0x0fc19dc6);
+	RNDr(S, W, 19, 0x240ca1cc);
+	RNDr(S, W, 20, 0x2de92c6f);
+	RNDr(S, W, 21, 0x4a7484aa);
+	RNDr(S, W, 22, 0x5cb0a9dc);
+	RNDr(S, W, 23, 0x76f988da);
+	RNDr(S, W, 24, 0x983e5152);
+	RNDr(S, W, 25, 0xa831c66d);
+	RNDr(S, W, 26, 0xb00327c8);
+	RNDr(S, W, 27, 0xbf597fc7);
+	RNDr(S, W, 28, 0xc6e00bf3);
+	RNDr(S, W, 29, 0xd5a79147);
+	RNDr(S, W, 30, 0x06ca6351);
+	RNDr(S, W, 31, 0x14292967);
+	RNDr(S, W, 32, 0x27b70a85);
+	RNDr(S, W, 33, 0x2e1b2138);
+	RNDr(S, W, 34, 0x4d2c6dfc);
+	RNDr(S, W, 35, 0x53380d13);
+	RNDr(S, W, 36, 0x650a7354);
+	RNDr(S, W, 37, 0x766a0abb);
+	RNDr(S, W, 38, 0x81c2c92e);
+	RNDr(S, W, 39, 0x92722c85);
+	RNDr(S, W, 40, 0xa2bfe8a1);
+	RNDr(S, W, 41, 0xa81a664b);
+	RNDr(S, W, 42, 0xc24b8b70);
+	RNDr(S, W, 43, 0xc76c51a3);
+	RNDr(S, W, 44, 0xd192e819);
+	RNDr(S, W, 45, 0xd6990624);
+	RNDr(S, W, 46, 0xf40e3585);
+	RNDr(S, W, 47, 0x106aa070);
+	RNDr(S, W, 48, 0x19a4c116);
+	RNDr(S, W, 49, 0x1e376c08);
+	RNDr(S, W, 50, 0x2748774c);
+	RNDr(S, W, 51, 0x34b0bcb5);
+	RNDr(S, W, 52, 0x391c0cb3);
+	RNDr(S, W, 53, 0x4ed8aa4a);
+	RNDr(S, W, 54, 0x5b9cca4f);
+	RNDr(S, W, 55, 0x682e6ff3);
+	RNDr(S, W, 56, 0x748f82ee);
+	RNDr(S, W, 57, 0x78a5636f);
+	RNDr(S, W, 58, 0x84c87814);
+	RNDr(S, W, 59, 0x8cc70208);
+	RNDr(S, W, 60, 0x90befffa);
+	RNDr(S, W, 61, 0xa4506ceb);
+	RNDr(S, W, 62, 0xbef9a3f7);
+	RNDr(S, W, 63, 0xc67178f2);
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+}
+
+static inline void
+SHA256_InitState(uint32_t * state)
+{
+	/* Magic initialization constants */
+	state[0] = 0x6A09E667;
+	state[1] = 0xBB67AE85;
+	state[2] = 0x3C6EF372;
+	state[3] = 0xA54FF53A;
+	state[4] = 0x510E527F;
+	state[5] = 0x9B05688C;
+	state[6] = 0x1F83D9AB;
+	state[7] = 0x5BE0CD19;
+}
+
+static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
+static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300};
+
+/**
+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
+ * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
+ */
+static inline void
+PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf)
+{
+	SHA256_CTX PShictx, PShoctx;
+	uint32_t tstate[8];
+	uint32_t ihash[8];
+	uint32_t i;
+	uint32_t pad[16];
+	
+	static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
+
+	/* If Klen > 64, the key is really SHA256(K). */
+	SHA256_InitState(tstate);
+	SHA256_Transform(tstate, passwd, 1);
+	memcpy(pad, passwd+16, 16);
+	memcpy(pad+4, passwdpad, 48);
+	SHA256_Transform(tstate, pad, 1);
+	memcpy(ihash, tstate, 32);
+
+	SHA256_InitState(PShictx.state);
+	for (i = 0; i < 8; i++)
+		pad[i] = ihash[i] ^ 0x36363636;
+	for (; i < 16; i++)
+		pad[i] = 0x36363636;
+	SHA256_Transform(PShictx.state, pad, 0);
+	SHA256_Transform(PShictx.state, passwd, 1);
+	be32enc_vect(PShictx.buf, passwd+16, 4);
+	be32enc_vect(PShictx.buf+5, innerpad, 11);
+
+	SHA256_InitState(PShoctx.state);
+	for (i = 0; i < 8; i++)
+		pad[i] = ihash[i] ^ 0x5c5c5c5c;
+	for (; i < 16; i++)
+		pad[i] = 0x5c5c5c5c;
+	SHA256_Transform(PShoctx.state, pad, 0);
+	memcpy(PShoctx.buf+8, outerpad, 32);
+
+	/* Iterate through the blocks. */
+	for (i = 0; i < 4; i++) {
+		uint32_t istate[8];
+		uint32_t ostate[8];
+		
+		memcpy(istate, PShictx.state, 32);
+		PShictx.buf[4] = i + 1;
+		SHA256_Transform(istate, PShictx.buf, 0);
+		memcpy(PShoctx.buf, istate, 32);
+
+		memcpy(ostate, PShoctx.state, 32);
+		SHA256_Transform(ostate, PShoctx.buf, 0);
+		be32enc_vect(buf+i*8, ostate, 8);
+	}
+}
+
+
+static inline uint32_t
+PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt)
+{
+	uint32_t tstate[8];
+	uint32_t ostate[8];
+	uint32_t ihash[8];
+	uint32_t i;
+
+	/* Compute HMAC state after processing P and S. */
+	uint32_t pad[16];
+	
+	static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
+
+	/* If Klen > 64, the key is really SHA256(K). */
+	SHA256_InitState(tstate);
+	SHA256_Transform(tstate, passwd, 1);
+	memcpy(pad, passwd+16, 16);
+	memcpy(pad+4, passwdpad, 48);
+	SHA256_Transform(tstate, pad, 1);
+	memcpy(ihash, tstate, 32);
+
+	SHA256_InitState(ostate);
+	for (i = 0; i < 8; i++)
+		pad[i] = ihash[i] ^ 0x5c5c5c5c;
+	for (; i < 16; i++)
+		pad[i] = 0x5c5c5c5c;
+	SHA256_Transform(ostate, pad, 0);
+
+	SHA256_InitState(tstate);
+	for (i = 0; i < 8; i++)
+		pad[i] = ihash[i] ^ 0x36363636;
+	for (; i < 16; i++)
+		pad[i] = 0x36363636;
+	SHA256_Transform(tstate, pad, 0);
+	SHA256_Transform(tstate, salt, 1);
+	SHA256_Transform(tstate, salt+16, 1);
+	SHA256_Transform(tstate, ihash_finalblk, 0);
+	memcpy(pad, tstate, 32);
+	memcpy(pad+8, outerpad, 32);
+
+	/* Feed the inner hash to the outer SHA256 operation. */
+	SHA256_Transform(ostate, pad, 0);
+	/* Finish the outer SHA256 operation. */
+	return byteswap(ostate[7]);
+}
+
+
+/**
+ * salsa20_8(B):
+ * Apply the salsa20/8 core to the provided block.
+ */
+static inline void
+salsa20_8(uint32_t B[16], const uint32_t Bx[16])
+{
+	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
+	size_t i;
+
+	x00 = (B[ 0] ^= Bx[ 0]);
+	x01 = (B[ 1] ^= Bx[ 1]);
+	x02 = (B[ 2] ^= Bx[ 2]);
+	x03 = (B[ 3] ^= Bx[ 3]);
+	x04 = (B[ 4] ^= Bx[ 4]);
+	x05 = (B[ 5] ^= Bx[ 5]);
+	x06 = (B[ 6] ^= Bx[ 6]);
+	x07 = (B[ 7] ^= Bx[ 7]);
+	x08 = (B[ 8] ^= Bx[ 8]);
+	x09 = (B[ 9] ^= Bx[ 9]);
+	x10 = (B[10] ^= Bx[10]);
+	x11 = (B[11] ^= Bx[11]);
+	x12 = (B[12] ^= Bx[12]);
+	x13 = (B[13] ^= Bx[13]);
+	x14 = (B[14] ^= Bx[14]);
+	x15 = (B[15] ^= Bx[15]);
+	for (i = 0; i < 8; i += 2) {
+#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
+		/* Operate on columns. */
+		x04 ^= R(x00+x12, 7);	x09 ^= R(x05+x01, 7);	x14 ^= R(x10+x06, 7);	x03 ^= R(x15+x11, 7);
+		x08 ^= R(x04+x00, 9);	x13 ^= R(x09+x05, 9);	x02 ^= R(x14+x10, 9);	x07 ^= R(x03+x15, 9);
+		x12 ^= R(x08+x04,13);	x01 ^= R(x13+x09,13);	x06 ^= R(x02+x14,13);	x11 ^= R(x07+x03,13);
+		x00 ^= R(x12+x08,18);	x05 ^= R(x01+x13,18);	x10 ^= R(x06+x02,18);	x15 ^= R(x11+x07,18);
+
+		/* Operate on rows. */
+		x01 ^= R(x00+x03, 7);	x06 ^= R(x05+x04, 7);	x11 ^= R(x10+x09, 7);	x12 ^= R(x15+x14, 7);
+		x02 ^= R(x01+x00, 9);	x07 ^= R(x06+x05, 9);	x08 ^= R(x11+x10, 9);	x13 ^= R(x12+x15, 9);
+		x03 ^= R(x02+x01,13);	x04 ^= R(x07+x06,13);	x09 ^= R(x08+x11,13);	x14 ^= R(x13+x12,13);
+		x00 ^= R(x03+x02,18);	x05 ^= R(x04+x07,18);	x10 ^= R(x09+x08,18);	x15 ^= R(x14+x13,18);
+#undef R
+	}
+	B[ 0] += x00;
+	B[ 1] += x01;
+	B[ 2] += x02;
+	B[ 3] += x03;
+	B[ 4] += x04;
+	B[ 5] += x05;
+	B[ 6] += x06;
+	B[ 7] += x07;
+	B[ 8] += x08;
+	B[ 9] += x09;
+	B[10] += x10;
+	B[11] += x11;
+	B[12] += x12;
+	B[13] += x13;
+	B[14] += x14;
+	B[15] += x15;
+}
+
+/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
+   scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
+ */
+static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad)
+{
+	uint32_t * V;
+	uint32_t X[32];
+	uint32_t i;
+	uint32_t j;
+	uint32_t k;
+	uint64_t *p1, *p2;
+
+	p1 = (uint64_t *)X;
+	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
+
+	PBKDF2_SHA256_80_128(input, X);
+
+	for (i = 0; i < 1024; i += 2) {
+		memcpy(&V[i * 32], X, 128);
+
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
+
+		memcpy(&V[(i + 1) * 32], X, 128);
+
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
+	}
+	for (i = 0; i < 1024; i += 2) {
+		j = X[16] & 1023;
+		p2 = (uint64_t *)(&V[j * 32]);
+		for(k = 0; k < 16; k++)
+			p1[k] ^= p2[k];
+
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
+
+		j = X[16] & 1023;
+		p2 = (uint64_t *)(&V[j * 32]);
+		for(k = 0; k < 16; k++)
+			p1[k] ^= p2[k];
+
+		salsa20_8(&X[0], &X[16]);
+		salsa20_8(&X[16], &X[0]);
+	}
+
+	return PBKDF2_SHA256_80_128_32(input, X);
+}
+
+bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsigned char *pdata,
+	unsigned char *phash1, unsigned char *phash,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, uint32_t *last_nonce,
+	uint32_t n)
+{
+	uint32_t *nonce = (uint32_t *)(pdata + 76);
+	unsigned char *scratchbuf;
+	uint32_t data[20];
+	uint32_t tmp_hash7;
+	uint32_t Htarg = ((const uint32_t *)ptarget)[7];
+	bool ret = false;
+	int i;
+
+	be32enc_vect(data, (const uint32_t *)pdata, 19);
+
+	scratchbuf = malloc(131583);
+	if (unlikely(!scratchbuf)) {
+		applog(LOG_ERR, "Failed to malloc scratchbuf in scanhash_scrypt");
+		return ret;
+	}
+
+	while(1) {
+		*nonce = ++n;
+		data[19] = n;
+		tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf);
+
+		if (unlikely(tmp_hash7 <= Htarg)) {
+			((uint32_t *)pdata)[19] = byteswap(n);
+			*last_nonce = n;
+			ret = true;
+			break;
+		}
+
+		if (unlikely((n >= max_nonce) || thr->work_restart)) {
+			*last_nonce = n;
+			break;
+		}
+	}
+out_ret:
+	free(scratchbuf);;
+	return ret;
+}
+

+ 757 - 0
scrypt120724.cl

@@ -0,0 +1,757 @@
+#define rotl(x,y) rotate(x,y)
+#define Ch(x,y,z) bitselect(z,y,x)
+#define Maj(x,y,z) Ch((x^z),y,z)
+
+#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U))
+
+#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
+#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
+#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
+#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))
+
+#define RND(a, b, c, d, e, f, g, h, k)			\
+	h += Tr1(e) + Ch(e, f, g) + k;		\
+	d += h;					\
+	h += Tr2(a) + Maj(a, b, c);
+
+void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
+{
+	uint4 S0 = *state0;
+	uint4 S1 = *state1;
+	
+#define A S0.x
+#define B S0.y
+#define C S0.z
+#define D S0.w
+#define E S1.x
+#define F S1.y
+#define G S1.z
+#define H S1.w
+
+	uint4 W[4];
+
+	W[ 0].x = block0.x;
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U);
+	W[ 0].y = block0.y;
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U);
+	W[ 0].z = block0.z;
+	RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU);
+	W[ 0].w = block0.w;
+	RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U);
+
+	W[ 1].x = block1.x;
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU);
+	W[ 1].y = block1.y;
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U);
+	W[ 1].z = block1.z;
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U);
+	W[ 1].w = block1.w;
+	RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U);
+
+	W[ 2].x = block2.x;
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U);
+	W[ 2].y = block2.y;
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U);
+	W[ 2].z = block2.z;
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU);
+	W[ 2].w = block2.w;
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U);
+
+	W[ 3].x = block3.x;
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U);
+	W[ 3].y = block3.y;
+	RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU);
+	W[ 3].z = block3.z;
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U);
+	W[ 3].w = block3.w;
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U);
+	
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+
+	*state0 += S0;
+	*state1 += S1;
+}
+
+void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
+{
+#define A (*state0).x
+#define B (*state0).y
+#define C (*state0).z
+#define D (*state0).w
+#define E (*state1).x
+#define F (*state1).y
+#define G (*state1).z
+#define H (*state1).w
+
+	uint4 W[4];
+
+	W[0].x = block0.x;
+	D=0x98c7e2a2U+W[0].x;
+	H=0xfc08884dU+W[0].x;
+
+	W[0].y = block0.y;
+	C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y;
+	G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U);
+
+	W[0].z = block0.z;
+	B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z;
+	F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U);
+
+	W[0].w = block0.w;
+	A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; 
+	E=0x95F61999U+A+Tr2(F)+Maj(F,G,H);
+
+	W[1].x = block1.x;
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU);
+	W[1].y = block1.y;
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U);
+	W[1].z = block1.z;
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U);
+	W[1].w = block1.w;
+	RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U);
+	
+	W[2].x = block2.x;
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U);
+	W[2].y = block2.y;
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U);
+	W[2].z = block2.z;
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU);
+	W[2].w = block2.w;
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U);
+	
+	W[3].x = block3.x;
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U);
+	W[3].y = block3.y;
+	RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU);
+	W[3].z = block3.z;
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U);
+	W[3].w = block3.w;
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U);
+	
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+
+	*state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU);
+	*state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U);
+}
+
+__constant uint fixedW[64] =
+{
+	0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
+	0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
+	0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
+	0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
+	0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
+	0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
+	0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
+};
+
+void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
+{
+	uint4 S0 = *state0;
+	uint4 S1 = *state1;
+
+#define A S0.x
+#define B S0.y
+#define C S0.z
+#define D S0.w
+#define E S1.x
+#define F S1.y
+#define G S1.z
+#define H S1.w
+
+	RND(A,B,C,D,E,F,G,H, fixedW[0]);
+	RND(H,A,B,C,D,E,F,G, fixedW[1]);
+	RND(G,H,A,B,C,D,E,F, fixedW[2]);
+	RND(F,G,H,A,B,C,D,E, fixedW[3]);
+	RND(E,F,G,H,A,B,C,D, fixedW[4]);
+	RND(D,E,F,G,H,A,B,C, fixedW[5]);
+	RND(C,D,E,F,G,H,A,B, fixedW[6]);
+	RND(B,C,D,E,F,G,H,A, fixedW[7]);
+	RND(A,B,C,D,E,F,G,H, fixedW[8]);
+	RND(H,A,B,C,D,E,F,G, fixedW[9]);
+	RND(G,H,A,B,C,D,E,F, fixedW[10]);
+	RND(F,G,H,A,B,C,D,E, fixedW[11]);
+	RND(E,F,G,H,A,B,C,D, fixedW[12]);
+	RND(D,E,F,G,H,A,B,C, fixedW[13]);
+	RND(C,D,E,F,G,H,A,B, fixedW[14]);
+	RND(B,C,D,E,F,G,H,A, fixedW[15]);
+	RND(A,B,C,D,E,F,G,H, fixedW[16]);
+	RND(H,A,B,C,D,E,F,G, fixedW[17]);
+	RND(G,H,A,B,C,D,E,F, fixedW[18]);
+	RND(F,G,H,A,B,C,D,E, fixedW[19]);
+	RND(E,F,G,H,A,B,C,D, fixedW[20]);
+	RND(D,E,F,G,H,A,B,C, fixedW[21]);
+	RND(C,D,E,F,G,H,A,B, fixedW[22]);
+	RND(B,C,D,E,F,G,H,A, fixedW[23]);
+	RND(A,B,C,D,E,F,G,H, fixedW[24]);
+	RND(H,A,B,C,D,E,F,G, fixedW[25]);
+	RND(G,H,A,B,C,D,E,F, fixedW[26]);
+	RND(F,G,H,A,B,C,D,E, fixedW[27]);
+	RND(E,F,G,H,A,B,C,D, fixedW[28]);
+	RND(D,E,F,G,H,A,B,C, fixedW[29]);
+	RND(C,D,E,F,G,H,A,B, fixedW[30]);
+	RND(B,C,D,E,F,G,H,A, fixedW[31]);
+	RND(A,B,C,D,E,F,G,H, fixedW[32]);
+	RND(H,A,B,C,D,E,F,G, fixedW[33]);
+	RND(G,H,A,B,C,D,E,F, fixedW[34]);
+	RND(F,G,H,A,B,C,D,E, fixedW[35]);
+	RND(E,F,G,H,A,B,C,D, fixedW[36]);
+	RND(D,E,F,G,H,A,B,C, fixedW[37]);
+	RND(C,D,E,F,G,H,A,B, fixedW[38]);
+	RND(B,C,D,E,F,G,H,A, fixedW[39]);
+	RND(A,B,C,D,E,F,G,H, fixedW[40]);
+	RND(H,A,B,C,D,E,F,G, fixedW[41]);
+	RND(G,H,A,B,C,D,E,F, fixedW[42]);
+	RND(F,G,H,A,B,C,D,E, fixedW[43]);
+	RND(E,F,G,H,A,B,C,D, fixedW[44]);
+	RND(D,E,F,G,H,A,B,C, fixedW[45]);
+	RND(C,D,E,F,G,H,A,B, fixedW[46]);
+	RND(B,C,D,E,F,G,H,A, fixedW[47]);
+	RND(A,B,C,D,E,F,G,H, fixedW[48]);
+	RND(H,A,B,C,D,E,F,G, fixedW[49]);
+	RND(G,H,A,B,C,D,E,F, fixedW[50]);
+	RND(F,G,H,A,B,C,D,E, fixedW[51]);
+	RND(E,F,G,H,A,B,C,D, fixedW[52]);
+	RND(D,E,F,G,H,A,B,C, fixedW[53]);
+	RND(C,D,E,F,G,H,A,B, fixedW[54]);
+	RND(B,C,D,E,F,G,H,A, fixedW[55]);
+	RND(A,B,C,D,E,F,G,H, fixedW[56]);
+	RND(H,A,B,C,D,E,F,G, fixedW[57]);
+	RND(G,H,A,B,C,D,E,F, fixedW[58]);
+	RND(F,G,H,A,B,C,D,E, fixedW[59]);
+	RND(E,F,G,H,A,B,C,D, fixedW[60]);
+	RND(D,E,F,G,H,A,B,C, fixedW[61]);
+	RND(C,D,E,F,G,H,A,B, fixedW[62]);
+	RND(B,C,D,E,F,G,H,A, fixedW[63]);
+	
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+	*state0 += S0;
+	*state1 += S1;
+}
+
+void shittify(uint4 B[8])
+{
+	uint4 tmp[4];
+	tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);
+	tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);
+	tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);
+	tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);
+	
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		B[i] = EndianSwap(tmp[i]);
+
+	tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);
+	tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);
+	tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);
+	tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);
+	
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		B[i+4] = EndianSwap(tmp[i]);
+}
+
+void unshittify(uint4 B[8])
+{
+	uint4 tmp[4];
+	tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);
+	tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);
+	tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);
+	tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);
+	
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		B[i] = EndianSwap(tmp[i]);
+
+	tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);
+	tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);
+	tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);
+	tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);
+	
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		B[i+4] = EndianSwap(tmp[i]);
+}
+
+void salsa(uint4 B[8])
+{
+	uint4 w[4];
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		w[i] = (B[i]^=B[i+4]);
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+	{
+		w[0] ^= rotl(w[3]     +w[2]     , 7U);
+		w[1] ^= rotl(w[0]     +w[3]     , 9U);
+		w[2] ^= rotl(w[1]     +w[0]     ,13U);
+		w[3] ^= rotl(w[2]     +w[1]     ,18U);
+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+	}
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		w[i] = (B[i+4]^=(B[i]+=w[i]));
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+	{
+		w[0] ^= rotl(w[3]     +w[2]     , 7U);
+		w[1] ^= rotl(w[0]     +w[3]     , 9U);
+		w[2] ^= rotl(w[1]     +w[0]     ,13U);
+		w[3] ^= rotl(w[2]     +w[1]     ,18U);
+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+	}
+
+#pragma unroll
+	for(uint i=0; i<4; ++i)
+		B[i+4] += w[i];
+}
+
+#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
+#define CO Coord(z,x,y)
+
+void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
+{
+	shittify(X);
+	const uint zSIZE = 8;
+	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
+	const uint xSIZE = CONCURRENT_THREADS;
+	uint x = get_global_id(0)%xSIZE;
+
+	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
+	{
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			lookup[CO] = X[z];
+		for(uint i=0; i<LOOKUP_GAP; ++i) 
+			salsa(X);
+	}
+#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)
+	{
+		uint y = (1024/LOOKUP_GAP);
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			lookup[CO] = X[z];
+		for(uint i=0; i<1024%LOOKUP_GAP; ++i)
+			salsa(X); 
+	}
+#endif
+	for (uint i=0; i<1024; ++i) 
+	{
+		uint4 V[8];
+		uint j = X[7].x & 0x3FF;
+		uint y = (j/LOOKUP_GAP);
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			V[z] = lookup[CO];
+
+#if (LOOKUP_GAP == 1)
+#elif (LOOKUP_GAP == 2)
+		if (j&1)
+			salsa(V);
+#else
+		uint val = j%LOOKUP_GAP;
+		for (uint z=0; z<val; ++z) 
+			salsa(V);
+#endif
+
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			X[z] ^= V[z];
+		salsa(X);
+	}
+	unshittify(X);
+}
+
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(__global const uint4 * restrict input,
+__global uint*restrict output, __global uint4*restrict padcache,
+const uint4 midstate0, const uint4 midstate16, const uint target)
+{
+	uint gid = get_global_id(0);
+	uint4 X[8];
+	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
+	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
+	uint4 pad0 = midstate0, pad1 = midstate16;
+
+	SHA256(&pad0,&pad1, data, (uint4)(0x80000000U,0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x280));
+	SHA256_fresh(&ostate0,&ostate1, pad0^0x5C5C5C5CU, pad1^0x5C5C5C5CU, 0x5C5C5C5CU, 0x5C5C5C5CU);
+	SHA256_fresh(&tstate0,&tstate1, pad0^0x36363636U, pad1^0x36363636U, 0x36363636U, 0x36363636U);
+
+	tmp0 = tstate0;
+	tmp1 = tstate1;
+	SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);
+
+#pragma unroll
+	for (uint i=0; i<4; i++) 
+	{
+		pad0 = tstate0;
+		pad1 = tstate1;
+		X[i*2 ] = ostate0;
+		X[i*2+1] = ostate1;
+
+		SHA256(&pad0,&pad1, data, (uint4)(i+1,0x80000000U,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x4a0U));
+		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U));
+	}
+	scrypt_core(X,padcache);
+	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
+	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
+	SHA256_fixed(&tmp0,&tmp1);
+	SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U));
+
+	bool found = (EndianSwap(ostate1.w) <= target);
+	if (found)
+		output[FOUND] = output[NFLAG & gid] = gid;
+}
+
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
+ * 2012 Con Kolivas.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */