13 years ago · 8b557bd1d1
--- a/Makefile.am
+++ b/Makefile.am
@@ -10,7 +10,7 @@ endif
 
															 EXTRA_DIST	= example.conf m4/gnulib-cache.m4 linux-usb-bfgminer \
														
 
															 		  ADL_SDK/readme.txt api-example.php miner.php	\
														
 
															 		  API.class API.java api-example.c windows-build.txt \
														
 
															-		  bitstreams/* API-README FPGA-README
														
 
															+		  bitstreams/* API-README FPGA-README SCRYPT-README
														
 
															 SUBDIRS		= lib compat ccan
														
@@ -56,6 +56,10 @@ bfgminer_SOURCES	+= \
 
															 # the CPU portion extracted from original main.c
														
 
															 bfgminer_SOURCES += driver-cpu.h driver-cpu.c
														
 
															+if HAS_SCRYPT
														
 
															+bfgminer_SOURCES += scrypt.c
														
 
															+endif
														
 
															+
														
 
															 if HAS_YASM
														
 
															 AM_CFLAGS	= -DHAS_YASM
														
 
															 if HAVE_x86_64
														
--- a/NEWS
+++ b/NEWS
@@ -1,3 +1,76 @@
 
															+Version 2.6.0 - July 29, 2012
														
 
															+
														
 
															+- Add scrypt documentation in the form of a separate readme.
														
 
															+- Fix build error without scrypt enabled.
														
 
															+- Limit thread concurrency for scrypt to 5xshaders if shaders is specified.
														
 
															+- Simplify repeated use of gpus[gpu]. in ocl.c
														
 
															+- Find the nearest power of 2 maximum alloc size for the scrypt buffer that can
														
 
															+successfully be allocated and is large enough to accomodate the thread
														
 
															+concurrency chosen, thus mapping it to an intensity.
														
 
															+- Don't make opt_scrypt mandatory blocking with opencl code.
														
 
															+- Update kernel versions reflecting changes in the API.
														
 
															+- Make the thread concurrency and lookup gap options hidden on the command line
														
 
															+and autotune parameters with a newly parsed --shaders option.
														
 
															+- Fix target testing with scrypt kernel as it would have been missing shares
														
 
															+below target.
														
 
															+- Always create the largest possible padbuffer for scrypt kernels even if not
														
 
															+needed for thread_concurrency, giving us some headroom for intensity levels.
														
 
															+- Use the detected maximum allocable memory on a GPU to determine the optimal
														
 
															+scrypt settings when lookup_gap and thread_concurrency parameters are not given.
														
 
															+- Check the maximum allocable memory size per opencl device.
														
 
															+- Add debugging output if buffer allocation fails for scrypt and round up
														
 
															+bufsize to a multiple of 256.
														
 
															+- Nonce testing for btc got screwed up, leading to no accepted shares. Fix it.
														
 
															+- Display size of scrypt buffer used in debug.
														
 
															+- Allow intensities up to 20 if scrypt is compiled in.
														
 
															+- Add name to scrypt kernel copyright.
														
 
															+- Allow lookup gap and thread concurrency to be passed per device and store
														
 
															+details in kernel binary filename.
														
 
															+- Ignore negative intensities for scrypt.
														
 
															+- Change the scale of intensity for scrypt kernel and fix a build warning.
														
 
															+- Correct target value passed to scrypt kernel.
														
 
															+- Use 256 output slots for kernels to allow 1 for each worksize.
														
 
															+- Test the target in the actual scrypt kernel itself saving further
														
 
															+calculations.
														
 
															+- Reinstate GPU only opencl device detection.
														
 
															+- Decrease lookup gap to 1. Does not seem to help in any way being 2.
														
 
															+- Fix build.
														
 
															+- Make pad0 and pad1 local variable in scrypt kernel.
														
 
															+- Constify input variable in scrypt kernel.
														
 
															+- Send correct values to scrypt kernel to get it finally working.
														
 
															+- Create command queue before compiling program in opencl.
														
 
															+- Fix external scrypt algo missing.
														
 
															+- Limit scrypt to 1 vector.
														
 
															+- Handle KL_SCRYPT in config write.
														
 
															+- Get rid of stuff.
														
 
															+- Don't enqueuewrite buffer at all for pad8 and pass work details around for
														
 
															+scrypt in dev_blk.
														
 
															+- Set the correct data for cldata and prepare for pad8 fixes.
														
 
															+- Get rid of spaces in arrays in scrypt kernel.
														
 
															+- Start with smaller amount of hashes in cpu mining to enable scrypt to return
														
 
															+today sometime.
														
 
															+- Free the scratchbuf memory allocated in scrypt and don't check if CPUs are
														
 
															+sick since they can't be. Prepare for khash hash rates in display.
														
 
															+- Add cpumining capability for scrypt.
														
 
															+- Set scrypt settings and buffer size in ocl.c code to be future modifiable.
														
 
															+- Cope with when we cannot set intensity low enough to meet dynamic interval by
														
 
															+inducing a forced sleep.
														
 
															+- Make dynamic and scrypt opencl calls blocking.
														
 
															+- Fix nonce submission code for scrypt.
														
 
															+- Make sure goffset is set for scrypt and drop padbuffer8 to something
														
 
															+manageable for now.
														
 
															+- Set up buffer8 for scrypt.
														
 
															+- Build fix for opt scrypt.
														
 
															+- Don't check postcalc nonce with sha256 in scrypt.
														
 
															+- Don't test nonce with sha and various fixes for scrypt.
														
 
															+- Make scrypt buffers and midstate compatible.
														
 
															+- Use specific output array entries in scrypt kernel.
														
 
															+- Provide initial support for the scrypt kernel to compile with and mine scrypt
														
 
															+with the --scrypt option.
														
 
															+- Enable completely compiling scrypt out.
														
 
															+- Begin import of scrypt opencl kernel from reaper.
														
 
															+
														
 
															+
														
 
															 BFGMiner Version 2.5.2 - July 29, 2012
														
 
															 - Limit total number of curls recruited per pool to the number of mining threads
														
@@ -153,6 +226,8 @@ mt_disable function.
 
															 - Use standard cfsetispeed/cfsetospeed to set baud rate on *nix
														
 
															 - miner.php split() flagged deprecated in PHP 5.3.0
														
 
															 - Bugfix: Use nmsleep instead of restart_wait, so we always wait the full time
														
 
															+- Make long timeout 10seconds on bitforce for when usleep or nanosleep just
														
 
															+can't be accurate...
														
 
															 BFGMiner Version 2.5.0 - July 7, 2012
														
--- a/README
+++ b/README
@@ -152,6 +152,7 @@ Options for both config file and command line:
 
															 --scan-time|-s <arg> Upper bound on time spent scanning current work, in seconds (default: 60)
														
 
															 --sched-start <arg> Set a time of day in HH:MM to start mining (a once off without a stop time)
														
 
															 --sched-stop <arg>  Set a time of day in HH:MM to stop mining (will quit without a start time)
														
 
															+--scrypt            Use the scrypt algorithm for mining (non-bitcoin)
														
 
															 --sharelog <arg>    Append share log to file
														
 
															 --shares <arg>      Quit after mining N shares (default: unlimited)
														
 
															 --socks-proxy <arg> Set socks4 proxy (host:port)
														
@@ -198,6 +199,14 @@ GPU only options:
 
															 --worksize|-w <arg> Override detected optimal worksize - one value or comma separated list
														
 
															+SCRYPT only options:
														
 
															+
														
 
															+--lookup-gap <arg>  Set GPU lookup gap for scrypt mining, comma separated
														
 
															+--thread-concurrency <arg> Set GPU thread concurrency for scrypt mining, comma separated
														
 
															+
														
 
															+See SCRYPT-README for more information regarding (non-bitcoin) scrypt mining.
														
 
															+
														
 
															+
														
 
															 FPGA mining boards(BitForce, Icarus, ModMiner, Ztex) only options:
														
 
															 --scan-serial|-S <arg> Serial port to probe for FPGA mining device
														
--- a/SCRYPT-README
+++ b/SCRYPT-README
@@ -0,0 +1,131 @@
 
															+If you wish to donate to the author of scrypt support, Con Kolivas, please send
														
 
															+your donations to:
														
 
															+
														
 
															+Bitcoin : 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ
														
 
															+Litecoin: Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH
														
 
															+
														
 
															+---
														
 
															+
														
 
															+
														
 
															+Scrypt mining for GPU is completely different to sha256 used for bitcoin
														
 
															+mining. It has very different requirements to bitcoin mining and is a
														
 
															+lot more complicated to get working well. Note that it is a ram dependent
														
 
															+workload, and requires you to have enough system ram as well as fast enough
														
 
															+GPU ram.
														
 
															+
														
 
															+There are 5 main parameters to tuning scrypt, 2 of which you MUST set, and
														
 
															+the others are optional for further fine tuning. When you start scrypt mining
														
 
															+with the --scrypt option, BFGMiner will fail IN RANDOM WAYS. They are all due
														
 
															+to parameters being outside what the GPU can cope with. Not giving BFGMiner a
														
 
															+hint as to your GPU type, it will hardly ever perform well.
														
 
															+
														
 
															+
														
 
															+Step 1 on linux:
														
 
															+export GPU_MAX_ALLOC_PERCENT=100
														
 
															+If you do not do this, you may find it impossible to scrypt mine. You may find
														
 
															+a value of 40 is enough and increasing this further has little effect.
														
 
															+
														
 
															+export GPU_USE_SYNC_OBJECTS=1
														
 
															+may help CPU usage a little as well.
														
 
															+
														
 
															+--shaders XXX
														
 
															+
														
 
															+is a new option where you tell BFGMiner how many shaders your GPU has. This
														
 
															+helps BFGMiner try to choose some meaningful baseline parameters. Use this table
														
 
															+below to determine how many shaders your GPU has, and note that there are some
														
 
															+variants of these cards, and nvidia shaders are much much lower and virtually
														
 
															+pointless trying to mine on.
														
 
															+
														
 
															+GPU  Shaders
														
 
															+7750 512
														
 
															+7770 640
														
 
															+7850 1024
														
 
															+7870 1280
														
 
															+7950 1792
														
 
															+7970 2048
														
 
															+
														
 
															+6850 960
														
 
															+6870 1120
														
 
															+6950 1408
														
 
															+6970 1536
														
 
															+6990 (6970x2)
														
 
															+
														
 
															+6570 480
														
 
															+6670 480
														
 
															+6790 800
														
 
															+
														
 
															+6450 160
														
 
															+
														
 
															+5670 400
														
 
															+5750 720
														
 
															+5770 800
														
 
															+5830 1120
														
 
															+5850 1440
														
 
															+5870 1600
														
 
															+5970 (5870x2)
														
 
															+
														
 
															+These are only used as a rough guide for BFGMiner, and it is rare that this is
														
 
															+all you will need to set.
														
 
															+
														
 
															+
														
 
															+--intensity XX
														
 
															+
														
 
															+Just like in bitcoin mining, scrypt mining takes an intensity, however the
														
 
															+scale goes from 0 to 20 to mimic the "Aggression" used in mtrlt's reaper. The
														
 
															+reason this is crucial is that too high an intensity can actually be
														
 
															+disastrous with scrypt because it CAN run out of ram. Intensities over 13
														
 
															+start writing over the same ram and it is highly dependent on the GPU, but they
														
 
															+can start actually DECREASING your hashrate, or even worse, start producing
														
 
															+garbage with rejects skyrocketing.
														
 
															+
														
 
															+
														
 
															+Optional parameters to tune:
														
 
															+-g, --thread-concurrency, --lookup-gap
														
 
															+
														
 
															+-g:
														
 
															+Once you have found the optimal shaders and intensity, you can start increasing
														
 
															+the -g value till BFGMiner fails to start. Rarely will you be able to go over
														
 
															+about -g 4 and each increase in -g only increases hashrate slightly.
														
 
															+
														
 
															+--thread-concurrency:
														
 
															+This tunes the optimal size of work that scrypt can do. It is internally tuned
														
 
															+by BFGMiner to be the highest reasonable multiple of shaders that it can
														
 
															+allocate on your GPU. Ideally it should be a multiple of your shader count.
														
 
															+vliw5 architecture (R5XXX) would be best at 5x shaders, while VLIW4 (R6xxx and
														
 
															+R7xxx) are best at 4x. Setting thread concurrency overrides anything you put
														
 
															+into --shaders.
														
 
															+
														
 
															+--lookup-gap
														
 
															+This tunes a compromise between ram usage and performance. Performance peaks
														
 
															+at a gap of 2, but increasing the gap can save you some GPU ram, but almost
														
 
															+always at the cost of significant loss of hashrate. Setting lookup gap
														
 
															+overrides the default of 2, but BFGMiner will use the --shaders value to choose
														
 
															+a thread-concurrency if you haven't chosen one.
														
 
															+
														
 
															+
														
 
															+Overclocking for scrypt mining:
														
 
															+First of all, do not underclock your memory initially. Scrypt mining requires
														
 
															+memory speed and on most, but not all, GPUs, lowering memory speed lowers
														
 
															+mining performance.
														
 
															+
														
 
															+Second, absolute engine clock speeds do NOT correlate with hashrate. The ratio
														
 
															+of engine clock speed to memory matters, so if you set your memory to the
														
 
															+default value, and then start overclocking as you are running it, you should
														
 
															+find a sweet spot where the hashrate peaks and then it might actually drop if
														
 
															+you increase the engine clock speed further. Unless you wish to run with a
														
 
															+dynamic intensity, do not go over 13 without testing it while it's running to
														
 
															+see that it increases hashrate AND utility WITHOUT increasing your rejects.
														
 
															+
														
 
															+
														
 
															+Suggested values for 7970 for example:
														
 
															+export GPU_MAX_ALLOC_PERCENT=100
														
 
															+--shaders 2048 -g 5 --gpu-engine 1135 --gpu-memclock 1375
														
 
															+
														
 
															+
														
 
															+---
														
 
															+
														
 
															+If you wish to donate to the author of scrypt support, Con Kolivas, please send
														
 
															+your donations to:
														
 
															+
														
 
															+Bitcoin : 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ
														
 
															+Litecoin: Lc8TWMiKM7gRUrG8VB8pPNP1Yvt1SGZnoH
														
--- a/autogen.sh
+++ b/autogen.sh
@@ -1,12 +1,17 @@
 
															 #!/bin/sh
														
 
															+cwd="$PWD"
														
 
															+bs_dir="$(dirname $(readlink -f $0))"
														
 
															+rm -rf "${bs_dir}"/autom4te.cache
														
 
															+rm -f "${bs_dir}"/aclocal.m4 "${bs_dir}"/ltmain.sh
														
 
															-# You need autoconf 2.5x, preferably 2.57 or later
														
 
															-# You need automake 1.7 or later. 1.6 might work.
														
 
															-
														
 
															-set -e
														
 
															-
														
 
															-aclocal -I m4
														
 
															-autoheader
														
 
															-automake --add-missing --copy
														
 
															-autoconf
														
 
															-
														
 
															+echo 'Running autoreconf -if...'
														
 
															+autoreconf -if || exit 1
														
 
															+if test -z "$NOCONFIGURE" ; then
														
 
															+	echo 'Configuring...'
														
 
															+	cd "${bs_dir}" &> /dev/null
														
 
															+	test "$?" = "0" || e=1
														
 
															+	test "$cwd" != "$bs_dir" && cd "$bs_dir" &> /dev/null
														
 
															+	./configure $@
														
 
															+	test "$e" = "1" && exit 1
														
 
															+	cd "$cwd"
														
 
															+fi
														
--- a/configure.ac
+++ b/configure.ac
@@ -1,8 +1,8 @@
 
															 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
														
 
															 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
														
 
															 m4_define([v_maj], [2])
														
 
															-m4_define([v_min], [5])
														
 
															-m4_define([v_mic], [2])
														
 
															+m4_define([v_min], [6])
														
 
															+m4_define([v_mic], [0])
														
 
															 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
														
 
															 m4_define([v_ver], [v_maj.v_min.v_mic])
														
 
															 m4_define([lt_rev], m4_eval(v_maj + v_min))
														
@@ -197,6 +197,18 @@ if test "x$ztex" = xyes; then
 
															 fi
														
 
															 AM_CONDITIONAL([HAS_ZTEX], [test x$ztex = xyes])
														
 
															+
														
 
															+scrypt="no"
														
 
															+
														
 
															+AC_ARG_ENABLE([scrypt],
														
 
															+	[AC_HELP_STRING([--enable-scrypt],[Compile support for scrypt mining (default disabled)])],
														
 
															+	[scrypt=$enableval]
														
 
															+	)
														
 
															+if test "x$scrypt" = xyes; then
														
 
															+	AC_DEFINE([USE_SCRYPT], [1], [Defined to 1 if scrypt support is wanted])
														
 
															+fi
														
 
															+
														
 
															+
														
 
															 curses="auto"
														
 
															 AC_ARG_WITH([curses],
														
@@ -223,6 +235,7 @@ fi
 
															 AM_CONDITIONAL([NEED_FPGAUTILS], [test x$icarus$bitforce$modminer$ztex != xnononono])
														
 
															+AM_CONDITIONAL([HAS_SCRYPT], [test x$scrypt = xyes])
														
 
															 AM_CONDITIONAL([HAVE_CURSES], [test x$curses = xyes])
														
 
															 AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
														
 
															 AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
														
@@ -343,10 +356,11 @@ fi
 
															 AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to bfgminer install])
														
 
															-AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel])
														
 
															-AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120327"], [Filename for poclbm kernel])
														
 
															-AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120427"], [Filename for diakgcn kernel])
														
 
															-AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120328"], [Filename for diablo kernel])
														
 
															+AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120724"], [Filename for phatk kernel])
														
 
															+AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120724"], [Filename for poclbm kernel])
														
 
															+AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120724"], [Filename for diakgcn kernel])
														
 
															+AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120724"], [Filename for diablo kernel])
														
 
															+AC_DEFINE_UNQUOTED([SCRYPT_KERNNAME], ["scrypt120724"], [Filename for scrypt kernel])
														
 
															 AC_SUBST(JANSSON_LIBS)
														
@@ -385,6 +399,11 @@ echo "Configuration Options Summary:"
 
															 echo
														
 
															 echo "  curses.TUI...........: $cursesmsg"
														
 
															+if test "x$scrypt" != xno; then
														
 
															+	echo "  scrypt...............: Enabled"
														
 
															+else
														
 
															+	echo "  scrypt...............: Disabled"
														
 
															+fi
														
 
															 echo
														
--- a/diablo120724.cl
+++ b/diablo120724.cl
@@ -1242,8 +1242,8 @@ void search(
 
															     ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]);
														
 
															-#define FOUND (0x80)
														
 
															-#define NFLAG (0x7F)
														
 
															+#define FOUND (0x800)
														
 
															+#define NFLAG (0x7FF)
														
 
															 #if defined(VECTORS4)
														
 
															 	bool result = any(ZA[924] == 0x136032EDU);
														
--- a/diakgcn120724.cl
+++ b/diakgcn120724.cl
@@ -571,8 +571,8 @@ __kernel
 
															 	V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
														
 
															-#define FOUND (0x80)
														
 
															-#define NFLAG (0x7F)
														
 
															+#define FOUND (0x800)
														
 
															+#define NFLAG (0x7FF)
														
 
															 #ifdef VECTORS4
														
 
															 	if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU))
														
--- a/driver-cpu.c
+++ b/driver-cpu.c
@@ -131,6 +131,9 @@ extern bool scanhash_sse2_32(struct thr_info*, const unsigned char *pmidstate, u
 
															 	uint32_t max_nonce, uint32_t *last_nonce,
														
 
															 	uint32_t nonce);
														
 
															+extern bool scanhash_scrypt(struct thr_info *thr, int thr_id, unsigned char *pdata, unsigned char *scratchbuf,
														
 
															+	const unsigned char *ptarget,
														
 
															+	uint32_t max_nonce, unsigned long *hashes_done);
														
@@ -161,6 +164,9 @@ const char *algo_names[] = {
 
															 #ifdef WANT_ALTIVEC_4WAY
														
 
															     [ALGO_ALTIVEC_4WAY] = "altivec_4way",
														
 
															 #endif
														
 
															+#ifdef WANT_SCRYPT
														
 
															+    [ALGO_SCRYPT] = "scrypt",
														
 
															+#endif
														
 
															 };
														
 
															 static const sha256_func sha256_funcs[] = {
														
@@ -185,7 +191,10 @@ static const sha256_func sha256_funcs[] = {
 
															 	[ALGO_SSE2_64]		= (sha256_func)scanhash_sse2_64,
														
 
															 #endif
														
 
															 #ifdef WANT_X8664_SSE4
														
 
															-	[ALGO_SSE4_64]		= (sha256_func)scanhash_sse4_64
														
 
															+	[ALGO_SSE4_64]		= (sha256_func)scanhash_sse4_64,
														
 
															+#endif
														
 
															+#ifdef WANT_SCRYPT
														
 
															+	[ALGO_SCRYPT]		= (sha256_func)scanhash_scrypt
														
 
															 #endif
														
 
															 };
														
 
															 #endif
														
@@ -662,6 +671,9 @@ char *set_algo(const char *arg, enum sha256_algos *algo)
 
															 {
														
 
															 	enum sha256_algos i;
														
 
															+	if (opt_scrypt)
														
 
															+		return "Can only use scrypt algorithm";
														
 
															+
														
 
															 	if (!strcmp(arg, "auto")) {
														
 
															 		*algo = pick_fastest_algo();
														
 
															 		return NULL;
														
@@ -676,6 +688,13 @@ char *set_algo(const char *arg, enum sha256_algos *algo)
 
															 	return "Unknown algorithm";
														
 
															 }
														
 
															+#ifdef WANT_SCRYPT
														
 
															+void set_scrypt_algo(enum sha256_algos *algo)
														
 
															+{
														
 
															+	*algo = ALGO_SCRYPT;
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															 void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo)
														
 
															 {
														
 
															 	strncpy(buf, algo_names[*algo], OPT_SHOW_LEN);
														
@@ -765,7 +784,7 @@ static bool cpu_thread_prepare(struct thr_info *thr)
 
															 static uint64_t cpu_can_limit_work(__maybe_unused struct thr_info *thr)
														
 
															 {
														
 
															-	return 0xfffff;
														
 
															+	return 0xffff;
														
 
															 }
														
 
															 static bool cpu_thread_init(struct thr_info *thr)
														
--- a/driver-cpu.h
+++ b/driver-cpu.h
@@ -34,6 +34,10 @@
 
															 #define WANT_X8664_SSE4 1
														
 
															 #endif
														
 
															+#ifdef USE_SCRYPT
														
 
															+#define WANT_SCRYPT
														
 
															+#endif
														
 
															+
														
 
															 enum sha256_algos {
														
 
															 	ALGO_C,			/* plain C */
														
 
															 	ALGO_4WAY,		/* parallel SSE2 */
														
@@ -44,6 +48,7 @@ enum sha256_algos {
 
															 	ALGO_SSE2_64,		/* SSE2 for x86_64 */
														
 
															 	ALGO_SSE4_64,		/* SSE4 for x86_64 */
														
 
															 	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
														
 
															+	ALGO_SCRYPT,		/* scrypt */
														
 
															 };
														
 
															 extern const char *algo_names[];
														
@@ -55,5 +60,6 @@ extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo);
 
															 extern char *force_nthreads_int(const char *arg, int *i);
														
 
															 extern void init_max_name_len();
														
 
															 extern double bench_algo_stage3(enum sha256_algos algo);
														
 
															+extern void set_scrypt_algo(enum sha256_algos *algo);
														
 
															 #endif /* __DEVICE_CPU_H__ */
														
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -344,6 +344,83 @@ char *set_worksize(char *arg)
 
															 	return NULL;
														
 
															 }
														
 
															+#ifdef USE_SCRYPT
														
 
															+char *set_shaders(char *arg)
														
 
															+{
														
 
															+	int i, val = 0, device = 0;
														
 
															+	char *nextptr;
														
 
															+
														
 
															+	nextptr = strtok(arg, ",");
														
 
															+	if (nextptr == NULL)
														
 
															+		return "Invalid parameters for set lookup gap";
														
 
															+	val = atoi(nextptr);
														
 
															+
														
 
															+	gpus[device++].shaders = val;
														
 
															+
														
 
															+	while ((nextptr = strtok(NULL, ",")) != NULL) {
														
 
															+		val = atoi(nextptr);
														
 
															+
														
 
															+		gpus[device++].shaders = val;
														
 
															+	}
														
 
															+	if (device == 1) {
														
 
															+		for (i = device; i < MAX_GPUDEVICES; i++)
														
 
															+			gpus[i].shaders = gpus[0].shaders;
														
 
															+	}
														
 
															+
														
 
															+	return NULL;
														
 
															+}
														
 
															+
														
 
															+char *set_lookup_gap(char *arg)
														
 
															+{
														
 
															+	int i, val = 0, device = 0;
														
 
															+	char *nextptr;
														
 
															+
														
 
															+	nextptr = strtok(arg, ",");
														
 
															+	if (nextptr == NULL)
														
 
															+		return "Invalid parameters for set lookup gap";
														
 
															+	val = atoi(nextptr);
														
 
															+
														
 
															+	gpus[device++].lookup_gap = val;
														
 
															+
														
 
															+	while ((nextptr = strtok(NULL, ",")) != NULL) {
														
 
															+		val = atoi(nextptr);
														
 
															+
														
 
															+		gpus[device++].lookup_gap = val;
														
 
															+	}
														
 
															+	if (device == 1) {
														
 
															+		for (i = device; i < MAX_GPUDEVICES; i++)
														
 
															+			gpus[i].lookup_gap = gpus[0].lookup_gap;
														
 
															+	}
														
 
															+
														
 
															+	return NULL;
														
 
															+}
														
 
															+
														
 
															+char *set_thread_concurrency(char *arg)
														
 
															+{
														
 
															+	int i, val = 0, device = 0;
														
 
															+	char *nextptr;
														
 
															+
														
 
															+	nextptr = strtok(arg, ",");
														
 
															+	if (nextptr == NULL)
														
 
															+		return "Invalid parameters for set thread concurrency";
														
 
															+	val = atoi(nextptr);
														
 
															+
														
 
															+	gpus[device++].thread_concurrency = val;
														
 
															+
														
 
															+	while ((nextptr = strtok(NULL, ",")) != NULL) {
														
 
															+		val = atoi(nextptr);
														
 
															+
														
 
															+		gpus[device++].thread_concurrency = val;
														
 
															+	}
														
 
															+	if (device == 1) {
														
 
															+		for (i = device; i < MAX_GPUDEVICES; i++)
														
 
															+			gpus[i].thread_concurrency = gpus[0].thread_concurrency;
														
 
															+	}
														
 
															+
														
 
															+	return NULL;
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															 static enum cl_kernels select_kernel(char *arg)
														
 
															 {
														
 
															 	if (!strcmp(arg, "diablo"))
														
@@ -354,6 +431,10 @@ static enum cl_kernels select_kernel(char *arg)
 
															 		return KL_POCLBM;
														
 
															 	if (!strcmp(arg, "phatk"))
														
 
															 		return KL_PHATK;
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (!strcmp(arg, "scrypt"))
														
 
															+		return KL_SCRYPT;
														
 
															+#endif
														
 
															 	return KL_NONE;
														
 
															 }
														
@@ -363,6 +444,8 @@ char *set_kernel(char *arg)
 
															 	int i, device = 0;
														
 
															 	char *nextptr;
														
 
															+	if (opt_scrypt)
														
 
															+		return "Cannot use sha256 kernel with scrypt";
														
 
															 	nextptr = strtok(arg, ",");
														
 
															 	if (nextptr == NULL)
														
 
															 		return "Invalid parameters for set kernel";
														
@@ -1203,11 +1286,40 @@ static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint t
 
															 	return status;
														
 
															 }
														
 
															+#ifdef USE_SCRYPT
														
 
															+static cl_int queue_scrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
														
 
															+{
														
 
															+	unsigned char *midstate = blk->work->midstate;
														
 
															+	cl_kernel *kernel = &clState->kernel;
														
 
															+	unsigned int num = 0;
														
 
															+	cl_uint le_target;
														
 
															+	cl_int status = 0;
														
 
															+
														
 
															+	le_target = *(cl_uint *)(blk->work->target + 28);
														
 
															+	clState->cldata = blk->work->data;
														
 
															+	status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL,NULL);
														
 
															+
														
 
															+	CL_SET_ARG(clState->CLbuffer0);
														
 
															+	CL_SET_ARG(clState->outputBuffer);
														
 
															+	CL_SET_ARG(clState->padbuffer8);
														
 
															+	CL_SET_VARG(4, &midstate[0]);
														
 
															+	CL_SET_VARG(4, &midstate[16]);
														
 
															+	CL_SET_ARG(le_target);
														
 
															+
														
 
															+	return status;
														
 
															+}
														
 
															+#endif
														
 
															+
														
 
															 static void set_threads_hashes(unsigned int vectors, unsigned int *threads,
														
 
															 			       int64_t *hashes, size_t *globalThreads,
														
 
															 			       unsigned int minthreads, int intensity)
														
 
															 {
														
 
															-	*threads = 1 << (15 + intensity);
														
 
															+	if (opt_scrypt) {
														
 
															+		if (intensity < 0)
														
 
															+			intensity = 0;
														
 
															+		*threads = 1 << intensity;
														
 
															+	} else
														
 
															+		*threads = 1 << (15 + intensity);
														
 
															 	if (*threads < minthreads)
														
 
															 		*threads = minthreads;
														
 
															 	*globalThreads = *threads;
														
@@ -1490,19 +1602,25 @@ static bool opencl_thread_prepare(struct thr_info *thr)
 
															 	if (!cgpu->kname)
														
 
															 	{
														
 
															 		switch (clStates[i]->chosen_kernel) {
														
 
															-		case KL_DIABLO:
														
 
															-			cgpu->kname = "diablo";
														
 
															-			break;
														
 
															-		case KL_DIAKGCN:
														
 
															-			cgpu->kname = "diakgcn";
														
 
															-			break;
														
 
															-		case KL_PHATK:
														
 
															-			cgpu->kname = "phatk";
														
 
															-			break;
														
 
															-		case KL_POCLBM:
														
 
															-			cgpu->kname = "poclbm";
														
 
															-		default:
														
 
															-			break;
														
 
															+			case KL_DIABLO:
														
 
															+				cgpu->kname = "diablo";
														
 
															+				break;
														
 
															+			case KL_DIAKGCN:
														
 
															+				cgpu->kname = "diakgcn";
														
 
															+				break;
														
 
															+			case KL_PHATK:
														
 
															+				cgpu->kname = "phatk";
														
 
															+				break;
														
 
															+#ifdef USE_SCRYPT
														
 
															+			case KL_SCRYPT:
														
 
															+				cgpu->kname = "scrypt";
														
 
															+				break;
														
 
															+#endif
														
 
															+			case KL_POCLBM:
														
 
															+				cgpu->kname = "poclbm";
														
 
															+				break;
														
 
															+			default:
														
 
															+				break;
														
 
															 		}
														
 
															 	}
														
 
															 	applog(LOG_INFO, "initCl() finished. Found %s", name);
														
@@ -1520,7 +1638,7 @@ static bool opencl_thread_init(struct thr_info *thr)
 
															 	struct cgpu_info *gpu = thr->cgpu;
														
 
															 	struct opencl_thread_data *thrdata;
														
 
															 	_clState *clState = clStates[thr_id];
														
 
															-	cl_int status;
														
 
															+	cl_int status = 0;
														
 
															 	thrdata = calloc(1, sizeof(*thrdata));
														
 
															 	thr->cgpu_data = thrdata;
														
@@ -1539,6 +1657,11 @@ static bool opencl_thread_init(struct thr_info *thr)
 
															 		case KL_DIAKGCN:
														
 
															 			thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
														
 
															 			break;
														
 
															+#ifdef USE_SCRYPT
														
 
															+		case KL_SCRYPT:
														
 
															+			thrdata->queue_kernel_parameters = &queue_scrypt_kernel;
														
 
															+			break;
														
 
															+#endif
														
 
															 		default:
														
 
															 		case KL_DIABLO:
														
 
															 			thrdata->queue_kernel_parameters = &queue_diablo_kernel;
														
@@ -1553,7 +1676,7 @@ static bool opencl_thread_init(struct thr_info *thr)
 
															 		return false;
														
 
															 	}
														
 
															-	status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
														
 
															+	status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
														
 
															 			BUFFERSIZE, blank_res, 0, NULL, NULL);
														
 
															 	if (unlikely(status != CL_SUCCESS)) {
														
 
															 		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
														
@@ -1582,7 +1705,12 @@ static void opencl_free_work(struct thr_info *thr, struct work *work)
 
															 static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work)
														
 
															 {
														
 
															-	precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64));
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (opt_scrypt)
														
 
															+		work->blk.work = work;
														
 
															+	else
														
 
															+#endif
														
 
															+		precalc_hash(&work->blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64));
														
 
															 	return true;
														
 
															 }
														
@@ -1597,6 +1725,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 
															 	_clState *clState = clStates[thr_id];
														
 
															 	const cl_kernel *kernel = &clState->kernel;
														
 
															 	const int dynamic_us = opt_dynamic_interval * 1000;
														
 
															+	cl_bool blocking;
														
 
															 	cl_int status;
														
 
															 	size_t globalThreads[1];
														
@@ -1604,14 +1733,20 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 
															 	unsigned int threads;
														
 
															 	int64_t hashes;
														
 
															+	if (gpu->dynamic)
														
 
															+		blocking = CL_TRUE;
														
 
															+	else
														
 
															+		blocking = CL_FALSE;
														
 
															+
														
 
															 	/* This finish flushes the readbuffer set with CL_FALSE later */
														
 
															-	clFinish(clState->commandQueue);
														
 
															-	gettimeofday(&gpu->tv_gpuend, NULL);
														
 
															+	if (!blocking)
														
 
															+		clFinish(clState->commandQueue);
														
 
															 	if (gpu->dynamic) {
														
 
															 		struct timeval diff;
														
 
															 		suseconds_t gpu_us;
														
 
															+		gettimeofday(&gpu->tv_gpuend, NULL);
														
 
															 		timersub(&gpu->tv_gpuend, &gpu->tv_gpustart, &diff);
														
 
															 		gpu_us = diff.tv_sec * 1000000 + diff.tv_usec;
														
 
															 		if (likely(gpu_us >= 0)) {
														
@@ -1633,6 +1768,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 
															 			   localThreads[0], gpu->intensity);
														
 
															 	if (hashes > gpu->max_hashes)
														
 
															 		gpu->max_hashes = hashes;
														
 
															+
														
 
															 	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
														
 
															 	if (unlikely(status != CL_SUCCESS)) {
														
 
															 		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
														
@@ -1642,7 +1778,7 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 
															 	/* MAXBUFFERS entry is used as a flag to say nonces exist */
														
 
															 	if (thrdata->res[FOUND]) {
														
 
															 		/* Clear the buffer again */
														
 
															-		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
														
 
															+		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0,
														
 
															 				BUFFERSIZE, blank_res, 0, NULL, NULL);
														
 
															 		if (unlikely(status != CL_SUCCESS)) {
														
 
															 			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
														
@@ -1657,7 +1793,8 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 
															 			postcalc_hash_async(thr, work, thrdata->res);
														
 
															 		}
														
 
															 		memset(thrdata->res, 0, BUFFERSIZE);
														
 
															-		clFinish(clState->commandQueue);
														
 
															+		if (!blocking)
														
 
															+			clFinish(clState->commandQueue);
														
 
															 	}
														
 
															 	gettimeofday(&gpu->tv_gpustart, NULL);
														
@@ -1672,14 +1809,14 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 
															 		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
														
 
															 						globalThreads, localThreads, 0,  NULL, NULL);
														
 
															 	if (unlikely(status != CL_SUCCESS)) {
														
 
															-		applog(LOG_ERR, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)");
														
 
															+		applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status);
														
 
															 		return -1;
														
 
															 	}
														
 
															-	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
														
 
															+	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, blocking, 0,
														
 
															 			BUFFERSIZE, thrdata->res, 0, NULL, NULL);
														
 
															 	if (unlikely(status != CL_SUCCESS)) {
														
 
															-		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)");
														
 
															+		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status);
														
 
															 		return -1;
														
 
															 	}
														
--- a/driver-opencl.h
+++ b/driver-opencl.h
@@ -18,6 +18,11 @@ extern char *set_temp_target(char *arg);
 
															 extern char *set_intensity(char *arg);
														
 
															 extern char *set_vector(char *arg);
														
 
															 extern char *set_worksize(char *arg);
														
 
															+#ifdef USE_SCRYPT
														
 
															+extern char *set_shaders(char *arg);
														
 
															+extern char *set_lookup_gap(char *arg);
														
 
															+extern char *set_thread_concurrency(char *arg);
														
 
															+#endif
														
 
															 extern char *set_kernel(char *arg);
														
 
															 void manage_gpu(void);
														
 
															 extern void pause_dynamic_threads(int gpu);
														
--- a/findnonce.c
+++ b/findnonce.c
@@ -45,7 +45,8 @@ const uint32_t SHA256_K[64] = {
 
															 	d = d + h; \
														
 
															 	h = h + (rotate(a, 30) ^ rotate(a, 19) ^ rotate(a, 10)) + ((a & b) | (c & (a | b)))
														
 
															-void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
														
 
															+void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data)
														
 
															+{
														
 
															 	cl_uint A, B, C, D, E, F, G, H;
														
 
															 	A = state[0];
														
@@ -228,9 +229,18 @@ static void *postcalc_hash(void *userdata)
 
															 	pthread_detach(pthread_self());
														
 
															 	for (entry = 0; entry < FOUND; entry++) {
														
 
															-		if (pcd->res[entry])
														
 
															-			send_nonce(pcd, pcd->res[entry]);
														
 
															+		uint32_t nonce = pcd->res[entry];
														
 
															+
														
 
															+		if (nonce) {
														
 
															+			applog(LOG_DEBUG, "OCL NONCE %u", nonce);
														
 
															+#ifdef USE_SCRYPT
														
 
															+			if (opt_scrypt)
														
 
															+				submit_nonce(thr, pcd->work, nonce);
														
 
															+			else
														
 
															+#endif
														
 
															+				send_nonce(pcd, nonce);
														
 
															 		nonces++;
														
 
															+		}
														
 
															 	}
														
 
															 	free(pcd);
														
--- a/findnonce.h
+++ b/findnonce.h
@@ -4,10 +4,10 @@
 
															 #include "config.h"
														
 
															 #define MAXTHREADS (0xFFFFFFFEULL)
														
 
															-#define MAXBUFFERS (0xFF)
														
 
															+#define MAXBUFFERS (0xFFF)
														
 
															 #define BUFFERSIZE (sizeof(uint32_t) * MAXBUFFERS)
														
 
															-#define FOUND (0x80)
														
 
															-/* #define NFLAG (0x7F) Just for reference */
														
 
															+#define FOUND (0x800)
														
 
															+/* #define NFLAG (0x7FF) Just for reference */
														
 
															 #ifdef HAVE_OPENCL
														
 
															 extern void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data);
														
--- a/make-release
+++ b/make-release
@@ -31,7 +31,7 @@ cd ..
 
															 zip -r "$IDIR/${sw}.zip" "$sw"
														
 
															 tar cjvpf "$IDIR/${sw}.tbz2" "$sw"
														
 
															 SRCDIR="$TMPDIR/$sw"
														
 
															-for txt in AUTHORS COPYING NEWS README API-README FPGA-README; do
														
 
															+for txt in AUTHORS COPYING NEWS README API-README FPGA-README SCRYPT-README; do
														
 
															 	cp -v "$txt" "$OUTDIR/${txt}.txt"
														
 
															 done
														
 
															 cp -av "bitstreams" "$OUTDIR/"
														
--- a/miner.c
+++ b/miner.c
@@ -109,6 +109,12 @@ int nDevs;
 
															 int opt_g_threads = 2;
														
 
															 int gpu_threads;
														
 
															 #endif
														
 
															+#ifdef USE_SCRYPT
														
 
															+static char detect_algo = 1;
														
 
															+bool opt_scrypt;
														
 
															+#else
														
 
															+static char detect_algo;
														
 
															+#endif
														
 
															 bool opt_restart = true;
														
 
															 static bool opt_nogpu;
														
@@ -852,6 +858,11 @@ static struct opt_table opt_config_table[] = {
 
															 	OPT_WITH_ARG("--gpu-vddc",
														
 
															 		     set_gpu_vddc, NULL, NULL,
														
 
															 		     "Set the GPU voltage in Volts - one value for all or separate by commas for per card"),
														
 
															+#endif
														
 
															+#ifdef USE_SCRYPT
														
 
															+	OPT_WITH_ARG("--lookup-gap",
														
 
															+		     set_lookup_gap, NULL, NULL,
														
 
															+		     "Set GPU lookup gap for scrypt mining, comma separated"),
														
 
															 #endif
														
 
															 	OPT_WITH_ARG("--intensity|-I",
														
 
															 		     set_intensity, NULL, NULL,
														
@@ -865,7 +876,7 @@ static struct opt_table opt_config_table[] = {
 
															 #ifdef HAVE_OPENCL
														
 
															 	OPT_WITH_ARG("--kernel|-k",
														
 
															 		     set_kernel, NULL, NULL,
														
 
															-		     "Override kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"),
														
 
															+		     "Override sha256 kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated"),
														
 
															 #endif
														
 
															 #ifdef USE_ICARUS
														
 
															 	OPT_WITH_ARG("--icarus-timing",
														
@@ -958,6 +969,16 @@ static struct opt_table opt_config_table[] = {
 
															 	OPT_WITH_ARG("--sched-stop",
														
 
															 		     set_schedtime, NULL, &schedstop,
														
 
															 		     "Set a time of day in HH:MM to stop mining (will quit without a start time)"),
														
 
															+#ifdef USE_SCRYPT
														
 
															+	OPT_WITHOUT_ARG("--scrypt",
														
 
															+			opt_set_bool, &opt_scrypt,
														
 
															+			"Use the scrypt algorithm for mining (non-bitcoin)"),
														
 
															+#ifdef HAVE_OPENCL
														
 
															+	OPT_WITH_ARG("--shaders",
														
 
															+		     set_shaders, NULL, NULL,
														
 
															+		     "GPU shaders per card for tuning scrypt, comma separated"),
														
 
															+#endif
														
 
															+#endif
														
 
															 	OPT_WITH_ARG("--sharelog",
														
 
															 		     set_sharelog, NULL, NULL,
														
 
															 		     "Append share log to file"),
														
@@ -999,6 +1020,11 @@ static struct opt_table opt_config_table[] = {
 
															 			opt_hidden
														
 
															 #endif
														
 
															 	),
														
 
															+#if defined(USE_SCRYPT) && defined(HAVE_OPENCL)
														
 
															+	OPT_WITH_ARG("--thread-concurrency",
														
 
															+		     set_thread_concurrency, NULL, NULL,
														
 
															+		     "Set GPU thread concurrency for scrypt mining, comma separated"),
														
 
															+#endif
														
 
															 	OPT_WITH_ARG("--url|-o",
														
 
															 		     set_url, NULL, NULL,
														
 
															 		     "URL for bitcoin JSON-RPC server"),
														
@@ -1280,6 +1306,13 @@ static bool work_decode(const json_t *val, struct work *work)
 
															 {
														
 
															 	unsigned char bits = 0, i;
														
 
															+	if (unlikely(detect_algo == 1)) {
														
 
															+		json_t *tmp = json_object_get(val, "algorithm");
														
 
															+		const char *v = tmp ? json_string_value(tmp) : "";
														
 
															+		if (strncasecmp(v, "scrypt", 6))
														
 
															+			detect_algo = 2;
														
 
															+	}
														
 
															+	
														
 
															 	if (unlikely(!jobj_binary(val, "data", work->data, sizeof(work->data), true))) {
														
 
															 		applog(LOG_ERR, "JSON inval data");
														
 
															 		goto err_out;
														
@@ -1595,9 +1628,6 @@ static void curses_print_devstatus(int thr_id)
 
															 	char logline[255];
														
 
															 	int ypos;
														
 
															-	cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60;
														
 
															-	cgpu->utility_diff1 = cgpu->accepted_weighed / ( total_secs ?: 1 ) * 60;
														
 
															-
														
 
															 	/* Check this isn't out of the window size */
														
 
															 	ypos = cgpu->cgminer_id;
														
 
															 	ypos += devsummaryYOffset;
														
@@ -1606,6 +1636,10 @@ static void curses_print_devstatus(int thr_id)
 
															 	ypos += devcursor;
														
 
															 	if (ypos >= statusy - 1)
														
 
															 		return;
														
 
															+
														
 
															+	cgpu->utility = cgpu->accepted / ( total_secs ? total_secs : 1 ) * 60;
														
 
															+	cgpu->utility_diff1 = cgpu->accepted_weighed / ( total_secs ?: 1 ) * 60;
														
 
															+
														
 
															 	if (wmove(statuswin, ypos, 0) == ERR)
														
 
															 		return;
														
 
															 	wprintw(statuswin, " %s %*d: ", cgpu->api->name, dev_width, cgpu->device_id);
														
@@ -1899,8 +1933,13 @@ static bool submit_upstream_work(const struct work *work, CURL *curl)
 
															 	if (!QUIET) {
														
 
															 		hash32 = (uint32_t *)(work->hash);
														
 
															-		sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]),
														
 
															-			work->block? " BLOCK!" : "");
														
 
															+		if (opt_scrypt) {
														
 
															+			sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[7]), (unsigned long)(hash32[6]),
														
 
															+				work->block? " BLOCK!" : "");
														
 
															+		} else {
														
 
															+			sprintf(hashshow, "%08lx.%08lx%s", (unsigned long)(hash32[6]), (unsigned long)(hash32[5]),
														
 
															+				work->block? " BLOCK!" : "");
														
 
															+		}
														
 
															 	}
														
 
															 	/* Theoretically threads could race when modifying accepted and
														
@@ -3149,6 +3188,11 @@ void write_config(FILE *fcfg)
 
															 				case KL_DIABLO:
														
 
															 					fprintf(fcfg, "diablo");
														
 
															 					break;
														
 
															+#ifdef USE_SCRYPT
														
 
															+				case KL_SCRYPT:
														
 
															+					fprintf(fcfg, "scrypt");
														
 
															+					break;
														
 
															+#endif
														
 
															 			}
														
 
															 		}
														
 
															 #ifdef HAVE_ADL
														
@@ -4326,6 +4370,13 @@ bool hashtest(const struct work *work, bool checktarget)
 
															 bool test_nonce(struct work *work, uint32_t nonce, bool checktarget)
														
 
															 {
														
 
															+	if (opt_scrypt) {
														
 
															+		uint32_t *work_nonce = (uint32_t *)(work->data + 64 + 12);
														
 
															+
														
 
															+		*work_nonce = nonce;
														
 
															+		return true;
														
 
															+	}
														
 
															+
														
 
															 	work->data[64 + 12 + 0] = (nonce >> 0) & 0xff;
														
 
															 	work->data[64 + 12 + 1] = (nonce >> 8) & 0xff;
														
 
															 	work->data[64 + 12 + 2] = (nonce >> 16) & 0xff;
														
@@ -4339,7 +4390,7 @@ bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce)
 
															 	/* Do one last check before attempting to submit the work */
														
 
															 	/* Side effect: sets work->data for us */
														
 
															 	if (!test_nonce(work, nonce, true)) {
														
 
															-		applog(LOG_INFO, "Share below target");
														
 
															+		applog(LOG_INFO, "Pool %d share below target", work->pool->pool_no);
														
 
															 		return true;
														
 
															 	}
														
 
															 	return submit_work_sync(thr, work);
														
@@ -5607,6 +5658,11 @@ int main(int argc, char *argv[])
 
															 		opt_log_output = true;
														
 
															 #ifdef WANT_CPUMINE
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (opt_scrypt)
														
 
															+		set_scrypt_algo(&opt_algo);
														
 
															+	else
														
 
															+#endif
														
 
															 	if (0 <= opt_bench_algo) {
														
 
															 		double rate = bench_algo_stage3(opt_bench_algo);
														
@@ -5856,6 +5912,14 @@ int main(int argc, char *argv[])
 
															 		}
														
 
															 	} while (!pools_active);
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (detect_algo == 1 && !opt_scrypt) {
														
 
															+		applog(LOG_NOTICE, "Detected scrypt algorithm");
														
 
															+		opt_scrypt = true;
														
 
															+	}
														
 
															+#endif
														
 
															+	detect_algo = 0;
														
 
															+
														
 
															 begin_bench:
														
 
															 	total_mhashes_done = 0;
														
 
															 	for (i = 0; i < total_devices; i++) {
														
--- a/miner.h
+++ b/miner.h
@@ -261,6 +261,7 @@ enum cl_kernels {
 
															 	KL_PHATK,
														
 
															 	KL_DIAKGCN,
														
 
															 	KL_DIABLO,
														
 
															+	KL_SCRYPT,
														
 
															 };
														
 
															 enum dev_reason {
														
@@ -357,10 +358,17 @@ struct cgpu_info {
 
															 	int virtual_adl;
														
 
															 	int intensity;
														
 
															 	bool dynamic;
														
 
															+
														
 
															 	cl_uint vwidth;
														
 
															 	size_t work_size;
														
 
															 	enum cl_kernels kernel;
														
 
															+	cl_ulong max_alloc;
														
 
															+#ifdef USE_SCRYPT
														
 
															+	int lookup_gap;
														
 
															+	int thread_concurrency;
														
 
															+	int shaders;
														
 
															+#endif
														
 
															 	struct timeval tv_gpustart;;
														
 
															 	struct timeval tv_gpuend;
														
 
															 	double gpu_us_average;
														
@@ -612,8 +620,13 @@ extern void add_pool_details(bool live, char *url, char *user, char *pass);
 
															 #define MIN_INTENSITY -10
														
 
															 #define _MIN_INTENSITY_STR "-10"
														
 
															+#ifdef USE_SCRYPT
														
 
															+#define MAX_INTENSITY 20
														
 
															+#define _MAX_INTENSITY_STR "20"
														
 
															+#else
														
 
															 #define MAX_INTENSITY 14
														
 
															 #define _MAX_INTENSITY_STR "14"
														
 
															+#endif
														
 
															 extern struct list_head scan_devices;
														
 
															 extern int nDevs;
														
@@ -625,6 +638,11 @@ extern bool opt_quiet;
 
															 extern struct thr_info *thr_info;
														
 
															 extern struct cgpu_info gpus[MAX_GPUDEVICES];
														
 
															 extern int gpu_threads;
														
 
															+#ifdef USE_SCRYPT
														
 
															+extern bool opt_scrypt;
														
 
															+#else
														
 
															+#define opt_scrypt (0)
														
 
															+#endif
														
 
															 extern double total_secs;
														
 
															 extern int mining_threads;
														
 
															 extern struct cgpu_info *cpus;
														
@@ -672,6 +690,9 @@ typedef struct {
 
															 	cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17;
														
 
															 	cl_uint zeroA, zeroB;
														
 
															 	cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA;
														
 
															+#ifdef USE_SCRYPT
														
 
															+	struct work *work;
														
 
															+#endif
														
 
															 } dev_blk_ctx;
														
 
															 #else
														
 
															 typedef struct {
														
--- a/mkinstalldirs
+++ b/mkinstalldirs
@@ -81,9 +81,9 @@ case $dirmode in
 
															       echo "mkdir -p -- $*"
														
 
															       exec mkdir -p -- "$@"
														
 
															     else
														
 
															-      # On NextStep and OpenStep, the `mkdir' command does not
														
 
															+      # On NextStep and OpenStep, the 'mkdir' command does not
														
 
															       # recognize any option.  It will interpret all options as
														
 
															-      # directories to create, and then abort because `.' already
														
 
															+      # directories to create, and then abort because '.' already
														
 
															       # exists.
														
 
															       test -d ./-p && rmdir ./-p
														
 
															       test -d ./--version && rmdir ./--version
														
--- a/ocl.c
+++ b/ocl.c
@@ -385,6 +385,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 {
														
 
															 	_clState *clState = calloc(1, sizeof(_clState));
														
 
															 	bool patchbfi = false, prog_built = false;
														
 
															+	struct cgpu_info *cgpu = &gpus[gpu];
														
 
															 	cl_platform_id platform = NULL;
														
 
															 	char pbuff[256], vbuff[255];
														
 
															 	cl_platform_id* platforms;
														
@@ -486,6 +487,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 		return NULL;
														
 
															 	}
														
 
															+	/////////////////////////////////////////////////////////////////
														
 
															+	// Create an OpenCL command queue
														
 
															+	/////////////////////////////////////////////////////////////////
														
 
															+	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
														
 
															+						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
														
 
															+	if (status != CL_SUCCESS) /* Try again without OOE enable */
														
 
															+		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
														
 
															+	if (status != CL_SUCCESS) {
														
 
															+		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
														
 
															+		return NULL;
														
 
															+	}
														
 
															+
														
 
															 	/* Check for BFI INT support. Hopefully people don't mix devices with
														
 
															 	 * and without it! */
														
 
															 	char * extensions = malloc(1024);
														
@@ -528,18 +541,30 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 	}
														
 
															 	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);
														
 
															+	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL);
														
 
															+	if (status != CL_SUCCESS) {
														
 
															+		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status);
														
 
															+		return NULL;
														
 
															+	}
														
 
															+	applog(LOG_DEBUG, "Max mem alloc size is %u", cgpu->max_alloc);
														
 
															+
														
 
															 	/* Create binary filename based on parameters passed to opencl
														
 
															 	 * compiler to ensure we only load a binary that matches what would
														
 
															 	 * have otherwise created. The filename is:
														
 
															 	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
														
 
															+	 * For scrypt the filename is:
														
 
															+	 * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin
														
 
															 	 */
														
 
															 	char binaryfilename[255];
														
 
															 	char filename[255];
														
 
															 	char numbuf[10];
														
 
															-	if (gpus[gpu].kernel == KL_NONE) {
														
 
															-		/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
														
 
															-		if (!strstr(name, "Tahiti") &&
														
 
															+	if (cgpu->kernel == KL_NONE) {
														
 
															+		if (opt_scrypt) {
														
 
															+			applog(LOG_INFO, "Selecting scrypt kernel");
														
 
															+			clState->chosen_kernel = KL_SCRYPT;
														
 
															+		} else if (!strstr(name, "Tahiti") &&
														
 
															+			/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
														
 
															 			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
														
 
															 			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
														
 
															 			 strstr(vbuff, "831.4") ||
														
@@ -556,9 +581,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 			applog(LOG_INFO, "Selecting phatk kernel");
														
 
															 			clState->chosen_kernel = KL_PHATK;
														
 
															 		}
														
 
															-		gpus[gpu].kernel = clState->chosen_kernel;
														
 
															+		cgpu->kernel = clState->chosen_kernel;
														
 
															 	} else {
														
 
															-		clState->chosen_kernel = gpus[gpu].kernel;
														
 
															+		clState->chosen_kernel = cgpu->kernel;
														
 
															 		if (clState->chosen_kernel == KL_PHATK &&
														
 
															 		    (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") ||
														
 
															 		     strstr(vbuff, "831.4") || strstr(vbuff, "898.1") ||
														
@@ -591,6 +616,12 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 			strcpy(filename, DIAKGCN_KERNNAME".cl");
														
 
															 			strcpy(binaryfilename, DIAKGCN_KERNNAME);
														
 
															 			break;
														
 
															+		case KL_SCRYPT:
														
 
															+			strcpy(filename, SCRYPT_KERNNAME".cl");
														
 
															+			strcpy(binaryfilename, SCRYPT_KERNNAME);
														
 
															+			/* Scrypt only supports vector 1 */
														
 
															+			cgpu->vwidth = 1;
														
 
															+			break;
														
 
															 		case KL_NONE: /* Shouldn't happen */
														
 
															 		case KL_DIABLO:
														
 
															 			strcpy(filename, DIABLO_KERNNAME".cl");
														
@@ -598,24 +629,61 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 			break;
														
 
															 	}
														
 
															-	if (gpus[gpu].vwidth)
														
 
															-		clState->vwidth = gpus[gpu].vwidth;
														
 
															+	if (cgpu->vwidth)
														
 
															+		clState->vwidth = cgpu->vwidth;
														
 
															 	else {
														
 
															 		clState->vwidth = preferred_vwidth;
														
 
															-		gpus[gpu].vwidth = preferred_vwidth;
														
 
															+		cgpu->vwidth = preferred_vwidth;
														
 
															 	}
														
 
															-	if ((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
														
 
															-		clState->vwidth == 1 && clState->hasOpenCL11plus)
														
 
															+	if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
														
 
															+		clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt)
														
 
															 			clState->goffset = true;
														
 
															-	if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size)
														
 
															-		clState->wsize = gpus[gpu].work_size;
														
 
															+	if (cgpu->work_size && cgpu->work_size <= clState->max_work_size)
														
 
															+		clState->wsize = cgpu->work_size;
														
 
															 	else if (strstr(name, "Tahiti"))
														
 
															 		clState->wsize = 64;
														
 
															 	else
														
 
															 		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
														
 
															-	gpus[gpu].work_size = clState->wsize;
														
 
															+	cgpu->work_size = clState->wsize;
														
 
															+
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (opt_scrypt) {
														
 
															+		cl_ulong ma = cgpu->max_alloc, mt;
														
 
															+		int pow2 = 0;
														
 
															+
														
 
															+		if (!cgpu->lookup_gap) {
														
 
															+			applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
														
 
															+			cgpu->lookup_gap = 2;
														
 
															+		}
														
 
															+		if (!cgpu->thread_concurrency) {
														
 
															+			cgpu->thread_concurrency = ma / 32768 / cgpu->lookup_gap;
														
 
															+			if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
														
 
															+				cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
														
 
															+				if (cgpu->thread_concurrency > cgpu->shaders * 5)
														
 
															+					cgpu->thread_concurrency = cgpu->shaders * 5;
														
 
															+			}
														
 
															+				
														
 
															+			applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %u",gpu,  cgpu->thread_concurrency);
														
 
															+		}
														
 
															+
														
 
															+		/* If we have memory to spare, try to find a power of 2 value
														
 
															+		 * >= required amount to map nicely to an intensity */
														
 
															+		mt = cgpu->thread_concurrency * 32768 * cgpu->lookup_gap;
														
 
															+		if (ma > mt) {
														
 
															+			while (ma >>= 1)
														
 
															+				pow2++;
														
 
															+			ma = 1;
														
 
															+			while (--pow2 && ma < mt)
														
 
															+				ma <<= 1;
														
 
															+			if (ma >= mt) {
														
 
															+				cgpu->max_alloc = ma;
														
 
															+				applog(LOG_DEBUG, "Max alloc decreased to %lu", cgpu->max_alloc);
														
 
															+			}
														
 
															+		}
														
 
															+	}
														
 
															+#endif
														
 
															 	FILE *binaryfile;
														
 
															 	size_t *binary_sizes;
														
@@ -644,14 +712,18 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
															 	strcat(binaryfilename, name);
														
 
															 	if (clState->goffset)
														
 
															 		strcat(binaryfilename, "g");
														
 
															-	strcat(binaryfilename, "v");
														
 
															-	sprintf(numbuf, "%d", clState->vwidth);
														
 
															-	strcat(binaryfilename, numbuf);
														
 
															-	strcat(binaryfilename, "w");
														
 
															-	sprintf(numbuf, "%d", (int)clState->wsize);
														
 
															+	if (opt_scrypt) {
														
 
															+#ifdef USE_SCRYPT
														
 
															+		sprintf(numbuf, "lg%dtc%d", cgpu->lookup_gap, cgpu->thread_concurrency);
														
 
															+		strcat(binaryfilename, numbuf);
														
 
															+#endif
														
 
															+	} else {
														
 
															+		sprintf(numbuf, "v%d", clState->vwidth);
														
 
															+		strcat(binaryfilename, numbuf);
														
 
															+	}
														
 
															+	sprintf(numbuf, "w%d", (int)clState->wsize);
														
 
															 	strcat(binaryfilename, numbuf);
														
 
															-	strcat(binaryfilename, "l");
														
 
															-	sprintf(numbuf, "%d", (int)sizeof(long));
														
 
															+	sprintf(numbuf, "l%d", (int)sizeof(long));
														
 
															 	strcat(binaryfilename, numbuf);
														
 
															 	strcat(binaryfilename, ".bin");
														
@@ -712,8 +784,16 @@ build:
 
															 	/* create a cl program executable for all the devices specified */
														
 
															 	char *CompilerOptions = calloc(1, 256);
														
 
															-	sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
														
 
															-		(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (opt_scrypt)
														
 
															+		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
														
 
															+			cgpu->lookup_gap, cgpu->thread_concurrency, (int)clState->wsize);
														
 
															+	else
														
 
															+#endif
														
 
															+	{
														
 
															+		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
														
 
															+			(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
														
 
															+	}
														
 
															 	applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize);
														
 
															 	if (clState->vwidth > 1)
														
 
															 		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
														
@@ -892,18 +972,33 @@ built:
 
															 		return NULL;
														
 
															 	}
														
 
															-	/////////////////////////////////////////////////////////////////
														
 
															-	// Create an OpenCL command queue
														
 
															-	/////////////////////////////////////////////////////////////////
														
 
															-	clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu],
														
 
															-						     CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status);
														
 
															-	if (status != CL_SUCCESS) /* Try again without OOE enable */
														
 
															-		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
														
 
															-	if (status != CL_SUCCESS) {
														
 
															-		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
														
 
															-		return NULL;
														
 
															-	}
														
 
															+#ifdef USE_SCRYPT
														
 
															+	if (opt_scrypt) {
														
 
															+		size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0));
														
 
															+		size_t bufsize = 128 * ipt * cgpu->thread_concurrency;
														
 
															+
														
 
															+		/* Use the max alloc value which has been rounded to a power of
														
 
															+		 * 2 greater >= required amount earlier */
														
 
															+		if (bufsize > cgpu->max_alloc) {
														
 
															+			applog(LOG_WARNING, "Maximum buffer memory device %d supports says %u, your scrypt settings come to %u",
														
 
															+			       gpu, cgpu->max_alloc, bufsize);
														
 
															+		} else
														
 
															+			bufsize = cgpu->max_alloc;
														
 
															+		applog(LOG_DEBUG, "Creating scrypt buffer sized %d", bufsize);
														
 
															+		clState->padbufsize = bufsize;
														
 
															+		clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status);
														
 
															+		if (status != CL_SUCCESS) {
														
 
															+			applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease CT or increase LG", status);
														
 
															+			return NULL;
														
 
															+		}
														
 
															+		clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
														
 
															+		if (status != CL_SUCCESS) {
														
 
															+			applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
														
 
															+			return NULL;
														
 
															+		}
														
 
															+	}
														
 
															+#endif
														
 
															 	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
														
 
															 	if (status != CL_SUCCESS) {
														
 
															 		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
														
--- a/ocl.h
+++ b/ocl.h
@@ -15,6 +15,12 @@ typedef struct {
 
															 	cl_command_queue commandQueue;
														
 
															 	cl_program program;
														
 
															 	cl_mem outputBuffer;
														
 
															+#ifdef USE_SCRYPT
														
 
															+	cl_mem CLbuffer0;
														
 
															+	cl_mem padbuffer8;
														
 
															+	size_t padbufsize;
														
 
															+	void * cldata;
														
 
															+#endif
														
 
															 	bool hasBitAlign;
														
 
															 	bool hasOpenCL11plus;
														
 
															 	bool goffset;
														
--- a/phatk120724.cl
+++ b/phatk120724.cl
@@ -387,8 +387,8 @@ void search(	const uint state0, const uint state1, const uint state2, const uint
 
															 	W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) -
														
 
															 		(-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64)));
														
 
															-#define FOUND (0x80)
														
 
															-#define NFLAG (0x7F)
														
 
															+#define FOUND (0x800)
														
 
															+#define NFLAG (0x7FF)
														
 
															 #ifdef VECTORS4
														
 
															 	bool result = W[117].x & W[117].y & W[117].z & W[117].w;
														
--- a/poclbm120724.cl
+++ b/poclbm120724.cl
@@ -1311,8 +1311,8 @@ Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
 
															 Vals[1]+=K[59];
														
 
															 Vals[1]+=Vals[5];
														
 
															-#define FOUND (0x80)
														
 
															-#define NFLAG (0x7F)
														
 
															+#define FOUND (0x800)
														
 
															+#define NFLAG (0x7FF)
														
 
															 #if defined(VECTORS2) || defined(VECTORS4)
														
 
															 	Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
														
--- a/scrypt.c
+++ b/scrypt.c
@@ -0,0 +1,453 @@
 
															+/*-
														
 
															+ * Copyright 2009 Colin Percival, 2011 ArtForz
														
 
															+ * All rights reserved.
														
 
															+ *
														
 
															+ * Redistribution and use in source and binary forms, with or without
														
 
															+ * modification, are permitted provided that the following conditions
														
 
															+ * are met:
														
 
															+ * 1. Redistributions of source code must retain the above copyright
														
 
															+ *    notice, this list of conditions and the following disclaimer.
														
 
															+ * 2. Redistributions in binary form must reproduce the above copyright
														
 
															+ *    notice, this list of conditions and the following disclaimer in the
														
 
															+ *    documentation and/or other materials provided with the distribution.
														
 
															+ *
														
 
															+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
														
 
															+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
														
 
															+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
														
 
															+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
														
 
															+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
														
 
															+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
														
 
															+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
														
 
															+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
														
 
															+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
														
 
															+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
														
 
															+ * SUCH DAMAGE.
														
 
															+ *
														
 
															+ * This file was originally written by Colin Percival as part of the Tarsnap
														
 
															+ * online backup system.
														
 
															+ */
														
 
															+
														
 
															+#include "config.h"
														
 
															+#include "miner.h"
														
 
															+
														
 
															+#include <stdlib.h>
														
 
															+#include <stdint.h>
														
 
															+#include <string.h>
														
 
															+
														
 
															+#define byteswap(x) ((((x) << 24) & 0xff000000u) | (((x) << 8) & 0x00ff0000u) | (((x) >> 8) & 0x0000ff00u) | (((x) >> 24) & 0x000000ffu))
														
 
															+
														
 
															+typedef struct SHA256Context {
														
 
															+	uint32_t state[8];
														
 
															+	uint32_t buf[16];
														
 
															+} SHA256_CTX;
														
 
															+
														
 
															+/*
														
 
															+ * Encode a length len/4 vector of (uint32_t) into a length len vector of
														
 
															+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 4.
														
 
															+ */
														
 
															+static inline void
														
 
															+be32enc_vect(uint32_t *dst, const uint32_t *src, uint32_t len)
														
 
															+{
														
 
															+	uint32_t i;
														
 
															+
														
 
															+	for (i = 0; i < len; i++)
														
 
															+		dst[i] = byteswap(src[i]);
														
 
															+}
														
 
															+
														
 
															+/* Elementary functions used by SHA256 */
														
 
															+#define Ch(x, y, z)	((x & (y ^ z)) ^ z)
														
 
															+#define Maj(x, y, z)	((x & (y | z)) | (y & z))
														
 
															+#define SHR(x, n)	(x >> n)
														
 
															+#define ROTR(x, n)	((x >> n) | (x << (32 - n)))
														
 
															+#define S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
														
 
															+#define S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
														
 
															+#define s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
														
 
															+#define s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
														
 
															+
														
 
															+/* SHA256 round function */
														
 
															+#define RND(a, b, c, d, e, f, g, h, k)			\
														
 
															+	t0 = h + S1(e) + Ch(e, f, g) + k;		\
														
 
															+	t1 = S0(a) + Maj(a, b, c);			\
														
 
															+	d += t0;					\
														
 
															+	h  = t0 + t1;
														
 
															+
														
 
															+/* Adjusted round function for rotating state */
														
 
															+#define RNDr(S, W, i, k)			\
														
 
															+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
														
 
															+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
														
 
															+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
														
 
															+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
														
 
															+	    W[i] + k)
														
 
															+
														
 
															+/*
														
 
															+ * SHA256 block compression function.  The 256-bit state is transformed via
														
 
															+ * the 512-bit input block to produce a new state.
														
 
															+ */
														
 
															+static void
														
 
															+SHA256_Transform(uint32_t * state, const uint32_t block[16], int swap)
														
 
															+{
														
 
															+	uint32_t W[64];
														
 
															+	uint32_t S[8];
														
 
															+	uint32_t t0, t1;
														
 
															+	int i;
														
 
															+
														
 
															+	/* 1. Prepare message schedule W. */
														
 
															+	if(swap)
														
 
															+		for (i = 0; i < 16; i++)
														
 
															+			W[i] = byteswap(block[i]);
														
 
															+	else
														
 
															+		memcpy(W, block, 64);
														
 
															+	for (i = 16; i < 64; i += 2) {
														
 
															+		W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];
														
 
															+		W[i+1] = s1(W[i - 1]) + W[i - 6] + s0(W[i - 14]) + W[i - 15];
														
 
															+	}
														
 
															+
														
 
															+	/* 2. Initialize working variables. */
														
 
															+	memcpy(S, state, 32);
														
 
															+
														
 
															+	/* 3. Mix. */
														
 
															+	RNDr(S, W, 0, 0x428a2f98);
														
 
															+	RNDr(S, W, 1, 0x71374491);
														
 
															+	RNDr(S, W, 2, 0xb5c0fbcf);
														
 
															+	RNDr(S, W, 3, 0xe9b5dba5);
														
 
															+	RNDr(S, W, 4, 0x3956c25b);
														
 
															+	RNDr(S, W, 5, 0x59f111f1);
														
 
															+	RNDr(S, W, 6, 0x923f82a4);
														
 
															+	RNDr(S, W, 7, 0xab1c5ed5);
														
 
															+	RNDr(S, W, 8, 0xd807aa98);
														
 
															+	RNDr(S, W, 9, 0x12835b01);
														
 
															+	RNDr(S, W, 10, 0x243185be);
														
 
															+	RNDr(S, W, 11, 0x550c7dc3);
														
 
															+	RNDr(S, W, 12, 0x72be5d74);
														
 
															+	RNDr(S, W, 13, 0x80deb1fe);
														
 
															+	RNDr(S, W, 14, 0x9bdc06a7);
														
 
															+	RNDr(S, W, 15, 0xc19bf174);
														
 
															+	RNDr(S, W, 16, 0xe49b69c1);
														
 
															+	RNDr(S, W, 17, 0xefbe4786);
														
 
															+	RNDr(S, W, 18, 0x0fc19dc6);
														
 
															+	RNDr(S, W, 19, 0x240ca1cc);
														
 
															+	RNDr(S, W, 20, 0x2de92c6f);
														
 
															+	RNDr(S, W, 21, 0x4a7484aa);
														
 
															+	RNDr(S, W, 22, 0x5cb0a9dc);
														
 
															+	RNDr(S, W, 23, 0x76f988da);
														
 
															+	RNDr(S, W, 24, 0x983e5152);
														
 
															+	RNDr(S, W, 25, 0xa831c66d);
														
 
															+	RNDr(S, W, 26, 0xb00327c8);
														
 
															+	RNDr(S, W, 27, 0xbf597fc7);
														
 
															+	RNDr(S, W, 28, 0xc6e00bf3);
														
 
															+	RNDr(S, W, 29, 0xd5a79147);
														
 
															+	RNDr(S, W, 30, 0x06ca6351);
														
 
															+	RNDr(S, W, 31, 0x14292967);
														
 
															+	RNDr(S, W, 32, 0x27b70a85);
														
 
															+	RNDr(S, W, 33, 0x2e1b2138);
														
 
															+	RNDr(S, W, 34, 0x4d2c6dfc);
														
 
															+	RNDr(S, W, 35, 0x53380d13);
														
 
															+	RNDr(S, W, 36, 0x650a7354);
														
 
															+	RNDr(S, W, 37, 0x766a0abb);
														
 
															+	RNDr(S, W, 38, 0x81c2c92e);
														
 
															+	RNDr(S, W, 39, 0x92722c85);
														
 
															+	RNDr(S, W, 40, 0xa2bfe8a1);
														
 
															+	RNDr(S, W, 41, 0xa81a664b);
														
 
															+	RNDr(S, W, 42, 0xc24b8b70);
														
 
															+	RNDr(S, W, 43, 0xc76c51a3);
														
 
															+	RNDr(S, W, 44, 0xd192e819);
														
 
															+	RNDr(S, W, 45, 0xd6990624);
														
 
															+	RNDr(S, W, 46, 0xf40e3585);
														
 
															+	RNDr(S, W, 47, 0x106aa070);
														
 
															+	RNDr(S, W, 48, 0x19a4c116);
														
 
															+	RNDr(S, W, 49, 0x1e376c08);
														
 
															+	RNDr(S, W, 50, 0x2748774c);
														
 
															+	RNDr(S, W, 51, 0x34b0bcb5);
														
 
															+	RNDr(S, W, 52, 0x391c0cb3);
														
 
															+	RNDr(S, W, 53, 0x4ed8aa4a);
														
 
															+	RNDr(S, W, 54, 0x5b9cca4f);
														
 
															+	RNDr(S, W, 55, 0x682e6ff3);
														
 
															+	RNDr(S, W, 56, 0x748f82ee);
														
 
															+	RNDr(S, W, 57, 0x78a5636f);
														
 
															+	RNDr(S, W, 58, 0x84c87814);
														
 
															+	RNDr(S, W, 59, 0x8cc70208);
														
 
															+	RNDr(S, W, 60, 0x90befffa);
														
 
															+	RNDr(S, W, 61, 0xa4506ceb);
														
 
															+	RNDr(S, W, 62, 0xbef9a3f7);
														
 
															+	RNDr(S, W, 63, 0xc67178f2);
														
 
															+
														
 
															+	/* 4. Mix local working variables into global state */
														
 
															+	for (i = 0; i < 8; i++)
														
 
															+		state[i] += S[i];
														
 
															+}
														
 
															+
														
 
															+static inline void
														
 
															+SHA256_InitState(uint32_t * state)
														
 
															+{
														
 
															+	/* Magic initialization constants */
														
 
															+	state[0] = 0x6A09E667;
														
 
															+	state[1] = 0xBB67AE85;
														
 
															+	state[2] = 0x3C6EF372;
														
 
															+	state[3] = 0xA54FF53A;
														
 
															+	state[4] = 0x510E527F;
														
 
															+	state[5] = 0x9B05688C;
														
 
															+	state[6] = 0x1F83D9AB;
														
 
															+	state[7] = 0x5BE0CD19;
														
 
															+}
														
 
															+
														
 
															+static const uint32_t passwdpad[12] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80020000};
														
 
															+static const uint32_t outerpad[8] = {0x80000000, 0, 0, 0, 0, 0, 0, 0x00000300};
														
 
															+
														
 
															+/**
														
 
															+ * PBKDF2_SHA256(passwd, passwdlen, salt, saltlen, c, buf, dkLen):
														
 
															+ * Compute PBKDF2(passwd, salt, c, dkLen) using HMAC-SHA256 as the PRF, and
														
 
															+ * write the output to buf.  The value dkLen must be at most 32 * (2^32 - 1).
														
 
															+ */
														
 
															+static inline void
														
 
															+PBKDF2_SHA256_80_128(const uint32_t * passwd, uint32_t * buf)
														
 
															+{
														
 
															+	SHA256_CTX PShictx, PShoctx;
														
 
															+	uint32_t tstate[8];
														
 
															+	uint32_t ihash[8];
														
 
															+	uint32_t i;
														
 
															+	uint32_t pad[16];
														
 
															+	
														
 
															+	static const uint32_t innerpad[11] = {0x00000080, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xa0040000};
														
 
															+
														
 
															+	/* If Klen > 64, the key is really SHA256(K). */
														
 
															+	SHA256_InitState(tstate);
														
 
															+	SHA256_Transform(tstate, passwd, 1);
														
 
															+	memcpy(pad, passwd+16, 16);
														
 
															+	memcpy(pad+4, passwdpad, 48);
														
 
															+	SHA256_Transform(tstate, pad, 1);
														
 
															+	memcpy(ihash, tstate, 32);
														
 
															+
														
 
															+	SHA256_InitState(PShictx.state);
														
 
															+	for (i = 0; i < 8; i++)
														
 
															+		pad[i] = ihash[i] ^ 0x36363636;
														
 
															+	for (; i < 16; i++)
														
 
															+		pad[i] = 0x36363636;
														
 
															+	SHA256_Transform(PShictx.state, pad, 0);
														
 
															+	SHA256_Transform(PShictx.state, passwd, 1);
														
 
															+	be32enc_vect(PShictx.buf, passwd+16, 4);
														
 
															+	be32enc_vect(PShictx.buf+5, innerpad, 11);
														
 
															+
														
 
															+	SHA256_InitState(PShoctx.state);
														
 
															+	for (i = 0; i < 8; i++)
														
 
															+		pad[i] = ihash[i] ^ 0x5c5c5c5c;
														
 
															+	for (; i < 16; i++)
														
 
															+		pad[i] = 0x5c5c5c5c;
														
 
															+	SHA256_Transform(PShoctx.state, pad, 0);
														
 
															+	memcpy(PShoctx.buf+8, outerpad, 32);
														
 
															+
														
 
															+	/* Iterate through the blocks. */
														
 
															+	for (i = 0; i < 4; i++) {
														
 
															+		uint32_t istate[8];
														
 
															+		uint32_t ostate[8];
														
 
															+		
														
 
															+		memcpy(istate, PShictx.state, 32);
														
 
															+		PShictx.buf[4] = i + 1;
														
 
															+		SHA256_Transform(istate, PShictx.buf, 0);
														
 
															+		memcpy(PShoctx.buf, istate, 32);
														
 
															+
														
 
															+		memcpy(ostate, PShoctx.state, 32);
														
 
															+		SHA256_Transform(ostate, PShoctx.buf, 0);
														
 
															+		be32enc_vect(buf+i*8, ostate, 8);
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															+
														
 
															+static inline uint32_t
														
 
															+PBKDF2_SHA256_80_128_32(const uint32_t * passwd, const uint32_t * salt)
														
 
															+{
														
 
															+	uint32_t tstate[8];
														
 
															+	uint32_t ostate[8];
														
 
															+	uint32_t ihash[8];
														
 
															+	uint32_t i;
														
 
															+
														
 
															+	/* Compute HMAC state after processing P and S. */
														
 
															+	uint32_t pad[16];
														
 
															+	
														
 
															+	static const uint32_t ihash_finalblk[16] = {0x00000001,0x80000000,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0x00000620};
														
 
															+
														
 
															+	/* If Klen > 64, the key is really SHA256(K). */
														
 
															+	SHA256_InitState(tstate);
														
 
															+	SHA256_Transform(tstate, passwd, 1);
														
 
															+	memcpy(pad, passwd+16, 16);
														
 
															+	memcpy(pad+4, passwdpad, 48);
														
 
															+	SHA256_Transform(tstate, pad, 1);
														
 
															+	memcpy(ihash, tstate, 32);
														
 
															+
														
 
															+	SHA256_InitState(ostate);
														
 
															+	for (i = 0; i < 8; i++)
														
 
															+		pad[i] = ihash[i] ^ 0x5c5c5c5c;
														
 
															+	for (; i < 16; i++)
														
 
															+		pad[i] = 0x5c5c5c5c;
														
 
															+	SHA256_Transform(ostate, pad, 0);
														
 
															+
														
 
															+	SHA256_InitState(tstate);
														
 
															+	for (i = 0; i < 8; i++)
														
 
															+		pad[i] = ihash[i] ^ 0x36363636;
														
 
															+	for (; i < 16; i++)
														
 
															+		pad[i] = 0x36363636;
														
 
															+	SHA256_Transform(tstate, pad, 0);
														
 
															+	SHA256_Transform(tstate, salt, 1);
														
 
															+	SHA256_Transform(tstate, salt+16, 1);
														
 
															+	SHA256_Transform(tstate, ihash_finalblk, 0);
														
 
															+	memcpy(pad, tstate, 32);
														
 
															+	memcpy(pad+8, outerpad, 32);
														
 
															+
														
 
															+	/* Feed the inner hash to the outer SHA256 operation. */
														
 
															+	SHA256_Transform(ostate, pad, 0);
														
 
															+	/* Finish the outer SHA256 operation. */
														
 
															+	return byteswap(ostate[7]);
														
 
															+}
														
 
															+
														
 
															+
														
 
															+/**
														
 
															+ * salsa20_8(B):
														
 
															+ * Apply the salsa20/8 core to the provided block.
														
 
															+ */
														
 
															+static inline void
														
 
															+salsa20_8(uint32_t B[16], const uint32_t Bx[16])
														
 
															+{
														
 
															+	uint32_t x00,x01,x02,x03,x04,x05,x06,x07,x08,x09,x10,x11,x12,x13,x14,x15;
														
 
															+	size_t i;
														
 
															+
														
 
															+	x00 = (B[ 0] ^= Bx[ 0]);
														
 
															+	x01 = (B[ 1] ^= Bx[ 1]);
														
 
															+	x02 = (B[ 2] ^= Bx[ 2]);
														
 
															+	x03 = (B[ 3] ^= Bx[ 3]);
														
 
															+	x04 = (B[ 4] ^= Bx[ 4]);
														
 
															+	x05 = (B[ 5] ^= Bx[ 5]);
														
 
															+	x06 = (B[ 6] ^= Bx[ 6]);
														
 
															+	x07 = (B[ 7] ^= Bx[ 7]);
														
 
															+	x08 = (B[ 8] ^= Bx[ 8]);
														
 
															+	x09 = (B[ 9] ^= Bx[ 9]);
														
 
															+	x10 = (B[10] ^= Bx[10]);
														
 
															+	x11 = (B[11] ^= Bx[11]);
														
 
															+	x12 = (B[12] ^= Bx[12]);
														
 
															+	x13 = (B[13] ^= Bx[13]);
														
 
															+	x14 = (B[14] ^= Bx[14]);
														
 
															+	x15 = (B[15] ^= Bx[15]);
														
 
															+	for (i = 0; i < 8; i += 2) {
														
 
															+#define R(a,b) (((a) << (b)) | ((a) >> (32 - (b))))
														
 
															+		/* Operate on columns. */
														
 
															+		x04 ^= R(x00+x12, 7);	x09 ^= R(x05+x01, 7);	x14 ^= R(x10+x06, 7);	x03 ^= R(x15+x11, 7);
														
 
															+		x08 ^= R(x04+x00, 9);	x13 ^= R(x09+x05, 9);	x02 ^= R(x14+x10, 9);	x07 ^= R(x03+x15, 9);
														
 
															+		x12 ^= R(x08+x04,13);	x01 ^= R(x13+x09,13);	x06 ^= R(x02+x14,13);	x11 ^= R(x07+x03,13);
														
 
															+		x00 ^= R(x12+x08,18);	x05 ^= R(x01+x13,18);	x10 ^= R(x06+x02,18);	x15 ^= R(x11+x07,18);
														
 
															+
														
 
															+		/* Operate on rows. */
														
 
															+		x01 ^= R(x00+x03, 7);	x06 ^= R(x05+x04, 7);	x11 ^= R(x10+x09, 7);	x12 ^= R(x15+x14, 7);
														
 
															+		x02 ^= R(x01+x00, 9);	x07 ^= R(x06+x05, 9);	x08 ^= R(x11+x10, 9);	x13 ^= R(x12+x15, 9);
														
 
															+		x03 ^= R(x02+x01,13);	x04 ^= R(x07+x06,13);	x09 ^= R(x08+x11,13);	x14 ^= R(x13+x12,13);
														
 
															+		x00 ^= R(x03+x02,18);	x05 ^= R(x04+x07,18);	x10 ^= R(x09+x08,18);	x15 ^= R(x14+x13,18);
														
 
															+#undef R
														
 
															+	}
														
 
															+	B[ 0] += x00;
														
 
															+	B[ 1] += x01;
														
 
															+	B[ 2] += x02;
														
 
															+	B[ 3] += x03;
														
 
															+	B[ 4] += x04;
														
 
															+	B[ 5] += x05;
														
 
															+	B[ 6] += x06;
														
 
															+	B[ 7] += x07;
														
 
															+	B[ 8] += x08;
														
 
															+	B[ 9] += x09;
														
 
															+	B[10] += x10;
														
 
															+	B[11] += x11;
														
 
															+	B[12] += x12;
														
 
															+	B[13] += x13;
														
 
															+	B[14] += x14;
														
 
															+	B[15] += x15;
														
 
															+}
														
 
															+
														
 
															+/* cpu and memory intensive function to transform a 80 byte buffer into a 32 byte output
														
 
															+   scratchpad size needs to be at least 63 + (128 * r * p) + (256 * r + 64) + (128 * r * N) bytes
														
 
															+ */
														
 
															+static uint32_t scrypt_1024_1_1_256_sp(const uint32_t* input, char* scratchpad)
														
 
															+{
														
 
															+	uint32_t * V;
														
 
															+	uint32_t X[32];
														
 
															+	uint32_t i;
														
 
															+	uint32_t j;
														
 
															+	uint32_t k;
														
 
															+	uint64_t *p1, *p2;
														
 
															+
														
 
															+	p1 = (uint64_t *)X;
														
 
															+	V = (uint32_t *)(((uintptr_t)(scratchpad) + 63) & ~ (uintptr_t)(63));
														
 
															+
														
 
															+	PBKDF2_SHA256_80_128(input, X);
														
 
															+
														
 
															+	for (i = 0; i < 1024; i += 2) {
														
 
															+		memcpy(&V[i * 32], X, 128);
														
 
															+
														
 
															+		salsa20_8(&X[0], &X[16]);
														
 
															+		salsa20_8(&X[16], &X[0]);
														
 
															+
														
 
															+		memcpy(&V[(i + 1) * 32], X, 128);
														
 
															+
														
 
															+		salsa20_8(&X[0], &X[16]);
														
 
															+		salsa20_8(&X[16], &X[0]);
														
 
															+	}
														
 
															+	for (i = 0; i < 1024; i += 2) {
														
 
															+		j = X[16] & 1023;
														
 
															+		p2 = (uint64_t *)(&V[j * 32]);
														
 
															+		for(k = 0; k < 16; k++)
														
 
															+			p1[k] ^= p2[k];
														
 
															+
														
 
															+		salsa20_8(&X[0], &X[16]);
														
 
															+		salsa20_8(&X[16], &X[0]);
														
 
															+
														
 
															+		j = X[16] & 1023;
														
 
															+		p2 = (uint64_t *)(&V[j * 32]);
														
 
															+		for(k = 0; k < 16; k++)
														
 
															+			p1[k] ^= p2[k];
														
 
															+
														
 
															+		salsa20_8(&X[0], &X[16]);
														
 
															+		salsa20_8(&X[16], &X[0]);
														
 
															+	}
														
 
															+
														
 
															+	return PBKDF2_SHA256_80_128_32(input, X);
														
 
															+}
														
 
															+
														
 
															+bool scanhash_scrypt(struct thr_info *thr, const unsigned char *pmidstate, unsigned char *pdata,
														
 
															+	unsigned char *phash1, unsigned char *phash,
														
 
															+	const unsigned char *ptarget,
														
 
															+	uint32_t max_nonce, uint32_t *last_nonce,
														
 
															+	uint32_t n)
														
 
															+{
														
 
															+	uint32_t *nonce = (uint32_t *)(pdata + 76);
														
 
															+	unsigned char *scratchbuf;
														
 
															+	uint32_t data[20];
														
 
															+	uint32_t tmp_hash7;
														
 
															+	uint32_t Htarg = ((const uint32_t *)ptarget)[7];
														
 
															+	bool ret = false;
														
 
															+	int i;
														
 
															+
														
 
															+	be32enc_vect(data, (const uint32_t *)pdata, 19);
														
 
															+
														
 
															+	scratchbuf = malloc(131583);
														
 
															+	if (unlikely(!scratchbuf)) {
														
 
															+		applog(LOG_ERR, "Failed to malloc scratchbuf in scanhash_scrypt");
														
 
															+		return ret;
														
 
															+	}
														
 
															+
														
 
															+	while(1) {
														
 
															+		*nonce = ++n;
														
 
															+		data[19] = n;
														
 
															+		tmp_hash7 = scrypt_1024_1_1_256_sp(data, scratchbuf);
														
 
															+
														
 
															+		if (unlikely(tmp_hash7 <= Htarg)) {
														
 
															+			((uint32_t *)pdata)[19] = byteswap(n);
														
 
															+			*last_nonce = n;
														
 
															+			ret = true;
														
 
															+			break;
														
 
															+		}
														
 
															+
														
 
															+		if (unlikely((n >= max_nonce) || thr->work_restart)) {
														
 
															+			*last_nonce = n;
														
 
															+			break;
														
 
															+		}
														
 
															+	}
														
 
															+out_ret:
														
 
															+	free(scratchbuf);;
														
 
															+	return ret;
														
 
															+}
														
 
															+
														
--- a/scrypt120724.cl
+++ b/scrypt120724.cl
@@ -0,0 +1,757 @@
 
															+#define rotl(x,y) rotate(x,y)

														
 
															+#define Ch(x,y,z) bitselect(z,y,x)

														
 
															+#define Maj(x,y,z) Ch((x^z),y,z)

														
 
															+

														
 
															+#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U))

														
 
															+

														
 
															+#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))

														
 
															+#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))

														
 
															+#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))

														
 
															+#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))

														
 
															+

														
 
															+#define RND(a, b, c, d, e, f, g, h, k)			\

														
 
															+	h += Tr1(e) + Ch(e, f, g) + k;		\

														
 
															+	d += h;					\

														
 
															+	h += Tr2(a) + Maj(a, b, c);

														
 
															+

														
 
															+void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)

														
 
															+{

														
 
															+	uint4 S0 = *state0;

														
 
															+	uint4 S1 = *state1;

														
 
															+	

														
 
															+#define A S0.x

														
 
															+#define B S0.y

														
 
															+#define C S0.z

														
 
															+#define D S0.w

														
 
															+#define E S1.x

														
 
															+#define F S1.y

														
 
															+#define G S1.z

														
 
															+#define H S1.w

														
 
															+

														
 
															+	uint4 W[4];

														
 
															+

														
 
															+	W[ 0].x = block0.x;

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U);

														
 
															+	W[ 0].y = block0.y;

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U);

														
 
															+	W[ 0].z = block0.z;

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU);

														
 
															+	W[ 0].w = block0.w;

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U);

														
 
															+

														
 
															+	W[ 1].x = block1.x;

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU);

														
 
															+	W[ 1].y = block1.y;

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U);

														
 
															+	W[ 1].z = block1.z;

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U);

														
 
															+	W[ 1].w = block1.w;

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U);

														
 
															+

														
 
															+	W[ 2].x = block2.x;

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U);

														
 
															+	W[ 2].y = block2.y;

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U);

														
 
															+	W[ 2].z = block2.z;

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU);

														
 
															+	W[ 2].w = block2.w;

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U);

														
 
															+

														
 
															+	W[ 3].x = block3.x;

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U);

														
 
															+	W[ 3].y = block3.y;

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU);

														
 
															+	W[ 3].z = block3.z;

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U);

														
 
															+	W[ 3].w = block3.w;

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U);

														
 
															+

														
 
															+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U);

														
 
															+

														
 
															+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U);

														
 
															+

														
 
															+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U);

														
 
															+

														
 
															+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU);

														
 
															+

														
 
															+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU);

														
 
															+

														
 
															+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU);

														
 
															+

														
 
															+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU);

														
 
															+

														
 
															+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU);

														
 
															+

														
 
															+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U);

														
 
															+

														
 
															+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU);

														
 
															+

														
 
															+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U);

														
 
															+

														
 
															+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U);

														
 
															+

														
 
															+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U);

														
 
															+

														
 
															+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U);

														
 
															+

														
 
															+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U);

														
 
															+

														
 
															+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U);

														
 
															+

														
 
															+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U);

														
 
															+

														
 
															+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U);

														
 
															+

														
 
															+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU);

														
 
															+

														
 
															+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U);

														
 
															+

														
 
															+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U);

														
 
															+

														
 
															+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU);

														
 
															+

														
 
															+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU);

														
 
															+

														
 
															+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U);

														
 
															+

														
 
															+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U);

														
 
															+

														
 
															+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU);

														
 
															+

														
 
															+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U);

														
 
															+

														
 
															+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U);

														
 
															+

														
 
															+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U);

														
 
															+

														
 
															+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U);

														
 
															+

														
 
															+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U);

														
 
															+

														
 
															+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U);

														
 
															+

														
 
															+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U);

														
 
															+

														
 
															+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U);

														
 
															+

														
 
															+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU);

														
 
															+

														
 
															+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U);

														
 
															+

														
 
															+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U);

														
 
															+

														
 
															+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU);

														
 
															+

														
 
															+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU);

														
 
															+

														
 
															+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U);

														
 
															+

														
 
															+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU);

														
 
															+

														
 
															+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU);

														
 
															+

														
 
															+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U);

														
 
															+

														
 
															+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U);

														
 
															+

														
 
															+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU);

														
 
															+

														
 
															+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU);

														
 
															+

														
 
															+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U);

														
 
															+

														
 
															+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U);

														
 
															+	

														
 
															+#undef A

														
 
															+#undef B

														
 
															+#undef C

														
 
															+#undef D

														
 
															+#undef E

														
 
															+#undef F

														
 
															+#undef G

														
 
															+#undef H

														
 
															+

														
 
															+	*state0 += S0;

														
 
															+	*state1 += S1;

														
 
															+}

														
 
															+

														
 
															+void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)

														
 
															+{

														
 
															+#define A (*state0).x

														
 
															+#define B (*state0).y

														
 
															+#define C (*state0).z

														
 
															+#define D (*state0).w

														
 
															+#define E (*state1).x

														
 
															+#define F (*state1).y

														
 
															+#define G (*state1).z

														
 
															+#define H (*state1).w

														
 
															+

														
 
															+	uint4 W[4];

														
 
															+

														
 
															+	W[0].x = block0.x;

														
 
															+	D=0x98c7e2a2U+W[0].x;

														
 
															+	H=0xfc08884dU+W[0].x;

														
 
															+

														
 
															+	W[0].y = block0.y;

														
 
															+	C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y;

														
 
															+	G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U);

														
 
															+

														
 
															+	W[0].z = block0.z;

														
 
															+	B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z;

														
 
															+	F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U);

														
 
															+

														
 
															+	W[0].w = block0.w;

														
 
															+	A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; 

														
 
															+	E=0x95F61999U+A+Tr2(F)+Maj(F,G,H);

														
 
															+

														
 
															+	W[1].x = block1.x;

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU);

														
 
															+	W[1].y = block1.y;

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U);

														
 
															+	W[1].z = block1.z;

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U);

														
 
															+	W[1].w = block1.w;

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U);

														
 
															+	

														
 
															+	W[2].x = block2.x;

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U);

														
 
															+	W[2].y = block2.y;

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U);

														
 
															+	W[2].z = block2.z;

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU);

														
 
															+	W[2].w = block2.w;

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U);

														
 
															+	

														
 
															+	W[3].x = block3.x;

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U);

														
 
															+	W[3].y = block3.y;

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU);

														
 
															+	W[3].z = block3.z;

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U);

														
 
															+	W[3].w = block3.w;

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U);

														
 
															+

														
 
															+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U);

														
 
															+

														
 
															+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U);

														
 
															+

														
 
															+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U);

														
 
															+

														
 
															+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU);

														
 
															+

														
 
															+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU);

														
 
															+

														
 
															+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU);

														
 
															+

														
 
															+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU);

														
 
															+

														
 
															+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU);

														
 
															+

														
 
															+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U);

														
 
															+

														
 
															+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU);

														
 
															+

														
 
															+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U);

														
 
															+

														
 
															+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U);

														
 
															+

														
 
															+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U);

														
 
															+

														
 
															+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U);

														
 
															+

														
 
															+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U);

														
 
															+

														
 
															+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U);

														
 
															+

														
 
															+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U);

														
 
															+

														
 
															+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U);

														
 
															+

														
 
															+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU);

														
 
															+

														
 
															+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U);

														
 
															+

														
 
															+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U);

														
 
															+

														
 
															+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU);

														
 
															+

														
 
															+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU);

														
 
															+

														
 
															+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U);

														
 
															+

														
 
															+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U);

														
 
															+

														
 
															+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU);

														
 
															+

														
 
															+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U);

														
 
															+

														
 
															+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U);

														
 
															+

														
 
															+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U);

														
 
															+

														
 
															+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U);

														
 
															+

														
 
															+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U);

														
 
															+

														
 
															+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U);

														
 
															+

														
 
															+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U);

														
 
															+

														
 
															+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U);

														
 
															+

														
 
															+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU);

														
 
															+

														
 
															+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U);

														
 
															+

														
 
															+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U);

														
 
															+

														
 
															+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU);

														
 
															+

														
 
															+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU);

														
 
															+

														
 
															+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U);

														
 
															+

														
 
															+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);

														
 
															+	RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU);

														
 
															+

														
 
															+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);

														
 
															+	RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU);

														
 
															+

														
 
															+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);

														
 
															+	RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U);

														
 
															+

														
 
															+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);

														
 
															+	RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U);

														
 
															+

														
 
															+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);

														
 
															+	RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU);

														
 
															+

														
 
															+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);

														
 
															+	RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU);

														
 
															+

														
 
															+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);

														
 
															+	RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U);

														
 
															+

														
 
															+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);

														
 
															+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U);

														
 
															+	

														
 
															+#undef A

														
 
															+#undef B

														
 
															+#undef C

														
 
															+#undef D

														
 
															+#undef E

														
 
															+#undef F

														
 
															+#undef G

														
 
															+#undef H

														
 
															+

														
 
															+	*state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU);

														
 
															+	*state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U);

														
 
															+}

														
 
															+

														
 
															+__constant uint fixedW[64] =

														
 
															+{

														
 
															+	0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,

														
 
															+	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,

														
 
															+	0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,

														
 
															+	0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,

														
 
															+	0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,

														
 
															+	0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,

														
 
															+	0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,

														
 
															+	0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,

														
 
															+};

														
 
															+

														
 
															+void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)

														
 
															+{

														
 
															+	uint4 S0 = *state0;

														
 
															+	uint4 S1 = *state1;

														
 
															+

														
 
															+#define A S0.x

														
 
															+#define B S0.y

														
 
															+#define C S0.z

														
 
															+#define D S0.w

														
 
															+#define E S1.x

														
 
															+#define F S1.y

														
 
															+#define G S1.z

														
 
															+#define H S1.w

														
 
															+

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[0]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[1]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[2]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[3]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[4]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[5]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[6]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[7]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[8]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[9]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[10]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[11]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[12]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[13]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[14]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[15]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[16]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[17]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[18]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[19]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[20]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[21]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[22]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[23]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[24]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[25]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[26]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[27]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[28]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[29]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[30]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[31]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[32]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[33]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[34]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[35]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[36]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[37]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[38]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[39]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[40]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[41]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[42]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[43]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[44]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[45]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[46]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[47]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[48]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[49]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[50]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[51]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[52]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[53]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[54]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[55]);

														
 
															+	RND(A,B,C,D,E,F,G,H, fixedW[56]);

														
 
															+	RND(H,A,B,C,D,E,F,G, fixedW[57]);

														
 
															+	RND(G,H,A,B,C,D,E,F, fixedW[58]);

														
 
															+	RND(F,G,H,A,B,C,D,E, fixedW[59]);

														
 
															+	RND(E,F,G,H,A,B,C,D, fixedW[60]);

														
 
															+	RND(D,E,F,G,H,A,B,C, fixedW[61]);

														
 
															+	RND(C,D,E,F,G,H,A,B, fixedW[62]);

														
 
															+	RND(B,C,D,E,F,G,H,A, fixedW[63]);

														
 
															+	

														
 
															+#undef A

														
 
															+#undef B

														
 
															+#undef C

														
 
															+#undef D

														
 
															+#undef E

														
 
															+#undef F

														
 
															+#undef G

														
 
															+#undef H

														
 
															+	*state0 += S0;

														
 
															+	*state1 += S1;

														
 
															+}

														
 
															+

														
 
															+void shittify(uint4 B[8])

														
 
															+{

														
 
															+	uint4 tmp[4];

														
 
															+	tmp[0] = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);

														
 
															+	tmp[1] = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);

														
 
															+	tmp[2] = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);

														
 
															+	tmp[3] = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);

														
 
															+	

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		B[i] = EndianSwap(tmp[i]);

														
 
															+

														
 
															+	tmp[0] = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);

														
 
															+	tmp[1] = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);

														
 
															+	tmp[2] = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);

														
 
															+	tmp[3] = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);

														
 
															+	

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		B[i+4] = EndianSwap(tmp[i]);

														
 
															+}

														
 
															+

														
 
															+void unshittify(uint4 B[8])

														
 
															+{

														
 
															+	uint4 tmp[4];

														
 
															+	tmp[0] = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);

														
 
															+	tmp[1] = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);

														
 
															+	tmp[2] = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);

														
 
															+	tmp[3] = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);

														
 
															+	

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		B[i] = EndianSwap(tmp[i]);

														
 
															+

														
 
															+	tmp[0] = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);

														
 
															+	tmp[1] = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);

														
 
															+	tmp[2] = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);

														
 
															+	tmp[3] = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);

														
 
															+	

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		B[i+4] = EndianSwap(tmp[i]);

														
 
															+}

														
 
															+

														
 
															+void salsa(uint4 B[8])

														
 
															+{

														
 
															+	uint4 w[4];

														
 
															+

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		w[i] = (B[i]^=B[i+4]);

														
 
															+

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+	{

														
 
															+		w[0] ^= rotl(w[3]     +w[2]     , 7U);

														
 
															+		w[1] ^= rotl(w[0]     +w[3]     , 9U);

														
 
															+		w[2] ^= rotl(w[1]     +w[0]     ,13U);

														
 
															+		w[3] ^= rotl(w[2]     +w[1]     ,18U);

														
 
															+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);

														
 
															+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);

														
 
															+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);

														
 
															+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);

														
 
															+	}

														
 
															+

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		w[i] = (B[i+4]^=(B[i]+=w[i]));

														
 
															+

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+	{

														
 
															+		w[0] ^= rotl(w[3]     +w[2]     , 7U);

														
 
															+		w[1] ^= rotl(w[0]     +w[3]     , 9U);

														
 
															+		w[2] ^= rotl(w[1]     +w[0]     ,13U);

														
 
															+		w[3] ^= rotl(w[2]     +w[1]     ,18U);

														
 
															+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);

														
 
															+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);

														
 
															+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);

														
 
															+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);

														
 
															+	}

														
 
															+

														
 
															+#pragma unroll

														
 
															+	for(uint i=0; i<4; ++i)

														
 
															+		B[i+4] += w[i];

														
 
															+}

														
 
															+

														
 
															+#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)

														
 
															+#define CO Coord(z,x,y)

														
 
															+

														
 
															+void scrypt_core(uint4 X[8], __global uint4*restrict lookup)

														
 
															+{

														
 
															+	shittify(X);

														
 
															+	const uint zSIZE = 8;

														
 
															+	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));

														
 
															+	const uint xSIZE = CONCURRENT_THREADS;

														
 
															+	uint x = get_global_id(0)%xSIZE;

														
 
															+

														
 
															+	for(uint y=0; y<1024/LOOKUP_GAP; ++y)

														
 
															+	{

														
 
															+#pragma unroll

														
 
															+		for(uint z=0; z<zSIZE; ++z)

														
 
															+			lookup[CO] = X[z];

														
 
															+		for(uint i=0; i<LOOKUP_GAP; ++i) 

														
 
															+			salsa(X);

														
 
															+	}

														
 
															+#if (LOOKUP_GAP != 1) && (LOOKUP_GAP != 2) && (LOOKUP_GAP != 4) && (LOOKUP_GAP != 8)

														
 
															+	{

														
 
															+		uint y = (1024/LOOKUP_GAP);

														
 
															+#pragma unroll

														
 
															+		for(uint z=0; z<zSIZE; ++z)

														
 
															+			lookup[CO] = X[z];

														
 
															+		for(uint i=0; i<1024%LOOKUP_GAP; ++i)

														
 
															+			salsa(X); 

														
 
															+	}

														
 
															+#endif

														
 
															+	for (uint i=0; i<1024; ++i) 

														
 
															+	{

														
 
															+		uint4 V[8];

														
 
															+		uint j = X[7].x & 0x3FF;

														
 
															+		uint y = (j/LOOKUP_GAP);

														
 
															+#pragma unroll

														
 
															+		for(uint z=0; z<zSIZE; ++z)

														
 
															+			V[z] = lookup[CO];

														
 
															+

														
 
															+#if (LOOKUP_GAP == 1)

														
 
															+#elif (LOOKUP_GAP == 2)

														
 
															+		if (j&1)

														
 
															+			salsa(V);

														
 
															+#else

														
 
															+		uint val = j%LOOKUP_GAP;

														
 
															+		for (uint z=0; z<val; ++z) 

														
 
															+			salsa(V);

														
 
															+#endif

														
 
															+

														
 
															+#pragma unroll

														
 
															+		for(uint z=0; z<zSIZE; ++z)

														
 
															+			X[z] ^= V[z];

														
 
															+		salsa(X);

														
 
															+	}

														
 
															+	unshittify(X);

														
 
															+}

														
 
															+

														
 
															+#define FOUND (0x800)

														
 
															+#define NFLAG (0x7FF)

														
 
															+

														
 
															+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))

														
 
															+__kernel void search(__global const uint4 * restrict input,

														
 
															+__global uint*restrict output, __global uint4*restrict padcache,

														
 
															+const uint4 midstate0, const uint4 midstate16, const uint target)

														
 
															+{

														
 
															+	uint gid = get_global_id(0);

														
 
															+	uint4 X[8];

														
 
															+	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;

														
 
															+	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);

														
 
															+	uint4 pad0 = midstate0, pad1 = midstate16;

														
 
															+

														
 
															+	SHA256(&pad0,&pad1, data, (uint4)(0x80000000U,0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x280));

														
 
															+	SHA256_fresh(&ostate0,&ostate1, pad0^0x5C5C5C5CU, pad1^0x5C5C5C5CU, 0x5C5C5C5CU, 0x5C5C5C5CU);

														
 
															+	SHA256_fresh(&tstate0,&tstate1, pad0^0x36363636U, pad1^0x36363636U, 0x36363636U, 0x36363636U);

														
 
															+

														
 
															+	tmp0 = tstate0;

														
 
															+	tmp1 = tstate1;

														
 
															+	SHA256(&tstate0, &tstate1, input[0],input[1],input[2],input[3]);

														
 
															+

														
 
															+#pragma unroll

														
 
															+	for (uint i=0; i<4; i++) 

														
 
															+	{

														
 
															+		pad0 = tstate0;

														
 
															+		pad1 = tstate1;

														
 
															+		X[i*2 ] = ostate0;

														
 
															+		X[i*2+1] = ostate1;

														
 
															+

														
 
															+		SHA256(&pad0,&pad1, data, (uint4)(i+1,0x80000000U,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x4a0U));

														
 
															+		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U));

														
 
															+	}

														
 
															+	scrypt_core(X,padcache);

														
 
															+	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);

														
 
															+	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);

														
 
															+	SHA256_fixed(&tmp0,&tmp1);

														
 
															+	SHA256(&ostate0,&ostate1, tmp0, tmp1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U));

														
 
															+

														
 
															+	bool found = (EndianSwap(ostate1.w) <= target);

														
 
															+	if (found)

														
 
															+		output[FOUND] = output[NFLAG & gid] = gid;

														
 
															+}

														
 
															+

														
 
															+/*-

														
 
															+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,

														
 
															+ * 2012 Con Kolivas.

														
 
															+ * All rights reserved.

														
 
															+ *

														
 
															+ * Redistribution and use in source and binary forms, with or without

														
 
															+ * modification, are permitted provided that the following conditions

														
 
															+ * are met:

														
 
															+ * 1. Redistributions of source code must retain the above copyright

														
 
															+ *    notice, this list of conditions and the following disclaimer.

														
 
															+ * 2. Redistributions in binary form must reproduce the above copyright

														
 
															+ *    notice, this list of conditions and the following disclaimer in the

														
 
															+ *    documentation and/or other materials provided with the distribution.

														
 
															+ *

														
 
															+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND

														
 
															+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

														
 
															+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

														
 
															+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE

														
 
															+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

														
 
															+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS

														
 
															+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

														
 
															+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

														
 
															+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

														
 
															+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

														
 
															+ * SUCH DAMAGE.

														
 
															+ *

														
 
															+ * This file was originally written by Colin Percival as part of the Tarsnap

														
 
															+ * online backup system.

														
 
															+ */