Browse Source

Update phatk kernel to one with new parameters for slightly less overhead again.
Make the queue kernel parameters call a function pointer to select phatk or poclbm.

Con Kolivas 14 years ago
parent
commit
116a9dc025
6 changed files with 496 additions and 439 deletions
  1. 2 2
      Makefile.am
  2. 13 0
      findnonce.c
  3. 53 19
      main.c
  4. 7 5
      miner.h
  5. 2 2
      ocl.c
  6. 419 411
      phatk110722.cl

+ 2 - 2
Makefile.am

@@ -15,7 +15,7 @@ INCLUDES	= $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
 
 
 bin_PROGRAMS	= cgminer
 bin_PROGRAMS	= cgminer
 
 
-bin_SCRIPTS	= phatk110714.cl poclbm110717.cl
+bin_SCRIPTS	= phatk110722.cl poclbm110717.cl
 
 
 cgminer_SOURCES	= elist.h miner.h compat.h			\
 cgminer_SOURCES	= elist.h miner.h compat.h			\
 		  main.c util.c					\
 		  main.c util.c					\
@@ -23,7 +23,7 @@ cgminer_SOURCES	= elist.h miner.h compat.h			\
 		  sha256_generic.c sha256_4way.c sha256_via.c	\
 		  sha256_generic.c sha256_4way.c sha256_via.c	\
 		  sha256_cryptopp.c sha256_sse2_amd64.c		\
 		  sha256_cryptopp.c sha256_sse2_amd64.c		\
 		  sha256_sse4_amd64.c \
 		  sha256_sse4_amd64.c \
-		  phatk110714.cl poclbm110717.cl
+		  phatk110722.cl poclbm110717.cl
 
 
 cgminer_LDFLAGS	= $(PTHREAD_FLAGS)
 cgminer_LDFLAGS	= $(PTHREAD_FLAGS)
 cgminer_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @OPENCL_LIBS@ @NCURSES_LIBS@ @PDCURSES_LIBS@ lib/libgnu.a ccan/libccan.a
 cgminer_LDADD	= @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @OPENCL_LIBS@ @NCURSES_LIBS@ @PDCURSES_LIBS@ lib/libgnu.a ccan/libccan.a

+ 13 - 0
findnonce.c

@@ -72,7 +72,13 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 	blk->cty_a = A;
 	blk->cty_a = A;
 	blk->cty_b = B;
 	blk->cty_b = B;
 	blk->cty_c = C;
 	blk->cty_c = C;
+
+	blk->C1addK5 = C + 0x59f111f1;
+
 	blk->cty_d = D;
 	blk->cty_d = D;
+
+	blk->D1A = D + 0xb956c25b;
+
 	blk->cty_e = E;
 	blk->cty_e = E;
 	blk->cty_f = F;
 	blk->cty_f = F;
 	blk->cty_g = G;
 	blk->cty_g = G;
@@ -94,6 +100,10 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 	blk->W16 = blk->fW0 = data[0] + (rotr(data[1], 7) ^ rotr(data[1], 18) ^ (data[1] >> 3));
 	blk->W16 = blk->fW0 = data[0] + (rotr(data[1], 7) ^ rotr(data[1], 18) ^ (data[1] >> 3));
 	blk->W17 = blk->fW1 = data[1] + (rotr(data[2], 7) ^ rotr(data[2], 18) ^ (data[2] >> 3)) + 0x01100000;
 	blk->W17 = blk->fW1 = data[1] + (rotr(data[2], 7) ^ rotr(data[2], 18) ^ (data[2] >> 3)) + 0x01100000;
 	blk->W2 = data[2];
 	blk->W2 = data[2];
+
+	blk->W2A = blk->W2 + (rotr(blk->W16, 19) ^ rotr(blk->W16, 17) ^ (blk->W16 >> 10));
+	blk->W17_2 = 0x11002000 + (rotr(blk->W17, 19) ^ rotr(blk->W17, 17) ^ (blk->W17 >> 10));
+
 	blk->fW2 = data[2] + (rotr(blk->fW0, 17) ^ rotr(blk->fW0, 19) ^ (blk->fW0 >> 10));
 	blk->fW2 = data[2] + (rotr(blk->fW0, 17) ^ rotr(blk->fW0, 19) ^ (blk->fW0 >> 10));
 	blk->fW3 = 0x11002000 + (rotr(blk->fW1, 17) ^ rotr(blk->fW1, 19) ^ (blk->fW1 >> 10));
 	blk->fW3 = 0x11002000 + (rotr(blk->fW1, 17) ^ rotr(blk->fW1, 19) ^ (blk->fW1 >> 10));
 	blk->fW15 = 0x00000280 + (rotr(blk->fW0, 7) ^ rotr(blk->fW0, 18) ^ (blk->fW0 >> 3));
 	blk->fW15 = 0x00000280 + (rotr(blk->fW0, 7) ^ rotr(blk->fW0, 18) ^ (blk->fW0 >> 3));
@@ -101,6 +111,9 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 
 
 	blk->PreVal4 = blk->fcty_e = E + (rotr(B, 6) ^ rotr(B, 11) ^ rotr(B, 25)) + (D ^ (B & (C ^ D))) + 0xe9b5dba5;
 	blk->PreVal4 = blk->fcty_e = E + (rotr(B, 6) ^ rotr(B, 11) ^ rotr(B, 25)) + (D ^ (B & (C ^ D))) + 0xe9b5dba5;
 	blk->T1 = blk->fcty_e2 = (rotr(F, 2) ^ rotr(F, 13) ^ rotr(F, 22)) + ((F & G) | (H & (F | G)));
 	blk->T1 = blk->fcty_e2 = (rotr(F, 2) ^ rotr(F, 13) ^ rotr(F, 22)) + ((F & G) | (H & (F | G)));
+
+	blk->PreVal4addT1 = blk->PreVal4 + blk->T1;
+	blk->T1substate0 = state[0] - blk->T1;
 }
 }
 
 
 #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10)))
 #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10)))

+ 53 - 19
main.c

@@ -2373,7 +2373,7 @@ enum {
 #ifdef HAVE_OPENCL
 #ifdef HAVE_OPENCL
 static _clState *clStates[16];
 static _clState *clStates[16];
 
 
-static inline cl_int queue_kernel_parameters(_clState *clState, dev_blk_ctx *blk)
+static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk)
 {
 {
 	cl_kernel *kernel = &clState->kernel;
 	cl_kernel *kernel = &clState->kernel;
 	cl_int status = 0;
 	cl_int status = 0;
@@ -2395,24 +2395,51 @@ static inline cl_int queue_kernel_parameters(_clState *clState, dev_blk_ctx *blk
 	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h);
 	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h);
 	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce);
 	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce);
 
 
-	if (clState->hasBitAlign == true) {
-		/* Parameters for phatk kernel */
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W2);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W16);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W17);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal4);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->T1);
-	} else {
-		/* Parameters for poclbm kernel */
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e);
-		status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2);
-	}
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW0);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW1);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW2);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW3);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW15);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fW01r);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->fcty_e2);
+
+	status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer),
+				 (void *)&clState->outputBuffer);
+
+	return status;
+}
+
+static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk)
+{
+	cl_kernel *kernel = &clState->kernel;
+	cl_int status = 0;
+	int num = 0;
+
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_a);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_b);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_c);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_d);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_e);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_f);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_g);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->ctx_h);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_b);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_c);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->C1addK5);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->D1A);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_f);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_g);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->cty_h);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->nonce);
+
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W2A);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W16);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W17);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->W17_2);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->PreVal4addT1);
+	status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->T1substate0);
+
 	status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer),
 	status |= clSetKernelArg(*kernel, num++, sizeof(clState->outputBuffer),
 				 (void *)&clState->outputBuffer);
 				 (void *)&clState->outputBuffer);
 
 
@@ -2432,6 +2459,8 @@ static void set_threads_hashes(unsigned int vectors, unsigned int *threads,
 
 
 static void *gpuminer_thread(void *userdata)
 static void *gpuminer_thread(void *userdata)
 {
 {
+	cl_int (*queue_kernel_parameters)(_clState *, dev_blk_ctx *);
+
 	const unsigned long cycle = opt_log_interval / 5 ? : 1;
 	const unsigned long cycle = opt_log_interval / 5 ? : 1;
 	struct timeval tv_start, tv_end, diff, tv_workstart;
 	struct timeval tv_start, tv_end, diff, tv_workstart;
 	struct thr_info *mythr = userdata;
 	struct thr_info *mythr = userdata;
@@ -2459,6 +2488,11 @@ static void *gpuminer_thread(void *userdata)
 	bool requested = true;
 	bool requested = true;
 	uint32_t total_hashes = 0, hash_div = 1;
 	uint32_t total_hashes = 0, hash_div = 1;
 
 
+	if (clState->hasBitAlign)
+		queue_kernel_parameters = &queue_phatk_kernel;
+	else
+		queue_kernel_parameters = &queue_poclbm_kernel;
+
 	if (opt_dynamic) {
 	if (opt_dynamic) {
 		/* Minimise impact on desktop if we want dynamic mode */
 		/* Minimise impact on desktop if we want dynamic mode */
 		setpriority(PRIO_PROCESS, 0, 19);
 		setpriority(PRIO_PROCESS, 0, 19);

+ 7 - 5
miner.h

@@ -249,15 +249,17 @@ extern pthread_mutex_t control_lock;
 
 
 #ifdef HAVE_OPENCL
 #ifdef HAVE_OPENCL
 typedef struct {
 typedef struct {
-    cl_uint ctx_a; cl_uint ctx_b; cl_uint ctx_c; cl_uint ctx_d;
-    cl_uint ctx_e; cl_uint ctx_f; cl_uint ctx_g; cl_uint ctx_h;
-    cl_uint cty_a; cl_uint cty_b; cl_uint cty_c; cl_uint cty_d;
-    cl_uint cty_e; cl_uint cty_f; cl_uint cty_g; cl_uint cty_h;
-    cl_uint merkle; cl_uint ntime; cl_uint nbits; cl_uint nonce;
+	cl_uint ctx_a; cl_uint ctx_b; cl_uint ctx_c; cl_uint ctx_d;
+	cl_uint ctx_e; cl_uint ctx_f; cl_uint ctx_g; cl_uint ctx_h;
+	cl_uint cty_a; cl_uint cty_b; cl_uint cty_c; cl_uint cty_d;
+	cl_uint cty_e; cl_uint cty_f; cl_uint cty_g; cl_uint cty_h;
+	cl_uint merkle; cl_uint ntime; cl_uint nbits; cl_uint nonce;
 	cl_uint fW0; cl_uint fW1; cl_uint fW2; cl_uint fW3; cl_uint fW15;
 	cl_uint fW0; cl_uint fW1; cl_uint fW2; cl_uint fW3; cl_uint fW15;
 	cl_uint fW01r; cl_uint fcty_e; cl_uint fcty_e2;
 	cl_uint fW01r; cl_uint fcty_e; cl_uint fcty_e2;
 	cl_uint W16; cl_uint W17; cl_uint W2;
 	cl_uint W16; cl_uint W17; cl_uint W2;
 	cl_uint PreVal4; cl_uint T1;
 	cl_uint PreVal4; cl_uint T1;
+	cl_uint C1addK5; cl_uint D1A; cl_uint W2A; cl_uint W17_2;
+	cl_uint PreVal4addT1; cl_uint T1substate0;
 } dev_blk_ctx;
 } dev_blk_ctx;
 #else
 #else
 typedef struct {
 typedef struct {

+ 2 - 2
ocl.c

@@ -341,7 +341,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 	char numbuf[10];
 	char numbuf[10];
 	char filename[16];
 	char filename[16];
 	if (clState->hasBitAlign)
 	if (clState->hasBitAlign)
-		strcpy(filename, "phatk110714.cl");
+		strcpy(filename, "phatk110722.cl");
 	else
 	else
 		strcpy(filename, "poclbm110717.cl");
 		strcpy(filename, "poclbm110717.cl");
 	FILE *binaryfile;
 	FILE *binaryfile;
@@ -370,7 +370,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
 
 	strcpy(binaryfilename, name);
 	strcpy(binaryfilename, name);
 	if (clState->hasBitAlign) {
 	if (clState->hasBitAlign) {
-		strcat(binaryfilename, "phatk110714");
+		strcat(binaryfilename, "phatk110722");
 		strcat(binaryfilename, "bitalign");
 		strcat(binaryfilename, "bitalign");
 	} else
 	} else
 		strcat(binaryfilename, "poclbm110717");
 		strcat(binaryfilename, "poclbm110717");

+ 419 - 411
phatk110714.cl → phatk110722.cl

@@ -1,411 +1,419 @@
-// This file is taken and modified from the public-domain poclbm project, and
-// we have therefore decided to keep it public-domain in Phoenix.
-
-// 2011-07-11: further modified by Diapolo and still public-domain
-// -ck version to be compatible with cgminer
-// 2011-07-14: shorter code
-
-#define VECTORSX
-
-#ifdef VECTORS4
-	typedef uint4 u;
-#elif defined VECTORS2
-	typedef uint2 u;
-#else
-	typedef uint u;
-#endif
-
-__constant uint K[64] = { 
-    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-// H[6] =  0x08909ae5U + 0xb0edbdd0 + K[0] == 0xfc08884d
-// H[7] = -0x5be0cd19 - (0x90befffa) K[60] == -0xec9fcd13
-__constant uint H[8] = { 
-	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0xfc08884d, 0xec9fcd13
-};
-
-// L = 0xa54ff53a + 0xb0edbdd0 + K[0] == 0x198c7e2a2
-__constant ulong L = 0x198c7e2a2;
-
-#define BFI_INTX
-#define BITALIGNX
-
-#ifdef BITALIGN
-	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
-	#define rot(x, y) amd_bitalign(x, x, (u)(32 - y))
-#else
-	#define rot(x, y) rotate(x, (u)y)
-#endif
-
-#ifdef BFI_INT
-	#define Ch(x, y, z) amd_bytealign(x, y, z)
-#else 
-	#define Ch(x, y, z) bitselect(z, y, x)
-#endif
-
-// Ma now uses the Ch function, if BFI_INT is enabled, the optimized Ch version is used
-#define Ma(x, y, z) Ch((z ^ x), y, x)
-
-// Various intermediate calculations for each SHA round
-#define s0(n) (rot(Vals[(128 - n) % 8], 30) ^ rot(Vals[(128 - n) % 8], 19) ^ rot(Vals[(128 - n) % 8], 10))
-#define s1(n) (rot(Vals[(132 - n) % 8], 26) ^ rot(Vals[(132 - n) % 8], 21) ^ rot(Vals[(132 - n) % 8], 7))
-#define ch(n) (Ch(Vals[(132 - n) % 8], Vals[(133 - n) % 8], Vals[(134 - n) % 8]))
-#define ma(n) (Ma(Vals[(129 - n) % 8], Vals[(130 - n) % 8], Vals[(128 - n) % 8]))
-#define t1(n) (K[n % 64] + Vals[(135 - n) % 8] + W[n] + s1(n) + ch(n))
-
-// intermediate W calculations
-#define P1(x) (rot(W[x - 2], 15) ^ rot(W[x - 2], 13) ^ (W[x - 2] >> 10U))
-#define P2(x) (rot(W[x - 15], 25) ^ rot(W[x - 15], 14) ^ (W[x - 15] >> 3U))
-#define P3(x) W[x - 7]
-#define P4(x) W[x - 16]
-
-// full W calculation
-#define W(x) (W[x] = P4(x) + P3(x) + P2(x) + P1(x))
-
-// SHA round without W calc
-#define sharound(n) { Vals[(131 - n) % 8] += t1(n); Vals[(135 - n) % 8] = t1(n) + s0(n) + ma(n); }
-
-__kernel void search(	const uint state0, const uint state1, const uint state2, const uint state3,
-						const uint state4, const uint state5, const uint state6, const uint state7,
-						const uint B1, const uint C1, const uint D1,
-						const uint F1, const uint G1, const uint H1,
-						const uint base,
-						const uint W2,
-						const uint W16, const uint W17,
-						const uint PreVal4, const uint T1,
-						__global uint * output)
-{
-	u W[124];
-	u Vals[8];
-
-	Vals[1] = B1;
-	Vals[2] = C1;
-	Vals[5] = F1;
-	Vals[6] = G1;
-	
-	W[2] = W2;
-#ifdef VECTORS4
-        Vals[4] = (W[3] = base + (get_global_id(0) << 2) + (uint4)(0, 1, 2, 3)) + PreVal4;
-#elif defined VECTORS2
-        Vals[4] = (W[3] = base + (get_global_id(0) << 1) + (uint2)(0, 1)) + PreVal4;
-#else
-        Vals[4] = (W[3] = base + get_global_id(0)) + PreVal4;
-#endif
-	// used in: P2(19) == 285220864 (0x11002000), P4(20)
-	W[4] = 0x80000000U;
-	// P1(x) is 0 for x == 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
-	// P2(x) is 0 for x == 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
-	// P3(x) is 0 for x == 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
-	// P4(x) is 0 for x == 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
-	// W[x] in sharound(x) is 0 for x == 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
-	W[14] = W[13] = W[12] = W[11] = W[10] = W[9] = W[8] = W[7] = W[6] = W[5] = 0x00000000U;
-	// used in: P2(30) == 10485845 (0xA00055), P3(22), P4(31)
-	// K[15] + W[15] == 0xc19bf174 + 0x00000280U = 0xc19bf3f4
-	W[15] = 0x00000280U;
-
-	W[16] = W16;
-	W[17] = W17;
-	// removed P3(18) from add because it is == 0
-	W[18] = P1(18) + P4(18) + P2(18);
-	// removed P3(19) from add because it is == 0
-	W[19] = (u)0x11002000 + P1(19) + P4(19);
-	// removed P2(20), P3(20) from add because it is == 0
-	W[20] = P1(20) + P4(20);
-	W[21] = P1(21);
-	W[22] = P1(22) + P3(22);
-	W[23] = P1(23) + P3(23);
-	W[24] = P1(24) + P3(24);
-	W[25] = P1(25) + P3(25);
-	W[26] = P1(26) + P3(26);
-	W[27] = P1(27) + P3(27);
-	W[28] = P1(28) + P3(28);
-	W[29] = P1(29) + P3(29);
-	W[30] = (u)0xA00055 + P1(30) + P3(30);
-	
-	// Round 3
-	Vals[0] = state0 + Vals[4];
-	Vals[4] += T1;
-	
-	// Round 4
-	// K[4] + W[4] == 0x3956c25b + 0x80000000U = 0xb956c25b
-	Vals[7] = (Vals[3] = (u)0xb956c25b + D1 + s1(4) + ch(4)) + H1;
-	Vals[3] += s0(4) + ma(4);
-
-	// Round 5
-	Vals[2] = K[5] + C1 + s1(5) + ch(5) + s0(5) + ma(5);
-	Vals[6] = K[5] + C1 + G1 + s1(5) + ch(5);
-
-	sharound(6);
-	sharound(7);
-	sharound(8);
-	sharound(9);
-	sharound(10);
-	sharound(11);
-	sharound(12);
-	sharound(13);
-	sharound(14);
-	sharound(15);
-	sharound(16);
-	sharound(17);
-	sharound(18);
-	sharound(19);
-	sharound(20);
-	sharound(21);
-	sharound(22);
-	sharound(23);
-	sharound(24);
-	sharound(25);
-	sharound(26);
-	sharound(27);
-	sharound(28);
-	sharound(29);
-	sharound(30);
-
-	W(31);
-	sharound(31);
-	W(32);
-	sharound(32);
-	W(33);
-	sharound(33);
-	W(34);
-	sharound(34);
-	W(35);
-	sharound(35);
-	W(36);
-	sharound(36);
-	W(37);
-	sharound(37);
-	W(38);
-	sharound(38);
-	W(39);
-	sharound(39);
-	W(40);
-	sharound(40);
-	W(41);
-	sharound(41);
-	W(42);
-	sharound(42);
-	W(43);
-	sharound(43);
-	W(44);
-	sharound(44);
-	W(45);
-	sharound(45);
-	W(46);
-	sharound(46);
-	W(47);
-	sharound(47);
-	W(48);
-	sharound(48);
-	W(49);
-	sharound(49);
-	W(50);
-	sharound(50);
-	W(51);
-	sharound(51);
-	W(52);
-	sharound(52);
-	W(53);
-	sharound(53);
-	W(54);
-	sharound(54);
-	W(55);
-	sharound(55);
-	W(56);
-	sharound(56);
-	W(57);
-	sharound(57);
-	W(58);
-	sharound(58);
-	W(59);
-	sharound(59);
-	W(60);
-	sharound(60);
-	W(61);
-	sharound(61);
-	W(62);
-	sharound(62);
-	W(63);
-	sharound(63);
-
-	W[64] = state0 + Vals[0];
-	W[65] = state1 + Vals[1];
-	W[66] = state2 + Vals[2];
-	W[67] = state3 + Vals[3];
-	W[68] = state4 + Vals[4];
-	W[69] = state5 + Vals[5];
-	W[70] = state6 + Vals[6];
-	W[71] = state7 + Vals[7];
-	// used in: P2(87) = 285220864 (0x11002000), P4(88)
-	// K[72] + W[72] ==
-	W[72] = 0x80000000U;
-	// P1(x) is 0 for x == 75, 76, 77, 78, 79, 80
-	// P2(x) is 0 for x == 88, 89, 90, 91, 92, 93
-	// P3(x) is 0 for x == 80, 81, 82, 83, 84, 85
-	// P4(x) is 0 for x == 89, 90, 91, 92, 93, 94
-	// W[x] in sharound(x) is 0 for x == 73, 74, 75, 76, 77, 78
-	W[78] = W[77] = W[76] = W[75] = W[74] = W[73] = 0x00000000U;
-	// used in: P1(81) = 10485760 (0xA00000), P2(94) = 4194338 (0x400022), P3(86), P4(95)
-	// K[79] + W[79] ==
-	W[79] = 0x00000100U;
-
-	Vals[0] = H[0];
-	Vals[1] = H[1];
-	Vals[2] = H[2];
-	Vals[3] = (u)L + W[64];
-	Vals[4] = H[3];
-	Vals[5] = H[4];
-	Vals[6] = H[5];
-	Vals[7] = H[6] + W[64];
-	
-	sharound(65);
-	sharound(66);
-	sharound(67);
-	sharound(68);
-	sharound(69);
-	sharound(70);
-	sharound(71);
-	sharound(72);
-	sharound(73);
-	sharound(74);
-	sharound(75);
-	sharound(76);
-	sharound(77);
-	sharound(78);
-	sharound(79);
-	
-	// removed P1(80), P3(80) from add because it is == 0
-	W[80] = P2(80) + P4(80);
-	W[81] = (u)0xA00000 + P4(81) + P2(81);
-	W[82] = P4(82) + P2(82) + P1(82);
-	W[83] = P4(83) + P2(83) + P1(83);
-	W[84] = P4(84) + P2(84) + P1(84);
-	W[85] = P4(85) + P2(85) + P1(85);
-	W(86);
-
-	sharound(80);
-	sharound(81);	
-	sharound(82);
-	sharound(83);
-	sharound(84);
-	sharound(85);
-	sharound(86);
-
-	W[87] = (u)0x11002000 + P4(87) + P3(87) + P1(87);
-	sharound(87);
-	W[88] = P4(88) + P3(88) + P1(88);
-	sharound(88);
-	W[89] = P3(89) + P1(89);
-	sharound(89);
-	W[90] = P3(90) + P1(90);
-	sharound(90);
-	W[91] = P3(91) + P1(91);
-	sharound(91);
-	W[92] = P3(92) + P1(92);
-	sharound(92);
-	// removed P2(93), P4(93) from add because it is == 0
-	W[93] = P3(93) + P1(93);
-	sharound(93);
-	// removed P4(94) from add because it is == 0
-	W[94] = (u)0x400022 + P3(94) + P1(94);
-	sharound(94);
-	
-	W(95);
-	sharound(95);
-	W(96);
-	sharound(96);
-	W(97);
-	sharound(97);
-	W(98);
-	sharound(98);
-	W(99);
-	sharound(99);
-	W(100);
-	sharound(100);
-	W(101);
-	sharound(101);
-	W(102);
-	sharound(102);
-	W(103);
-	sharound(103);
-	W(104);
-	sharound(104);
-	W(105);
-	sharound(105);
-	W(106);
-	sharound(106);
-	W(107);
-	sharound(107);
-	W(108);
-	sharound(108);
-	W(109);
-	sharound(109);
-	W(110);
-	sharound(110);
-	W(111);
-	sharound(111);
-	W(112);
-	sharound(112);
-	W(113);
-	sharound(113);
-	W(114);
-	sharound(114);
-	W(115);
-	sharound(115);
-	W(116);
-	sharound(116);
-	W(117);
-	sharound(117);
-	W(118);
-	sharound(118);
-	W(119);
-	sharound(119);
-	W(120);
-	sharound(120);
-	W(121);
-	sharound(121);
-	W(122);
-	sharound(122);
-	W(123);
-	sharound(123);
-
-	// Round 124
-	Vals[7] += Vals[3] + P4(124) + P3(124) + P2(124) + P1(124) + s1(124) + ch(124);
-	
-#define MAXBUFFERS (4095)
-#define NFLAG (0xFFFUL)
-
-#if defined(VECTORS4) || defined(VECTORS2)
-	if (Vals[7].x == -H[7])
-	{
-		output[MAXBUFFERS] = output[NFLAG & W[3].x] =  W[3].x;
-	}
-	if (Vals[7].y == -H[7])
-	{
-		output[MAXBUFFERS] = output[NFLAG & W[3].y] =  W[3].y;
-	}
-#ifdef VECTORS4
-	if (Vals[7].z == -H[7])
-	{
-		output[MAXBUFFERS] = output[NFLAG & W[3].z] =  W[3].z;
-	}
-	if (Vals[7].w == -H[7])
-	{
-		output[MAXBUFFERS] = output[NFLAG & W[3].w] =  W[3].w;
-	}
-#endif
-#else
-	if (Vals[7] == -H[7])
-	{
-		output[MAXBUFFERS] = output[NFLAG & W[3]] =  W[3];
-	}
-#endif
-}
+// This file is taken and modified from the public-domain poclbm project, and
+// we have therefore decided to keep it public-domain in Phoenix.
+
+// 2011-07-12: further modified by Diapolo and still public-domain
+// -ck version to be compatible with cgminer
+// 2011-07-14: shorter code
+
+#define VECTORSX
+
+#ifdef VECTORS4
+	typedef uint4 u;
+#elif defined VECTORS2
+	typedef uint2 u;
+#else
+	typedef uint u;
+#endif
+
+__constant uint K[64] = { 
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+// H[6] =  0x08909ae5U + 0xb0edbdd0 + K[0] == 0xfc08884d
+// H[7] = -0x5be0cd19 - (0x90befffa) K[60] == -0xec9fcd13
+__constant uint H[8] = { 
+	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0xfc08884d, 0xec9fcd13
+};
+
+// L = 0xa54ff53a + 0xb0edbdd0 + K[0] == 0x198c7e2a2
+__constant ulong L = 0x198c7e2a2;
+
+#define BFI_INTX
+#define BITALIGNX
+
+#define O 15
+
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#define rot(x, y) amd_bitalign(x, x, (u)(32 - y))
+#else
+	#define rot(x, y) rotate(x, (u)y)
+#endif
+
+#ifdef BFI_INT
+	#define Ch(x, y, z) amd_bytealign(x, y, z)
+#else 
+	#define Ch(x, y, z) bitselect(z, y, x)
+#endif
+
+// Ma now uses the Ch function, if BFI_INT is enabled, the optimized Ch version is used
+#define Ma(x, y, z) Ch((z ^ x), y, x)
+
+// Various intermediate calculations for each SHA round
+#define s0(n) (rot(Vals[(128 - n) % 8], 30) ^ rot(Vals[(128 - n) % 8], 19) ^ rot(Vals[(128 - n) % 8], 10))
+#define s1(n) (rot(Vals[(132 - n) % 8], 26) ^ rot(Vals[(132 - n) % 8], 21) ^ rot(Vals[(132 - n) % 8], 7))
+#define ch(n) (Ch(Vals[(132 - n) % 8], Vals[(133 - n) % 8], Vals[(134 - n) % 8]))
+#define ma(n) (Ma(Vals[(129 - n) % 8], Vals[(130 - n) % 8], Vals[(128 - n) % 8]))
+#define t1(n) (K[n % 64] + Vals[(135 - n) % 8] + W[n - O] + s1(n) + ch(n))
+#define t1_no_W(n) (K[n % 64] + Vals[(135 - n) % 8] + s1(n) + ch(n))
+
+// intermediate W calculations
+#define P1(x) (rot(W[x - 2 - O], 15) ^ rot(W[x - 2 - O], 13) ^ (W[x - 2 - O] >> 10U))
+#define P2(x) (rot(W[x - 15 - O], 25) ^ rot(W[x - 15 - O], 14) ^ (W[x - 15 - O] >> 3U))
+#define P3(x) W[x - 7 - O]
+#define P4(x) W[x - 16 - O]
+
+// full W calculation
+#define W(x) (W[x - O] = P4(x) + P3(x) + P2(x) + P1(x))
+
+// SHA round without W calc
+#define sharound(n) { Vals[(131 - n) % 8] += t1(n); Vals[(135 - n) % 8] = t1(n) + s0(n) + ma(n); }
+#define sharound_no_W(n) { Vals[(131 - n) % 8] += t1_no_W(n); Vals[(135 - n) % 8] = t1_no_W(n) + s0(n) + ma(n); }
+
+__kernel void search(	const uint state0, const uint state1, const uint state2, const uint state3,
+						const uint state4, const uint state5, const uint state6, const uint state7,
+						const uint B1, const uint C1, const uint C1addK5, const uint D1,
+						const uint F1, const uint G1, const uint H1,
+						const uint base,
+						const uint W2,
+						const uint W16, const uint W17, const uint W17_2,
+						const uint PreVal4addT1, const uint T1substate0,
+						__global uint * output)
+{
+	u W[124 - O];
+	u Vals[8];
+#ifdef VECTORS4
+        u W_3 = base + (get_global_id(0) << 2) + (uint4)(0, 1, 2, 3);
+#elif defined VECTORS2
+        u W_3 = base + (get_global_id(0) << 1) + (uint2)(0, 1);
+#else
+        u W_3 = base + get_global_id(0);
+#endif
+	u Temp;
+	
+	Vals[0] = W_3 + PreVal4addT1 + T1substate0;
+	Vals[1] = B1;
+	Vals[2] = C1;
+
+	Vals[4] = W_3 + PreVal4addT1;
+	Vals[5] = F1;
+	Vals[6] = G1;
+	
+	// used in: P2(19) == 285220864 (0x11002000), P4(20)
+	// W[4] = 0x80000000U;
+	// P1(x) is 0 for x == 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+	// P2(x) is 0 for x == 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
+	// P3(x) is 0 for x == 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
+	// P4(x) is 0 for x == 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
+	// W[x] in sharound(x) is 0 for x == 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+	// W[14] = W[13] = W[12] = W[11] = W[10] = W[9] = W[8] = W[7] = W[6] = W[5] = 0x00000000U;
+	// used in: P2(30) == 10485845 (0xA00055), P3(22), P4(31)
+	// K[15] + W[15] == 0xc19bf174 + 0x00000280U = 0xc19bf3f4
+	W[15 - O] = 0x00000280U;
+	W[16 - O] = W16;
+	W[17 - O] = W17;
+	W[18 - O] = W2 + (rot(W_3, 25) ^ rot(W_3, 14) ^ (W_3 >> 3U));
+	W[19 - O] = W_3 + W17_2;
+	W[20 - O] = (u)0x80000000U + P1(20);
+	W[21 - O] = P1(21);
+	W[22 - O] = P1(22) + P3(22);
+	W[23 - O] = P1(23) + P3(23);
+	W[24 - O] = P1(24) + P3(24);
+	W[25 - O] = P1(25) + P3(25);
+	W[26 - O] = P1(26) + P3(26);
+	W[27 - O] = P1(27) + P3(27);
+	W[28 - O] = P1(28) + P3(28);
+	W[29 - O] = P1(29) + P3(29);
+	W[30 - O] = (u)0xA00055 + P1(30) + P3(30);
+	
+	// Round 4
+	Temp = D1 + ch(4) + s1(4);
+	Vals[7] = Temp + H1;
+	Vals[3] = Temp + ma(4) + s0(4);	
+
+	// Round 5
+	Temp = C1addK5 + ch(5) + s1(5);
+	Vals[6] = Temp + G1;
+	Vals[2] = Temp + ma(5) + s0(5);
+
+	// W[6] to W[14] are 0, so no need to add them!
+	sharound_no_W(6);
+	sharound_no_W(7);
+	sharound_no_W(8);
+	sharound_no_W(9);
+	sharound_no_W(10);
+	sharound_no_W(11);
+	sharound_no_W(12);
+	sharound_no_W(13);
+	sharound_no_W(14);
+
+//	#define sharound(n) { Vals[(131 - n) % 8] += t1(n); Vals[(135 - n) % 8] = t1(n) + s0(n) + ma(n); }
+//	#define t1(n) (K[n % 64] + Vals[(135 - n) % 8] + W[n] + s1(n) + ch(n))
+
+//	Vals[(131 - 15) % 8] += (Vals[(135 - 15) % 8] = (u)0xc19bf3f4 + Vals[(135 - 15) % 8] + s1(15) + ch(15));
+//	Vals[(135 - 15) % 8] += s0(15) + ma(15);
+
+	sharound(15);
+	sharound(16);
+	sharound(17);
+	sharound(18);
+	sharound(19);
+	sharound(20);
+	sharound(21);
+	sharound(22);
+	sharound(23);
+	sharound(24);
+	sharound(25);
+	sharound(26);
+	sharound(27);
+	sharound(28);
+	sharound(29);
+	sharound(30);
+
+	W(31);
+	sharound(31);
+	W(32);
+	sharound(32);
+	W(33);
+	sharound(33);
+	W(34);
+	sharound(34);
+	W(35);
+	sharound(35);
+	W(36);
+	sharound(36);
+	W(37);
+	sharound(37);
+	W(38);
+	sharound(38);
+	W(39);
+	sharound(39);
+	W(40);
+	sharound(40);
+	W(41);
+	sharound(41);
+	W(42);
+	sharound(42);
+	W(43);
+	sharound(43);
+	W(44);
+	sharound(44);
+	W(45);
+	sharound(45);
+	W(46);
+	sharound(46);
+	W(47);
+	sharound(47);
+	W(48);
+	sharound(48);
+	W(49);
+	sharound(49);
+	W(50);
+	sharound(50);
+	W(51);
+	sharound(51);
+	W(52);
+	sharound(52);
+	W(53);
+	sharound(53);
+	W(54);
+	sharound(54);
+	W(55);
+	sharound(55);
+	W(56);
+	sharound(56);
+	W(57);
+	sharound(57);
+	W(58);
+	sharound(58);
+	W(59);
+	sharound(59);
+	W(60);
+	sharound(60);
+	W(61);
+	sharound(61);
+	W(62);
+	sharound(62);
+	W(63);
+	sharound(63);
+
+	W[64 - O] = state0 + Vals[0];
+	W[65 - O] = state1 + Vals[1];
+	W[66 - O] = state2 + Vals[2];
+	W[67 - O] = state3 + Vals[3];
+	W[68 - O] = state4 + Vals[4];
+	W[69 - O] = state5 + Vals[5];
+	W[70 - O] = state6 + Vals[6];
+	W[71 - O] = state7 + Vals[7];
+	// used in: P2(87) = 285220864 (0x11002000), P4(88)
+	// K[72] + W[72] ==
+	W[72 - O] = 0x80000000U;
+	// P1(x) is 0 for x == 75, 76, 77, 78, 79, 80
+	// P2(x) is 0 for x == 88, 89, 90, 91, 92, 93
+	// P3(x) is 0 for x == 80, 81, 82, 83, 84, 85
+	// P4(x) is 0 for x == 89, 90, 91, 92, 93, 94
+	// W[x] in sharound(x) is 0 for x == 73, 74, 75, 76, 77, 78
+	// W[78] = W[77] = W[76] = W[75] = W[74] = W[73] = 0x00000000U;
+	// used in: P1(81) = 10485760 (0xA00000), P2(94) = 4194338 (0x400022), P3(86), P4(95)
+	// K[79] + W[79] ==
+	W[79 - O] = 0x00000100U;
+
+	Vals[0] = H[0];
+	Vals[1] = H[1];
+	Vals[2] = H[2];
+	Vals[3] = (u)L + W[64 - O];
+	Vals[4] = H[3];
+	Vals[5] = H[4];
+	Vals[6] = H[5];
+	Vals[7] = H[6] + W[64 - O];
+	
+	sharound(65);
+	sharound(66);
+	sharound(67);
+	sharound(68);
+	sharound(69);
+	sharound(70);
+	sharound(71);
+	sharound(72);
+
+	// W is also zero for these rounds
+	sharound_no_W(73);
+	sharound_no_W(74);
+	sharound_no_W(75);
+	sharound_no_W(76);
+	sharound_no_W(77);
+	sharound_no_W(78);
+
+	sharound(79);
+	
+	W[80 - O] = P2(80) + P4(80);
+	W[81 - O] = (u)0xA00000 + P4(81) + P2(81);
+	W[82 - O] = P4(82) + P2(82) + P1(82);
+	W[83 - O] = P4(83) + P2(83) + P1(83);
+	W[84 - O] = P4(84) + P2(84) + P1(84);
+	W[85 - O] = P4(85) + P2(85) + P1(85);
+	W(86);
+
+	sharound(80);
+	sharound(81);	
+	sharound(82);
+	sharound(83);
+	sharound(84);
+	sharound(85);
+	sharound(86);
+
+	W[87 - O] = (u)0x11002000 + P4(87) + P3(87) + P1(87);
+	sharound(87);
+	W[88 - O] = (u)0x80000000U + P3(88) + P1(88);
+	sharound(88);
+	W[89 - O] = P3(89) + P1(89);
+	sharound(89);
+	W[90 - O] = P3(90) + P1(90);
+	sharound(90);
+	W[91 - O] = P3(91) + P1(91);
+	sharound(91);
+	W[92 - O] = P3(92) + P1(92);
+	sharound(92);
+	W[93 - O] = P3(93) + P1(93);
+	sharound(93);
+	W[94 - O] = (u)0x400022 + P3(94) + P1(94);
+	sharound(94);
+	W[95 - O] = (u)0x00000100U + P3(95) + P2(95) + P1(95);
+	sharound(95);
+
+	W(96);
+	sharound(96);
+	W(97);
+	sharound(97);
+	W(98);
+	sharound(98);
+	W(99);
+	sharound(99);
+	W(100);
+	sharound(100);
+	W(101);
+	sharound(101);
+	W(102);
+	sharound(102);
+	W(103);
+	sharound(103);
+	W(104);
+	sharound(104);
+	W(105);
+	sharound(105);
+	W(106);
+	sharound(106);
+	W(107);
+	sharound(107);
+	W(108);
+	sharound(108);
+	W(109);
+	sharound(109);
+	W(110);
+	sharound(110);
+	W(111);
+	sharound(111);
+	W(112);
+	sharound(112);
+	W(113);
+	sharound(113);
+	W(114);
+	sharound(114);
+	W(115);
+	sharound(115);
+	W(116);
+	sharound(116);
+	W(117);
+	sharound(117);
+	W(118);
+	sharound(118);
+	W(119);
+	sharound(119);
+	W(120);
+	sharound(120);
+	W(121);
+	sharound(121);
+	W(122);
+	sharound(122);
+	W(123);
+	sharound(123);
+
+	// Round 124
+	Vals[7] += Vals[3] + P4(124) + P3(124) + P2(124) + P1(124) + s1(124) + ch(124);
+	
+#define MAXBUFFERS (4095)
+#define NFLAG (0xFFFUL)
+
+#if defined(VECTORS4) || defined(VECTORS2)
+	if (Vals[7].x == -H[7])
+	{
+		output[MAXBUFFERS] = output[NFLAG & (W[3].x >> 2)] =  W_3.x;
+	}
+	if (Vals[7].y == -H[7])
+	{
+		output[MAXBUFFERS] = output[NFLAG & (W[3].y >> 2)] =  W_3.y;
+	}
+#ifdef VECTORS4
+	if (Vals[7].z == -H[7])
+	{
+		output[MAXBUFFERS] = output[NFLAG & (W[3].z >> 2)] =  W_3.z;
+	}
+	if (Vals[7].w == -H[7])
+	{
+		output[MAXBUFFERS] = output[NFLAG & (W[3].w >> 2)] =  W_3.w;
+	}
+#endif
+#else
+	if (Vals[7] == -H[7])
+	{
+		output[MAXBUFFERS] = output[NFLAG & (W[3] >> 2)] =  W_3;
+	}
+#endif
+}