
Dos2unix.

Con Kolivas, 14 years ago
parent · commit 93ff09e577
1 changed file with 428 additions and 428 deletions

phatk110816.cl  +428 −428
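The change itself is purely mechanical: dos2unix rewrites CRLF line endings as LF, so every one of the file's 428 lines appears once as removed and once as added while the visible text stays identical. As a rough illustration (not the actual tool, which only strips carriage returns that precede a newline), the conversion amounts to a filter like this:

#include <stdio.h>

/* Naive CRLF -> LF filter: copy stdin to stdout, dropping carriage returns. */
int main(void)
{
	int c;
	while ((c = getchar()) != EOF)
		if (c != '\r')
			putchar(c);
	return 0;
}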

@@ -1,428 +1,428 @@
[428 removed lines: identical to the 428 added lines below except for the CRLF line endings]
+// This file is taken and modified from the public-domain poclbm project, and
+// I have therefore decided to keep it public-domain.
+
+
+#define VECTORSX
+#define BFI_INTX
+#define BITALIGNX
+
+#ifdef VECTORS4
+	typedef uint4 u;
+#else 
+	#ifdef VECTORS2
+		typedef uint2 u;
+	#else
+		typedef uint u;
+	#endif
+#endif
+
+__constant uint K[64] = { 
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+__constant uint ConstW[128] = {
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
+__constant uint H[8] = { 
+	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+};
+
+
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
+#else
+	#define rot(x, y) rotate(x, (uint)y)
+#endif
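On devices with cl_amd_media_ops, amd_bitalign(a, b, c) extracts 32 bits starting c bits into the 64-bit value (a:b), so with both inputs equal it is a right-rotation, and rotating right by (32 - y) is the same as rotating left by y. A scalar C model of that identity (semantics as documented for the AMD extension; a sketch, not the device instruction):

#include <stdint.h>

/* Model of amd_bitalign(a, b, c): take 32 bits of the 64-bit value (a:b),
 * starting c bits up from the bottom (c is taken mod 32). */
static uint32_t bitalign(uint32_t a, uint32_t b, uint32_t c)
{
	return (uint32_t)((((uint64_t)a << 32) | b) >> (c & 31));
}

/* rot(x, y) above is bitalign(x, x, 32 - y), i.e. rotate-left by y (0 < y < 32). */
static uint32_t rotl(uint32_t x, uint32_t y)
{
	return (x << y) | (x >> (32 - y));
}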
+
+// This part is not from the stock poclbm kernel. It's part of an optimization
+// added in the Phoenix Miner.
+
+// Some AMD devices have a BFI_INT opcode, which behaves exactly like the
+// SHA-256 Ch function, but provides it in exactly one instruction. If
+// detected, use it for Ch. Otherwise, construct Ch out of simpler logical
+// primitives.
+
+#ifdef BFI_INT
+	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
+	// OpenCL (or CAL IL for that matter) in any way. However, there is 
+	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
+	// amd_bytealign, takes the same inputs, and provides the same output. 
+	// We can use that as a placeholder for BFI_INT and have the application 
+	// patch it after compilation.
+	
+	// This is the BFI_INT function
+	#define Ch(x, y, z) amd_bytealign(x,y,z)
+	// Ma can also be implemented in terms of BFI_INT...
+	#define Ma(z, x, y) amd_bytealign(z^x,y,x)
+#else
+	#define Ch(x, y, z) bitselect(x,y,z)
+	// Ma can also be implemented in terms of bitselect
+	#define Ma(z, x, y) bitselect(z^x,y,x)
+#endif
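bitselect(a, b, c) forms each result bit from b where the corresponding bit of c is 1 and from a where it is 0. The SHA-256 choose function is exactly one such bitwise select, which is why a single bitfield-insert style instruction (BFI_INT) can implement it. A scalar C model (standard Ch shown for orientation; the macros above apply the same primitive with their own argument ordering):

#include <stdint.h>

/* Model of OpenCL bitselect(): bit i of the result is b_i if c_i is set, else a_i. */
static uint32_t bitselect32(uint32_t a, uint32_t b, uint32_t c)
{
	return (a & ~c) | (b & c);
}

/* Standard SHA-256 Ch: each bit of e chooses f over g. */
static uint32_t ch(uint32_t e, uint32_t f, uint32_t g)
{
	return bitselect32(g, f, e);	/* == (e & f) ^ (~e & g) */
}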
+
+
+//Various intermediate calculations for each SHA round
+#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
+#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))
+
+#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8]))
+#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))
+
+#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])
+#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])
+
+//t1 calc when W is already calculated
+#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W[(n)] + s1(n) + ch(n) 
+
+//t1 calc which calculates W
+#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W(n) + s1(n) + ch(n)
+
+//Used for constant W Values (the compiler optimizes out zeros)
+#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n)
+
+//t2 Calc
+#define t2(n)  maj(n) + s0(n)
+
+#define rotC(x,n) (x<<n | x >> (32-n))
+
+//W calculation used for SHA round
+#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))
+
+
+
+//Partial W calculations (used at the beginning, where only some values are nonzero)
+#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
+#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
+
+
+#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
+#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
+
+
+#define P3(n)  W[n-7]
+#define P4(n)  W[n-16]
+
+
+//Partial Calcs for constant W values
+#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
+#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
+#define P3C(x)  ConstW[x-7]
+#define P4C(x)  ConstW[x-16]
+
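P1 through P4 are the four terms of the standard SHA-256 message schedule, split apart so partially computed W values can be interleaved with rounds: W[n] = sigma1(W[n-2]) + W[n-7] + sigma0(W[n-15]) + W[n-16]. For reference, the uninterleaved schedule in plain C:

#include <stdint.h>

static uint32_t rotr(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* Plain SHA-256 message schedule; P1/P2/P3/P4 above are the four summands. */
static void expand_w(uint32_t W[64])
{
	for (int n = 16; n < 64; n++) {
		uint32_t s0 = rotr(W[n - 15], 7) ^ rotr(W[n - 15], 18) ^ (W[n - 15] >> 3);
		uint32_t s1 = rotr(W[n - 2], 17) ^ rotr(W[n - 2], 19) ^ (W[n - 2] >> 10);
		W[n] = W[n - 16] + s0 + W[n - 7] + s1;
	}
}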
+//SHA round with built in W calc
+#define sharoundW(n) Barrier1(n);  Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n);  
+
+//SHA round without W calc
+#define sharound(n)  Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n);
+
+//SHA round for constant W values
+#define sharoundC(n)  Barrier3(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n);
+
+//The compiler is stupid... I put this here only to stop the compiler from (de)optimizing the order
+#define Barrier1(n) t1 = t1C((n+1))
+#define Barrier2(n) t1 = t1C((n))
+#define Barrier3(n) t1 = t1C((n))
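The (i + 128 - n) % 8 indexing renames registers instead of moving them: in round n, logical register i of the SHA-256 state a..h lives at Vals[(i + 128 - n) % 8], so a round writes only two array slots and the usual eight-way shuffle disappears into index arithmetic. A scalar C sketch of one round in the same scheme (standard formulas; helper names are illustrative):

#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }

/* Logical register r of a..h during round n. */
#define REG(r, n) vals[((r) + 128 - (n)) % 8]

/* One SHA-256 round under the renaming: only "d += t1" and "h = t1 + t2"
 * touch memory; the a..h rotation is absorbed by the index math. */
static void round_n(uint32_t vals[8], const uint32_t K[64], const uint32_t *W, int n)
{
	uint32_t e = REG(4, n);
	uint32_t S1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
	uint32_t ch = (e & REG(5, n)) ^ (~e & REG(6, n));
	uint32_t t1 = REG(7, n) + S1 + ch + K[n % 64] + W[n];
	uint32_t a = REG(0, n);
	uint32_t S0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
	uint32_t maj = (a & REG(1, n)) ^ (a & REG(2, n)) ^ (REG(1, n) & REG(2, n));
	REG(3, n) += t1;
	REG(7, n) = t1 + S0 + maj;
}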
+
+//#define WORKSIZE 256
+#define MAXBUFFERS (4095)
+
+__kernel 
+ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+void search(	const uint state0, const uint state1, const uint state2, const uint state3,
+						const uint state4, const uint state5, const uint state6, const uint state7,
+						const uint B1, const uint C1, const uint D1,
+						const uint F1, const uint G1, const uint H1,
+						const u base,
+						const uint W16, const uint W17,
+						const uint PreVal4, const uint PreVal0,
+						const uint PreW18, const uint PreW19,
+						const uint PreW31, const uint PreW32,
+						
+						__global uint * output)
+{
+
+
+	u W[124];
+	u Vals[8];
+
+//Dummy variable to prevent the compiler from reordering between rounds
+	u t1;
+	
+	//Vals[0]=state0;
+	Vals[1]=B1;
+	Vals[2]=C1;
+	Vals[3]=D1;
+	//Vals[4]=PreVal4;
+	Vals[5]=F1;
+	Vals[6]=G1;
+	Vals[7]=H1;
+
+	W[16] = W16;
+	W[17] = W17;
+
+#ifdef VECTORS4
+	//Fewer dependencies when we get the local id and group id separately and then add them
+	W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
+	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
+	//Since the nonces differ only in their 2 LSBs, we can save an instruction by flipping the 4 affected bits in W18 rather than the bits in W3
+	W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
+#else
+	#ifdef VECTORS2
+		W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
+		uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
+		W[18] = PreW18 + (u){r, r ^ 0x2004000U};
+	#else
+		W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
+		u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
+		W[18] = PreW18 + r;
+	#endif
+#endif
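The per-lane XOR masks work because this sigma0-style function is linear over XOR (it is built only from rotations and a shift), so sigma0(x ^ d) == sigma0(x) ^ sigma0(d), and when base is a multiple of 4 the lane nonces x+1, x+2, x+3 equal x^1, x^2, x^3. A quick C self-check of the three masks:

#include <assert.h>
#include <stdint.h>

static uint32_t rotr32(uint32_t x, unsigned n) { return (x >> n) | (x << (32 - n)); }
static uint32_t sigma0(uint32_t x) { return rotr32(x, 7) ^ rotr32(x, 18) ^ (x >> 3); }

int main(void)
{
	uint32_t x = 0x12345678u;	/* low two bits clear, as for a 4-aligned base */
	assert(sigma0(x ^ 1u) == (sigma0(x) ^ 0x02004000u));
	assert(sigma0(x ^ 2u) == (sigma0(x) ^ 0x04008000u));
	assert(sigma0(x ^ 3u) == (sigma0(x) ^ 0x0600C000u));
	return 0;
}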
+	//The W calcs and rounds are interleaved in this order because the compiler needs help scheduling the instructions
+
+
+
+	Vals[4] = PreVal4 + W[3];
+	Vals[0] = PreVal0 + W[3];
+
+	sharoundC(4);
+	W[19] = PreW19 + W[3];
+	sharoundC(5);
+	W[20] = P4C(20) + P1(20);
+	sharoundC(6);
+	W[21] = P1(21);
+	sharoundC(7);
+	W[22] = P3C(22) + P1(22);
+	sharoundC(8);
+	W[23] = W[16] + P1(23);
+	sharoundC(9);
+	W[24] = W[17] + P1(24);
+	sharoundC(10);
+	W[25] = P1(25) + P3(25);
+	W[26] = P1(26) + P3(26);
+	sharoundC(11);
+	W[27] = P1(27) + P3(27);
+	W[28] = P1(28) + P3(28);
+	sharoundC(12);
+	W[29] = P1(29) + P3(29);
+	sharoundC(13);
+	W[30] = P1(30) + P2C(30) + P3(30);
+	W[31] = PreW31 + (P1(31) + P3(31));
+	sharoundC(14);
+	W[32] = PreW32 + (P1(32) + P3(32));
+	sharoundC(15);
+	sharound(16);
+	sharound(17);
+	sharound(18);
+	sharound(19);
+	sharound(20);
+	sharound(21);
+	sharound(22);
+	sharound(23);
+	sharound(24);
+	sharound(25);
+	sharound(26);
+	sharound(27);
+	sharound(28);
+	sharound(29);
+	sharound(30);
+	sharound(31);
+	sharound(32);
+	sharoundW(33);
+	sharoundW(34);
+	sharoundW(35);
+	sharoundW(36);
+	sharoundW(37);	
+	sharoundW(38);
+	sharoundW(39);
+	sharoundW(40);
+	sharoundW(41);
+	sharoundW(42);
+	sharoundW(43);
+	sharoundW(44);
+	sharoundW(45);
+	sharoundW(46);
+	sharoundW(47);
+	sharoundW(48);
+	sharoundW(49);
+	sharoundW(50);
+	sharoundW(51);
+	sharoundW(52);
+	sharoundW(53);
+	sharoundW(54);
+	sharoundW(55);
+	sharoundW(56);
+	sharoundW(57);
+	sharoundW(58);
+	sharoundW(59);
+	sharoundW(60);
+	sharoundW(61);
+	sharoundW(62);
+	sharoundW(63);
+
+	W[64]=state0+Vals[0];
+	W[65]=state1+Vals[1];
+	W[66]=state2+Vals[2];
+	W[67]=state3+Vals[3];
+	W[68]=state4+Vals[4];
+	W[69]=state5+Vals[5];
+	W[70]=state6+Vals[6];
+	W[71]=state7+Vals[7];
+
+	Vals[0]=H[0];
+	Vals[1]=H[1];
+	Vals[2]=H[2];
+	Vals[3]=H[3];
+	Vals[4]=H[4];
+	Vals[5]=H[5];
+	Vals[6]=H[6];
+	Vals[7]=H[7];
+
+	//sharound(64 + 0);
+	const u Temp = (0xb0edbdd0U + K[0]) +  W[64];
+	Vals[7] = Temp + 0x08909ae5U;
+	Vals[3] = 0xa54ff53aU + Temp;
+	
+#define P124(n) P2(n) + P1(n) + P4(n)
+
+
+	W[64 + 16] = P2(64 + 16) + P4(64 + 16);
+	sharound(64 + 1);
+	W[64 + 17] = P1C(64 + 17) + P2(64 + 17) + P4(64 + 17);
+	sharound(64 + 2);
+	W[64 + 18] = P124(64 + 18);
+	sharound(64 + 3);
+	W[64 + 19] = P124(64 + 19);
+	sharound(64 + 4);
+	W[64 + 20] = P124(64 + 20);
+	sharound(64 + 5);
+	W[64 + 21] = P124(64 + 21);
+	sharound(64 + 6);
+	W[64 + 22] = P4(64 + 22) + P3C(64 + 22) + P2(64 + 22) + P1(64 + 22);
+	sharound(64 + 7);
+	W[64 + 23] = P4(64 + 23) + P3(64 + 23) + P2C(64 + 23) + P1(64 + 23);
+	sharoundC(64 + 8);
+	W[64 + 24] =   P1(64 + 24) + P4C(64 + 24) + P3(64 + 24);
+	sharoundC(64 + 9);
+	W[64 + 25] = P3(64 + 25) + P1(64 + 25);
+	sharoundC(64 + 10);
+	W[64 + 26] = P3(64 + 26) + P1(64 + 26);
+	sharoundC(64 + 11);
+	W[64 + 27] = P3(64 + 27) + P1(64 + 27);
+	sharoundC(64 + 12);
+	W[64 + 28] = P3(64 + 28) + P1(64 + 28);
+	sharoundC(64 + 13);
+	W[64 + 29] = P1(64 + 29) + P3(64 + 29);
+	W[64 + 30] = P3(64 + 30) + P2C(64 + 30) + P1(64 + 30);
+	sharoundC(64 + 14);
+	W[64 + 31] = P4C(64 + 31) + P3(64 + 31) + P2(64 + 31) + P1(64 + 31);
+	sharoundC(64 + 15);
+	sharound(64 + 16);
+	sharound(64 + 17);
+	sharound(64 + 18);
+	sharound(64 + 19);
+	sharound(64 + 20);
+	sharound(64 + 21);
+	sharound(64 + 22);
+	sharound(64 + 23);
+	sharound(64 + 24);
+	sharound(64 + 25);
+	sharound(64 + 26);
+	sharound(64 + 27);
+	sharound(64 + 28);
+	sharound(64 + 29);
+	sharound(64 + 30);
+	sharound(64 + 31);
+	sharoundW(64 + 32);
+	sharoundW(64 + 33);
+	sharoundW(64 + 34);
+	sharoundW(64 + 35);
+	sharoundW(64 + 36);
+	sharoundW(64 + 37);
+	sharoundW(64 + 38);
+	sharoundW(64 + 39);
+	sharoundW(64 + 40);
+	sharoundW(64 + 41);
+	sharoundW(64 + 42);
+	sharoundW(64 + 43);
+	sharoundW(64 + 44);
+	sharoundW(64 + 45);
+	sharoundW(64 + 46);
+	sharoundW(64 + 47);
+	sharoundW(64 + 48);
+	sharoundW(64 + 49);
+	sharoundW(64 + 50);
+	sharoundW(64 + 51);
+	sharoundW(64 + 52);
+	sharoundW(64 + 53);
+	sharoundW(64 + 54);
+	sharoundW(64 + 55);
+	sharoundW(64 + 56);
+	sharoundW(64 + 57);
+	sharoundW(64 + 58);
+
+	u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]); 
+	u g = -(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64));
+	
+#define NFLAG (0xFF)
+
+#ifdef VECTORS4
+	if (v.x == g.x)
+	{
+		output[MAXBUFFERS] = output[NFLAG & W[3].x] = W[3].x;
+	}
+	if (v.y == g.y)
+	{
+		output[MAXBUFFERS] = output[NFLAG & W[3].y] = W[3].y;
+	}
+	if (v.z == g.z)
+	{
+		output[MAXBUFFERS] = output[NFLAG & W[3].z] = W[3].z;
+	}
+	if (v.w == g.w)
+	{
+		output[MAXBUFFERS] = output[NFLAG & W[3].w] = W[3].w;
+	}
+#else
+	#ifdef VECTORS2
+		if (v.x == g.x)
+		{
+			output[MAXBUFFERS] = output[NFLAG & W[3].x] = W[3].x;
+		}
+		if (v.y == g.y)
+		{
+			output[MAXBUFFERS] = output[NFLAG & W[3].y] = W[3].y;
+		}
+	#else
+		if (v == g)
+		{
+			output[MAXBUFFERS] = output[NFLAG & W[3]] = W[3];
+		}
+	#endif
+#endif
+}
+
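A note on the stores above: a matching lane writes its nonce both into a slot chosen by the nonce's low byte (NFLAG) and into the sentinel slot output[MAXBUFFERS], so the host only needs to test one word to know whether anything was found. A hedged host-side sketch of that readback (buffer layout inferred from this kernel; function and variable names are hypothetical):

#include <stdint.h>

#define MAXBUFFERS 4095
#define NFLAG 0xFF

/* Hypothetical readback after copying the kernel's output buffer to the host:
 * a nonzero sentinel means at least one lane matched; candidate nonces sit in
 * the slots indexed by (nonce & NFLAG). */
static int collect_nonces(const uint32_t out[MAXBUFFERS + 1],
                          uint32_t nonces[NFLAG + 1], int *count)
{
	*count = 0;
	if (out[MAXBUFFERS] == 0)
		return 0;	/* nothing found in this pass */
	for (int i = 0; i <= NFLAG; i++)
		if (out[i] != 0)
			nonces[(*count)++] = out[i];
	return 1;
}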