Browse Source

Add back in new cl files.

Con Kolivas 13 years ago
parent
commit
bff58c3bed
5 changed files with 4388 additions and 0 deletions
  1. 1274 0
      diablo120724.cl
  2. 587 0
      diakgcn120724.cl
  3. 417 0
      phatk120724.cl
  4. 1353 0
      poclbm120724.cl
  5. 757 0
      scrypt120724.cl

+ 1274 - 0
diablo120724.cl

@@ -0,0 +1,1274 @@
+/*
+ *  DiabloMiner - OpenCL miner for BitCoin
+ *  Copyright (C) 2010, 2011, 2012 Patrick McFarland <diablod3@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * z is the word type used for all hash arithmetic in this file. The build
+ * options pick its width: uint4 when VECTORS4 is defined, uint2 when
+ * VECTORS2 is defined, and a scalar uint when neither is defined.
+ */
+#if defined(VECTORS4)
+	typedef uint4 z;
+#elif defined(VECTORS2)
+	typedef uint2 z;
+#else
+	typedef uint z;
+#endif
+
+/*
+ * Zrotr(a, b): rotate every 32-bit lane of a by b bits.
+ *
+ * Despite the name, both variants rotate LEFT by b: OpenCL rotate() is a
+ * left rotation, and amd_bitalign(a, a, 32 - b) shifts the doubled word
+ * right by (32 - b), which is the same thing. Callers therefore pass
+ * 32 minus the right-rotation amount they want.
+ *
+ * Every macro argument is parenthesized so that a compound expression
+ * (for example x + y, or a computed count) is cast and subtracted as a
+ * whole rather than only its first operand.
+ */
+#ifdef BITALIGN
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#define Zrotr(a, b) amd_bitalign((z)(a), (z)(a), (z)(32 - (b)))
+#else
+#define Zrotr(a, b) rotate((z)(a), (z)(b))
+#endif
+
+#ifdef BFI_INT /* ZCh / ZMa: three-input bit-mixing helpers used throughout the search kernel */
+#define ZCh(a, b, c) amd_bytealign(a, b, c) /* NOTE(review): cl_amd_media_ops documents amd_bytealign as a byte shift, not a bit select; presumably a placeholder rewritten to BFI_INT after compilation - confirm against the host code */
+#define ZMa(a, b, c) amd_bytealign((c ^ a), (b), (a)) /* same amd_bytealign form, with (c ^ a) as its first operand; no (z) casts on this path */
+#else /* portable path: standard OpenCL bitselect, every operand cast to z */
+#define ZCh(a, b, c) bitselect((z)c, (z)b, (z)a) /* per bit: b where a is 1, otherwise c (SHA-256 Ch) */
+#define ZMa(a, b, c) bitselect((z)a, (z)b, (z)c ^ (z)a) /* per bit: b where a and c differ, otherwise a (SHA-256 Maj) */
+#endif
+
+/*
+ * SHA-256 sigma functions. Zrotr rotates left, so every rotation count
+ * below is 32 minus the right-rotation amount given in FIPS 180-4; each
+ * macro is named after its first count.
+ *   ZR25: rotr 7  ^ rotr 18 ^ shr 3    (small sigma0)
+ *   ZR15: rotr 17 ^ rotr 19 ^ shr 10   (small sigma1)
+ *   ZR26: rotr 6  ^ rotr 11 ^ rotr 25  (big Sigma1)
+ *   ZR30: rotr 2  ^ rotr 13 ^ rotr 22  (big Sigma0)
+ */
+#define ZR25(n) (Zrotr((n), 25) ^ Zrotr((n), 14) ^ ((n) >> 3U))
+#define ZR15(n) (Zrotr((n), 15) ^ Zrotr((n), 13) ^ ((n) >> 10U))
+#define ZR26(n) (Zrotr((n), 26) ^ Zrotr((n), 21) ^ Zrotr((n), 7))
+#define ZR30(n) (Zrotr((n), 30) ^ Zrotr((n), 19) ^ Zrotr((n), 10))
+
+__kernel
+__attribute__((vec_type_hint(z)))
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+void search(
+#ifndef GOFFSET
+    const z base,
+#endif
+    const uint PreVal4_state0, const uint PreVal4_state0_k7,
+    const uint PreVal4_T1,
+    const uint W18, const uint W19,
+    const uint W16, const uint W17,
+    const uint W16_plus_K16, const uint W17_plus_K17,
+    const uint W31, const uint W32,
+    const uint d1, const uint b1, const uint c1,
+    const uint h1, const uint f1, const uint g1,
+    const uint c1_plus_k5, const uint b1_plus_k6,
+    const uint state0, const uint state1, const uint state2, const uint state3,
+    const uint state4, const uint state5, const uint state6, const uint state7,
+    __global uint * output)
+{
+
+  z ZA[930];
+
+#ifdef GOFFSET
+	const z Znonce = (uint)(get_global_id(0));
+#else
+	const z Znonce = base + (uint)(get_global_id(0));
+#endif
+
+    ZA[15] = Znonce + PreVal4_state0;
+    
+    ZA[16] = (ZCh(ZA[15], b1, c1) + d1) + ZR26(ZA[15]);
+    ZA[26] = Znonce + PreVal4_T1;
+    
+    ZA[27] = ZMa(f1, g1, ZA[26]) + ZR30(ZA[26]);
+    ZA[17] = ZA[16] + h1;
+    
+    ZA[19] = (ZCh(ZA[17], ZA[15], b1) + c1_plus_k5) + ZR26(ZA[17]);
+    ZA[28] = ZA[27] + ZA[16];
+    
+    ZA[548] = ZMa(ZA[26], f1, ZA[28]) + ZR30(ZA[28]);
+    ZA[20] = ZA[19] + g1;
+    
+    ZA[22] = (ZCh(ZA[20], ZA[17], ZA[15]) + b1_plus_k6) + ZR26(ZA[20]);
+    ZA[29] = ZA[548] + ZA[19];
+    
+    ZA[549] = ZMa(ZA[28], ZA[26], ZA[29]) + ZR30(ZA[29]);
+    ZA[23] = ZA[22] + f1;
+    
+    ZA[24] = ZCh(ZA[23], ZA[20], ZA[17]) + ZR26(ZA[23]);
+    ZA[180] = Znonce + PreVal4_state0_k7;
+    ZA[30] = ZA[549] + ZA[22];
+    
+    ZA[31] = ZMa(ZA[29], ZA[28], ZA[30]) + ZR30(ZA[30]);
+    ZA[181] = ZA[180] + ZA[24];
+    
+    ZA[182] = ZA[181] + ZA[26];
+    ZA[183] = ZA[181] + ZA[31];
+    ZA[18] = ZA[17] + 0xd807aa98U;
+    
+    ZA[186] = (ZCh(ZA[182], ZA[23], ZA[20]) + ZA[18]) + ZR26(ZA[182]);
+    ZA[184] = ZMa(ZA[30], ZA[29], ZA[183]) + ZR30(ZA[183]);
+    
+    ZA[187] = ZA[186] + ZA[28];
+    ZA[188] = ZA[186] + ZA[184];
+    ZA[21] = ZA[20] + 0x12835b01U;
+    
+    ZA[191] = (ZCh(ZA[187], ZA[182], ZA[23]) + ZA[21]) + ZR26(ZA[187]);
+    ZA[189] = ZMa(ZA[183], ZA[30], ZA[188]) + ZR30(ZA[188]);
+    
+    ZA[192] = ZA[191] + ZA[29];
+    ZA[193] = ZA[191] + ZA[189];
+    ZA[25] = ZA[23] + 0x243185beU;
+    
+    ZA[196] = (ZCh(ZA[192], ZA[187], ZA[182]) + ZA[25]) + ZR26(ZA[192]);
+    ZA[194] = ZMa(ZA[188], ZA[183], ZA[193]) + ZR30(ZA[193]);
+    
+    ZA[197] = ZA[196] + ZA[30];
+    ZA[198] = ZA[196] + ZA[194];
+    ZA[185] = ZA[182] + 0x550c7dc3U;
+    
+    ZA[201] = (ZCh(ZA[197], ZA[192], ZA[187]) + ZA[185]) + ZR26(ZA[197]);
+    ZA[199] = ZMa(ZA[193], ZA[188], ZA[198]) + ZR30(ZA[198]);
+    
+    ZA[202] = ZA[201] + ZA[183];
+    ZA[203] = ZA[201] + ZA[199];
+    ZA[190] = ZA[187] + 0x72be5d74U;
+    
+    ZA[206] = (ZCh(ZA[202], ZA[197], ZA[192]) + ZA[190]) + ZR26(ZA[202]);
+    ZA[204] = ZMa(ZA[198], ZA[193], ZA[203]) + ZR30(ZA[203]);
+    
+    ZA[207] = ZA[206] + ZA[188];
+    ZA[208] = ZA[206] + ZA[204];
+    ZA[195] = ZA[192] + 0x80deb1feU;
+    
+    ZA[211] = (ZCh(ZA[207], ZA[202], ZA[197]) + ZA[195]) + ZR26(ZA[207]);
+    ZA[209] = ZMa(ZA[203], ZA[198], ZA[208]) + ZR30(ZA[208]);
+    
+    ZA[212] = ZA[193] + ZA[211];
+    ZA[213] = ZA[211] + ZA[209];
+    ZA[200] = ZA[197] + 0x9bdc06a7U;
+    
+    ZA[216] = (ZCh(ZA[212], ZA[207], ZA[202]) + ZA[200]) + ZR26(ZA[212]);
+    ZA[214] = ZMa(ZA[208], ZA[203], ZA[213]) + ZR30(ZA[213]);
+    
+    ZA[217] = ZA[198] + ZA[216];
+    ZA[218] = ZA[216] + ZA[214];
+    ZA[205] = ZA[202] + 0xc19bf3f4U;
+    
+    ZA[220] = (ZCh(ZA[217], ZA[212], ZA[207]) + ZA[205]) + ZR26(ZA[217]);
+    ZA[219] = ZMa(ZA[213], ZA[208], ZA[218]) + ZR30(ZA[218]);
+    
+    ZA[222] = ZA[203] + ZA[220];
+    ZA[223] = ZA[220] + ZA[219];
+    ZA[210] = ZA[207] + W16_plus_K16;
+    
+    ZA[226] = (ZCh(ZA[222], ZA[217], ZA[212]) + ZA[210]) + ZR26(ZA[222]);
+    ZA[225] = ZMa(ZA[218], ZA[213], ZA[223]) + ZR30(ZA[223]);
+    
+    ZA[0] = ZR25(Znonce) + W18;
+    ZA[228] = ZA[226] + ZA[225];
+    ZA[227] = ZA[208] + ZA[226];
+    ZA[215] = ZA[212] + W17_plus_K17;
+    
+    ZA[231] = (ZCh(ZA[227], ZA[222], ZA[217]) + ZA[215]) + ZR26(ZA[227]);
+    ZA[229] = ZMa(ZA[223], ZA[218], ZA[228]) + ZR30(ZA[228]);
+    ZA[1] = ZA[0] + 0x0fc19dc6U;
+    
+    ZA[232] = ZA[213] + ZA[231];
+    ZA[233] = ZA[231] + ZA[229];
+    ZA[221] = ZA[217] + ZA[1];
+    ZA[32] = Znonce + W19;
+    
+    ZA[236] = (ZCh(ZA[232], ZA[227], ZA[222]) + ZA[221]) + ZR26(ZA[232]);
+    ZA[234] = ZMa(ZA[228], ZA[223], ZA[233]) + ZR30(ZA[233]);
+    ZA[33] = ZA[32] + 0x240ca1ccU;
+    
+    ZA[3] = ZR15(ZA[0]) + 0x80000000U;
+    ZA[238] = ZA[236] + ZA[234];
+    ZA[237] = ZA[218] + ZA[236];
+    ZA[224] = ZA[222] + ZA[33];
+    
+    ZA[241] = (ZCh(ZA[237], ZA[232], ZA[227]) + ZA[224]) + ZR26(ZA[237]);
+    ZA[239] = ZMa(ZA[233], ZA[228], ZA[238]) + ZR30(ZA[238]);
+    ZA[4] = ZA[3] + 0x2de92c6fU;
+    
+    ZA[35] = ZR15(ZA[32]);
+    ZA[243] = ZA[241] + ZA[239];
+    ZA[242] = ZA[223] + ZA[241];
+    ZA[230] = ZA[227] + ZA[4];
+    
+    ZA[246] = (ZCh(ZA[242], ZA[237], ZA[232]) + ZA[230]) + ZR26(ZA[242]);
+    ZA[244] = ZMa(ZA[238], ZA[233], ZA[243]) + ZR30(ZA[243]);
+    ZA[36] = ZA[35] + 0x4a7484aaU;
+    
+    ZA[7] = ZR15(ZA[3]) + 0x00000280U;
+    ZA[248] = ZA[246] + ZA[244];
+    ZA[247] = ZA[228] + ZA[246];
+    ZA[235] = ZA[232] + ZA[36];
+    
+    ZA[251] = (ZCh(ZA[247], ZA[242], ZA[237]) + ZA[235]) + ZR26(ZA[247]);
+    ZA[249] = ZMa(ZA[243], ZA[238], ZA[248]) + ZR30(ZA[248]);
+    ZA[8] = ZA[7] + 0x5cb0a9dcU;
+    
+    ZA[38] = ZR15(ZA[35]) + W16;
+    ZA[253] = ZA[251] + ZA[249];
+    ZA[252] = ZA[233] + ZA[251];
+    ZA[240] = ZA[237] + ZA[8];
+    
+    ZA[256] = (ZCh(ZA[252], ZA[247], ZA[242]) + ZA[240]) + ZR26(ZA[252]);
+    ZA[254] = ZMa(ZA[248], ZA[243], ZA[253]) + ZR30(ZA[253]);
+    ZA[40] = ZA[38] + 0x76f988daU;
+    
+    ZA[10] = ZR15(ZA[7]) + W17;
+    ZA[258] = ZA[256] + ZA[254];
+    ZA[257] = ZA[238] + ZA[256];
+    ZA[245] = ZA[242] + ZA[40];
+    
+    ZA[261] = (ZCh(ZA[257], ZA[252], ZA[247]) + ZA[245]) + ZR26(ZA[257]);
+    ZA[259] = ZMa(ZA[253], ZA[248], ZA[258]) + ZR30(ZA[258]);
+    ZA[13] = ZA[10] + 0x983e5152U;
+    
+    ZA[43] = ZR15(ZA[38]) + ZA[0];
+    ZA[263] = ZA[261] + ZA[259];
+    ZA[262] = ZA[243] + ZA[261];
+    ZA[250] = ZA[247] + ZA[13];
+    
+    ZA[266] = (ZCh(ZA[262], ZA[257], ZA[252]) + ZA[250]) + ZR26(ZA[262]);
+    ZA[264] = ZMa(ZA[258], ZA[253], ZA[263]) + ZR30(ZA[263]);
+    ZA[11] = ZR15(ZA[10]);
+    ZA[45] = ZA[43] + 0xa831c66dU;
+    
+    ZA[52] = ZA[11] + ZA[32];
+    ZA[267] = ZA[248] + ZA[266];
+    ZA[255] = ZA[252] + ZA[45];
+    ZA[268] = ZA[266] + ZA[264];
+    
+    ZA[271] = (ZCh(ZA[267], ZA[262], ZA[257]) + ZA[255]) + ZR26(ZA[267]);
+    ZA[269] = ZMa(ZA[263], ZA[258], ZA[268]) + ZR30(ZA[268]);
+    ZA[54] = ZA[52] + 0xb00327c8U;
+    
+    ZA[48] = ZR15(ZA[43]) + ZA[3];
+    ZA[273] = ZA[271] + ZA[269];
+    ZA[272] = ZA[253] + ZA[271];
+    ZA[260] = ZA[257] + ZA[54];
+    
+    ZA[276] = (ZCh(ZA[272], ZA[267], ZA[262]) + ZA[260]) + ZR26(ZA[272]);
+    ZA[274] = ZMa(ZA[268], ZA[263], ZA[273]) + ZR30(ZA[273]);
+    ZA[49] = ZA[48] + 0xbf597fc7U;
+    
+    ZA[61] = ZR15(ZA[52]) + ZA[35];
+    ZA[278] = ZA[276] + ZA[274];
+    ZA[277] = ZA[258] + ZA[276];
+    ZA[265] = ZA[262] + ZA[49];
+    
+    ZA[281] = (ZCh(ZA[277], ZA[272], ZA[267]) + ZA[265]) + ZR26(ZA[277]);
+    ZA[279] = ZMa(ZA[273], ZA[268], ZA[278]) + ZR30(ZA[278]);
+    ZA[62] = ZA[61] + 0xc6e00bf3U;
+    
+    ZA[53] = ZR15(ZA[48]) + ZA[7];
+    ZA[283] = ZA[281] + ZA[279];
+    ZA[282] = ZA[263] + ZA[281];
+    ZA[270] = ZA[267] + ZA[62];
+    
+    ZA[286] = (ZCh(ZA[282], ZA[277], ZA[272]) + ZA[270]) + ZR26(ZA[282]);
+    ZA[284] = ZMa(ZA[278], ZA[273], ZA[283]) + ZR30(ZA[283]);
+    ZA[39] = ZA[38] + 0x00A00055U;
+    ZA[55] = ZA[53] + 0xd5a79147U;
+    
+    ZA[66] = ZR15(ZA[61]) + ZA[39];
+    ZA[288] = ZA[286] + ZA[284];
+    ZA[287] = ZA[268] + ZA[286];
+    ZA[275] = ZA[272] + ZA[55];
+    
+    ZA[291] = (ZCh(ZA[287], ZA[282], ZA[277]) + ZA[275]) + ZR26(ZA[287]);
+    ZA[289] = ZMa(ZA[283], ZA[278], ZA[288]) + ZR30(ZA[288]);
+    ZA[12] = ZA[10] + W31;
+    ZA[68] = ZA[66] + 0x06ca6351U;
+    
+    ZA[67] = ZR15(ZA[53]) + ZA[12];
+    ZA[293] = ZA[291] + ZA[289];
+    ZA[292] = ZA[273] + ZA[291];
+    ZA[280] = ZA[277] + ZA[68];
+    
+    ZA[296] = (ZCh(ZA[292], ZA[287], ZA[282]) + ZA[280]) + ZR26(ZA[292]);
+    ZA[294] = ZMa(ZA[288], ZA[283], ZA[293]) + ZR30(ZA[293]);
+    ZA[2] = ZR25(ZA[0]);
+    ZA[69] = ZA[67] + 0x14292967U;
+    ZA[44] = ZA[43] + W32;
+    
+    ZA[75] = ZR15(ZA[66]) + ZA[44];
+    ZA[298] = ZA[296] + ZA[294];
+    ZA[297] = ZA[278] + ZA[296];
+    ZA[285] = ZA[282] + ZA[69];
+    ZA[5] = ZA[2] + W17;
+    
+    ZA[301] = (ZCh(ZA[297], ZA[292], ZA[287]) + ZA[285]) + ZR26(ZA[297]);
+    ZA[299] = ZMa(ZA[293], ZA[288], ZA[298]) + ZR30(ZA[298]);
+    ZA[56] = ZA[52] + ZA[5];
+    ZA[76] = ZA[75] + 0x27b70a85U;
+    
+    ZA[34] = ZR25(ZA[32]) + ZA[0];
+    ZA[70] = ZR15(ZA[67]) + ZA[56];
+    ZA[302] = ZA[283] + ZA[301];
+    ZA[303] = ZA[301] + ZA[299];
+    ZA[290] = ZA[287] + ZA[76];
+    
+    ZA[306] = (ZCh(ZA[302], ZA[297], ZA[292]) + ZA[290]) + ZR26(ZA[302]);
+    ZA[304] = ZMa(ZA[298], ZA[293], ZA[303]) + ZR30(ZA[303]);
+    ZA[6] = ZR25(ZA[3]);
+    ZA[77] = ZA[70] + 0x2e1b2138U;
+    ZA[50] = ZA[34] + ZA[48];
+    
+    ZA[78] = ZR15(ZA[75]) + ZA[50];
+    ZA[308] = ZA[306] + ZA[304];
+    ZA[307] = ZA[288] + ZA[306];
+    ZA[295] = ZA[292] + ZA[77];
+    ZA[41] = ZA[32] + ZA[6];
+    
+    ZA[311] = (ZCh(ZA[307], ZA[302], ZA[297]) + ZA[295]) + ZR26(ZA[307]);
+    ZA[309] = ZMa(ZA[303], ZA[298], ZA[308]) + ZR30(ZA[308]);
+    ZA[63] = ZA[41] + ZA[61];
+    ZA[85] = ZA[78] + 0x4d2c6dfcU;
+    
+    ZA[37] = ZR25(ZA[35]) + ZA[3];
+    ZA[79] = ZR15(ZA[70]) + ZA[63];
+    ZA[312] = ZA[293] + ZA[311];
+    ZA[313] = ZA[311] + ZA[309];
+    ZA[300] = ZA[297] + ZA[85];
+    
+    ZA[316] = (ZCh(ZA[312], ZA[307], ZA[302]) + ZA[300]) + ZR26(ZA[312]);
+    ZA[314] = ZMa(ZA[308], ZA[303], ZA[313]) + ZR30(ZA[313]);
+    ZA[9] = ZR25(ZA[7]);
+    ZA[86] = ZA[79] + 0x53380d13U;
+    ZA[57] = ZA[37] + ZA[53];
+    
+    ZA[87] = ZR15(ZA[78]) + ZA[57];
+    ZA[318] = ZA[316] + ZA[314];
+    ZA[317] = ZA[298] + ZA[316];
+    ZA[305] = ZA[302] + ZA[86];
+    ZA[46] = ZA[35] + ZA[9];
+    
+    ZA[321] = (ZCh(ZA[317], ZA[312], ZA[307]) + ZA[305]) + ZR26(ZA[317]);
+    ZA[319] = ZMa(ZA[313], ZA[308], ZA[318]) + ZR30(ZA[318]);
+    ZA[71] = ZA[46] + ZA[66];
+    ZA[92] = ZA[87] + 0x650a7354U;
+    
+    ZA[42] = ZR25(ZA[38]) + ZA[7];
+    ZA[88] = ZR15(ZA[79]) + ZA[71];
+    ZA[322] = ZA[303] + ZA[321];
+    ZA[323] = ZA[321] + ZA[319];
+    ZA[310] = ZA[307] + ZA[92];
+    
+    ZA[326] = (ZCh(ZA[322], ZA[317], ZA[312]) + ZA[310]) + ZR26(ZA[322]);
+    ZA[324] = ZMa(ZA[318], ZA[313], ZA[323]) + ZR30(ZA[323]);
+    ZA[14] = ZR25(ZA[10]);
+    ZA[93] = ZA[88] + 0x766a0abbU;
+    ZA[72] = ZA[42] + ZA[67];
+    
+    ZA[94] = ZR15(ZA[87]) + ZA[72];
+    ZA[328] = ZA[326] + ZA[324];
+    ZA[327] = ZA[308] + ZA[326];
+    ZA[315] = ZA[312] + ZA[93];
+    ZA[51] = ZA[38] + ZA[14];
+    
+    ZA[331] = (ZCh(ZA[327], ZA[322], ZA[317]) + ZA[315]) + ZR26(ZA[327]);
+    ZA[329] = ZMa(ZA[323], ZA[318], ZA[328]) + ZR30(ZA[328]);
+    ZA[80] = ZA[51] + ZA[75];
+    ZA[100] = ZA[94] + 0x81c2c92eU;
+    
+    ZA[47] = ZR25(ZA[43]) + ZA[10];
+    ZA[95] = ZR15(ZA[88]) + ZA[80];
+    ZA[332] = ZA[313] + ZA[331];
+    ZA[333] = ZA[331] + ZA[329];
+    ZA[320] = ZA[317] + ZA[100];
+    
+    ZA[336] = (ZCh(ZA[332], ZA[327], ZA[322]) + ZA[320]) + ZR26(ZA[332]);
+    ZA[334] = ZMa(ZA[328], ZA[323], ZA[333]) + ZR30(ZA[333]);
+    ZA[81] = ZA[47] + ZA[70];
+    ZA[101] = ZA[95] + 0x92722c85U;
+    
+    ZA[58] = ZR25(ZA[52]) + ZA[43];
+    ZA[102] = ZR15(ZA[94]) + ZA[81];
+    ZA[337] = ZA[318] + ZA[336];
+    ZA[338] = ZA[336] + ZA[334];
+    ZA[325] = ZA[322] + ZA[101];
+    
+    ZA[341] = (ZCh(ZA[337], ZA[332], ZA[327]) + ZA[325]) + ZR26(ZA[337]);
+    ZA[339] = ZMa(ZA[333], ZA[328], ZA[338]) + ZR30(ZA[338]);
+    ZA[89] = ZA[58] + ZA[78];
+    ZA[108] = ZA[102] + 0xa2bfe8a1U;
+    
+    ZA[59] = ZR25(ZA[48]) + ZA[52];
+    ZA[103] = ZR15(ZA[95]) + ZA[89];
+    ZA[342] = ZA[323] + ZA[341];
+    ZA[343] = ZA[341] + ZA[339];
+    ZA[330] = ZA[327] + ZA[108];
+    
+    ZA[346] = (ZCh(ZA[342], ZA[337], ZA[332]) + ZA[330]) + ZR26(ZA[342]);
+    ZA[344] = ZMa(ZA[338], ZA[333], ZA[343]) + ZR30(ZA[343]);
+    ZA[90] = ZA[59] + ZA[79];
+    ZA[109] = ZA[103] + 0xa81a664bU;
+    
+    ZA[64] = ZR25(ZA[61]) + ZA[48];
+    ZA[110] = ZR15(ZA[102]) + ZA[90];
+    ZA[347] = ZA[328] + ZA[346];
+    ZA[348] = ZA[346] + ZA[344];
+    ZA[335] = ZA[332] + ZA[109];
+    
+    ZA[351] = (ZCh(ZA[347], ZA[342], ZA[337]) + ZA[335]) + ZR26(ZA[347]);
+    ZA[349] = ZMa(ZA[343], ZA[338], ZA[348]) + ZR30(ZA[348]);
+    ZA[60] = ZR25(ZA[53]);
+    ZA[116] = ZA[110] + 0xc24b8b70U;
+    ZA[96] = ZA[87] + ZA[64];
+    
+    ZA[111] = ZR15(ZA[103]) + ZA[96];
+    ZA[353] = ZA[351] + ZA[349];
+    ZA[352] = ZA[333] + ZA[351];
+    ZA[340] = ZA[337] + ZA[116];
+    ZA[65] = ZA[60] + ZA[61];
+    
+    ZA[356] = (ZCh(ZA[352], ZA[347], ZA[342]) + ZA[340]) + ZR26(ZA[352]);
+    ZA[354] = ZMa(ZA[348], ZA[343], ZA[353]) + ZR30(ZA[353]);
+    ZA[97] = ZA[88] + ZA[65];
+    ZA[117] = ZA[111] + 0xc76c51a3U;
+    
+    ZA[73] = ZR25(ZA[66]) + ZA[53];
+    ZA[118] = ZR15(ZA[110]) + ZA[97];
+    ZA[357] = ZA[338] + ZA[356];
+    ZA[358] = ZA[356] + ZA[354];
+    ZA[345] = ZA[342] + ZA[117];
+    
+    ZA[361] = (ZCh(ZA[357], ZA[352], ZA[347]) + ZA[345]) + ZR26(ZA[357]);
+    ZA[359] = ZMa(ZA[353], ZA[348], ZA[358]) + ZR30(ZA[358]);
+    ZA[104] = ZA[73] + ZA[94];
+    ZA[124] = ZA[118] + 0xd192e819U;
+    
+    ZA[74] = ZR25(ZA[67]) + ZA[66];
+    ZA[119] = ZR15(ZA[111]) + ZA[104];
+    ZA[362] = ZA[343] + ZA[361];
+    ZA[363] = ZA[361] + ZA[359];
+    ZA[350] = ZA[347] + ZA[124];
+    
+    ZA[366] = (ZCh(ZA[362], ZA[357], ZA[352]) + ZA[350]) + ZR26(ZA[362]);
+    ZA[364] = ZMa(ZA[358], ZA[353], ZA[363]) + ZR30(ZA[363]);
+    ZA[105] = ZA[74] + ZA[95];
+    ZA[125] = ZA[119] + 0xd6990624U;
+    
+    ZA[82] = ZR25(ZA[75]) + ZA[67];
+    ZA[126] = ZR15(ZA[118]) + ZA[105];
+    ZA[367] = ZA[348] + ZA[366];
+    ZA[368] = ZA[366] + ZA[364];
+    ZA[355] = ZA[352] + ZA[125];
+    
+    ZA[371] = (ZCh(ZA[367], ZA[362], ZA[357]) + ZA[355]) + ZR26(ZA[367]);
+    ZA[369] = ZMa(ZA[363], ZA[358], ZA[368]) + ZR30(ZA[368]);
+    ZA[112] = ZA[102] + ZA[82];
+    ZA[132] = ZA[126] + 0xf40e3585U;
+    
+    ZA[83] = ZR25(ZA[70]) + ZA[75];
+    ZA[127] = ZR15(ZA[119]) + ZA[112];
+    ZA[372] = ZA[353] + ZA[371];
+    ZA[373] = ZA[371] + ZA[369];
+    ZA[360] = ZA[357] + ZA[132];
+    
+    ZA[376] = (ZCh(ZA[372], ZA[367], ZA[362]) + ZA[360]) + ZR26(ZA[372]);
+    ZA[374] = ZMa(ZA[368], ZA[363], ZA[373]) + ZR30(ZA[373]);
+    ZA[113] = ZA[103] + ZA[83];
+    ZA[133] = ZA[127] + 0x106aa070U;
+    
+    ZA[84] = ZR25(ZA[78]) + ZA[70];
+    ZA[134] = ZR15(ZA[126]) + ZA[113];
+    ZA[377] = ZA[358] + ZA[376];
+    ZA[378] = ZA[376] + ZA[374];
+    ZA[365] = ZA[362] + ZA[133];
+    
+    ZA[381] = (ZCh(ZA[377], ZA[372], ZA[367]) + ZA[365]) + ZR26(ZA[377]);
+    ZA[379] = ZMa(ZA[373], ZA[368], ZA[378]) + ZR30(ZA[378]);
+    ZA[120] = ZA[110] + ZA[84];
+    ZA[140] = ZA[134] + 0x19a4c116U;
+    
+    ZA[91] = ZR25(ZA[79]) + ZA[78];
+    ZA[135] = ZR15(ZA[127]) + ZA[120];
+    ZA[382] = ZA[363] + ZA[381];
+    ZA[383] = ZA[381] + ZA[379];
+    ZA[370] = ZA[367] + ZA[140];
+    
+    ZA[386] = (ZCh(ZA[382], ZA[377], ZA[372]) + ZA[370]) + ZR26(ZA[382]);
+    ZA[384] = ZMa(ZA[378], ZA[373], ZA[383]) + ZR30(ZA[383]);
+    ZA[121] = ZA[111] + ZA[91];
+    ZA[141] = ZA[135] + 0x1e376c08U;
+    
+    ZA[98] = ZR25(ZA[87]) + ZA[79];
+    ZA[142] = ZR15(ZA[134]) + ZA[121];
+    ZA[387] = ZA[368] + ZA[386];
+    ZA[388] = ZA[386] + ZA[384];
+    ZA[375] = ZA[372] + ZA[141];
+    
+    ZA[391] = (ZCh(ZA[387], ZA[382], ZA[377]) + ZA[375]) + ZR26(ZA[387]);
+    ZA[389] = ZMa(ZA[383], ZA[378], ZA[388]) + ZR30(ZA[388]);
+    ZA[128] = ZA[118] + ZA[98];
+    ZA[147] = ZA[142] + 0x2748774cU;
+    
+    ZA[99] = ZR25(ZA[88]) + ZA[87];
+    ZA[143] = ZR15(ZA[135]) + ZA[128];
+    ZA[392] = ZA[373] + ZA[391];
+    ZA[393] = ZA[391] + ZA[389];
+    ZA[380] = ZA[377] + ZA[147];
+    
+    ZA[396] = (ZCh(ZA[392], ZA[387], ZA[382]) + ZA[380]) + ZR26(ZA[392]);
+    ZA[394] = ZMa(ZA[388], ZA[383], ZA[393]) + ZR30(ZA[393]);
+    ZA[129] = ZA[119] + ZA[99];
+    ZA[148] = ZA[143] + 0x34b0bcb5U;
+    
+    ZA[106] = ZR25(ZA[94]) + ZA[88];
+    ZA[149] = ZR15(ZA[142]) + ZA[129];
+    ZA[397] = ZA[378] + ZA[396];
+    ZA[398] = ZA[396] + ZA[394];
+    ZA[385] = ZA[382] + ZA[148];
+    
+    ZA[401] = (ZCh(ZA[397], ZA[392], ZA[387]) + ZA[385]) + ZR26(ZA[397]);
+    ZA[399] = ZMa(ZA[393], ZA[388], ZA[398]) + ZR30(ZA[398]);
+    ZA[136] = ZA[126] + ZA[106];
+    ZA[153] = ZA[149] + 0x391c0cb3U;
+    
+    ZA[107] = ZR25(ZA[95]) + ZA[94];
+    ZA[150] = ZR15(ZA[143]) + ZA[136];
+    ZA[402] = ZA[383] + ZA[401];
+    ZA[403] = ZA[401] + ZA[399];
+    ZA[390] = ZA[387] + ZA[153];
+    
+    ZA[406] = (ZCh(ZA[402], ZA[397], ZA[392]) + ZA[390]) + ZR26(ZA[402]);
+    ZA[404] = ZMa(ZA[398], ZA[393], ZA[403]) + ZR30(ZA[403]);
+    ZA[137] = ZA[127] + ZA[107];
+    ZA[154] = ZA[150] + 0x4ed8aa4aU;
+    
+    ZA[114] = ZR25(ZA[102]) + ZA[95];
+    ZA[155] = ZR15(ZA[149]) + ZA[137];
+    ZA[407] = ZA[388] + ZA[406];
+    ZA[408] = ZA[406] + ZA[404];
+    ZA[395] = ZA[392] + ZA[154];
+    
+    ZA[411] = (ZCh(ZA[407], ZA[402], ZA[397]) + ZA[395]) + ZR26(ZA[407]);
+    ZA[409] = ZMa(ZA[403], ZA[398], ZA[408]) + ZR30(ZA[408]);
+    ZA[144] = ZA[134] + ZA[114];
+    ZA[159] = ZA[155] + 0x5b9cca4fU;
+    
+    ZA[115] = ZR25(ZA[103]) + ZA[102];
+    ZA[156] = ZR15(ZA[150]) + ZA[144];
+    ZA[412] = ZA[393] + ZA[411];
+    ZA[413] = ZA[411] + ZA[409];
+    ZA[400] = ZA[397] + ZA[159];
+    
+    ZA[416] = (ZCh(ZA[412], ZA[407], ZA[402]) + ZA[400]) + ZR26(ZA[412]);
+    ZA[414] = ZMa(ZA[408], ZA[403], ZA[413]) + ZR30(ZA[413]);
+    ZA[145] = ZA[135] + ZA[115];
+    ZA[160] = ZA[156] + 0x682e6ff3U;
+    
+    ZA[122] = ZR25(ZA[110]) + ZA[103];
+    ZA[161] = ZR15(ZA[155]) + ZA[145];
+    ZA[417] = ZA[398] + ZA[416];
+    ZA[418] = ZA[416] + ZA[414];
+    ZA[405] = ZA[402] + ZA[160];
+    
+    ZA[421] = (ZCh(ZA[417], ZA[412], ZA[407]) + ZA[405]) + ZR26(ZA[417]);
+    ZA[419] = ZMa(ZA[413], ZA[408], ZA[418]) + ZR30(ZA[418]);
+    ZA[151] = ZA[142] + ZA[122];
+    ZA[165] = ZA[161] + 0x748f82eeU;
+    
+    ZA[123] = ZR25(ZA[111]) + ZA[110];
+    ZA[162] = ZR15(ZA[156]) + ZA[151];
+    ZA[422] = ZA[403] + ZA[421];
+    ZA[423] = ZA[421] + ZA[419];
+    ZA[410] = ZA[407] + ZA[165];
+    
+    ZA[426] = (ZCh(ZA[422], ZA[417], ZA[412]) + ZA[410]) + ZR26(ZA[422]);
+    ZA[424] = ZMa(ZA[418], ZA[413], ZA[423]) + ZR30(ZA[423]);
+    ZA[152] = ZA[143] + ZA[123];
+    ZA[166] = ZA[162] + 0x78a5636fU;
+    
+    ZA[130] = ZR25(ZA[118]) + ZA[111];
+    ZA[167] = ZR15(ZA[161]) + ZA[152];
+    ZA[427] = ZA[408] + ZA[426];
+    ZA[428] = ZA[426] + ZA[424];
+    ZA[415] = ZA[412] + ZA[166];
+    
+    ZA[431] = (ZCh(ZA[427], ZA[422], ZA[417]) + ZA[415]) + ZR26(ZA[427]);
+    ZA[429] = ZMa(ZA[423], ZA[418], ZA[428]) + ZR30(ZA[428]);
+    ZA[157] = ZA[149] + ZA[130];
+    ZA[170] = ZA[167] + 0x84c87814U;
+    
+    ZA[131] = ZR25(ZA[119]) + ZA[118];
+    ZA[168] = ZR15(ZA[162]) + ZA[157];
+    ZA[432] = ZA[413] + ZA[431];
+    ZA[433] = ZA[431] + ZA[429];
+    ZA[420] = ZA[417] + ZA[170];
+    
+    ZA[436] = (ZCh(ZA[432], ZA[427], ZA[422]) + ZA[420]) + ZR26(ZA[432]);
+    ZA[434] = ZMa(ZA[428], ZA[423], ZA[433]) + ZR30(ZA[433]);
+    ZA[158] = ZA[150] + ZA[131];
+    ZA[171] = ZA[168] + 0x8cc70208U;
+    
+    ZA[138] = ZR25(ZA[126]) + ZA[119];
+    ZA[172] = ZR15(ZA[167]) + ZA[158];
+    ZA[437] = ZA[418] + ZA[436];
+    ZA[438] = ZA[436] + ZA[434];
+    ZA[425] = ZA[422] + ZA[171];
+    
+    ZA[441] = (ZCh(ZA[437], ZA[432], ZA[427]) + ZA[425]) + ZR26(ZA[437]);
+    ZA[439] = ZMa(ZA[433], ZA[428], ZA[438]) + ZR30(ZA[438]);
+    ZA[163] = ZA[155] + ZA[138];
+    ZA[174] = ZA[172] + 0x90befffaU;
+    
+    ZA[139] = ZR25(ZA[127]) + ZA[126];
+    ZA[173] = ZR15(ZA[168]) + ZA[163];
+    ZA[442] = ZA[423] + ZA[441];
+    ZA[443] = ZA[441] + ZA[439];
+    ZA[430] = ZA[427] + ZA[174];
+    
+    ZA[445] = (ZCh(ZA[442], ZA[437], ZA[432]) + ZA[430]) + ZR26(ZA[442]);
+    ZA[444] = ZMa(ZA[438], ZA[433], ZA[443]) + ZR30(ZA[443]);
+    ZA[164] = ZA[156] + ZA[139];
+    ZA[175] = ZA[173] + 0xa4506cebU;
+    
+    ZA[146] = ZR25(ZA[134]) + ZA[127];
+    ZA[176] = ZR15(ZA[172]) + ZA[164];
+    ZA[446] = ZA[428] + ZA[445];
+    ZA[447] = ZA[445] + ZA[444];
+    ZA[435] = ZA[432] + ZA[175];
+    
+    ZA[449] = (ZCh(ZA[446], ZA[442], ZA[437]) + ZA[435]) + ZR26(ZA[446]);
+    ZA[448] = ZMa(ZA[443], ZA[438], ZA[447]) + ZR30(ZA[447]);
+    ZA[169] = ZA[161] + ZA[146];
+    ZA[178] = ZA[176] + 0xbef9a3f7U;
+    
+    ZA[177] = ZR15(ZA[173]) + ZA[169];
+    ZA[451] = ZA[449] + ZA[448];
+    ZA[450] = ZA[433] + ZA[449];
+    ZA[440] = ZA[437] + ZA[178];
+    
+    ZA[453] = (ZCh(ZA[450], ZA[446], ZA[442]) + ZA[440]) + ZR26(ZA[450]);
+    ZA[452] = ZMa(ZA[447], ZA[443], ZA[451]) + ZR30(ZA[451]);
+    ZA[179] = ZA[177] + 0xc67178f2U;
+    
+    ZA[454] = ZA[438] + ZA[453];
+    ZA[494] = ZA[442] + ZA[179];
+    ZA[455] = ZA[453] + ZA[452];
+    
+    ZA[457] = (ZCh(ZA[454], ZA[450], ZA[446]) + ZA[494]) + ZR26(ZA[454]);
+    ZA[456] = ZMa(ZA[451], ZA[447], ZA[455]) + ZR30(ZA[455]);
+    
+    ZA[459] = ZA[457] + ZA[456];
+    
+    ZA[461] = ZA[455] + state1;
+    ZA[460] = ZA[459] + state0;
+    
+    ZA[495] = ZA[460] + 0x98c7e2a2U;
+    ZA[469] = ZA[461] + 0x90bb1e3cU;
+    
+    ZA[498] = (ZCh(ZA[495], 0x510e527fU, 0x9b05688cU) + ZA[469]) + ZR26(ZA[495]);
+    ZA[462] = ZA[451] + state2;
+    
+    ZA[496] = ZA[460] + 0xfc08884dU;
+    ZA[506] = ZA[498] + 0x3c6ef372U;
+    ZA[470] = ZA[462] + 0x50c6645bU;
+    
+    ZA[507] = (ZCh(ZA[506], ZA[495], 0x510e527fU) + ZA[470]) + ZR26(ZA[506]);
+    ZA[500] = ZMa(0x6a09e667U, 0xbb67ae85U, ZA[496]) + ZR30(ZA[496]);
+    ZA[463] = ZA[447] + state3;
+    
+    ZA[458] = ZA[443] + ZA[457];
+    ZA[499] = ZA[498] + ZA[500];
+    ZA[508] = ZA[507] + 0xbb67ae85U;
+    ZA[473] = ZA[463] + 0x3ac42e24U;
+    
+    ZA[510] = (ZCh(ZA[508], ZA[506], ZA[495]) + ZA[473]) + ZR26(ZA[508]);
+    ZA[928] = ZMa(ZA[496], 0x6a09e667U, ZA[499]) + ZR30(ZA[499]);
+    ZA[464] = ZA[458] + state4;
+    
+    ZA[476] = ZA[464] + ZA[460] + 0xd21ea4fdU;
+    ZA[511] = ZA[510] + 0x6a09e667U;
+    ZA[509] = ZA[928] + ZA[507];
+    ZA[465] = ZA[454] + state5;
+    
+    ZA[514] = (ZCh(ZA[511], ZA[508], ZA[506]) + ZA[476]) + ZR26(ZA[511]);
+    ZA[512] = ZMa(ZA[499], ZA[496], ZA[509]) + ZR30(ZA[509]);
+    ZA[478] = ZA[465] + 0x59f111f1U;
+    
+    ZA[519] = ZA[506] + ZA[478];
+    ZA[516] = ZA[496] + ZA[514];
+    ZA[513] = ZA[510] + ZA[512];
+    ZA[466] = ZA[450] + state6;
+    
+    ZA[520] = (ZCh(ZA[516], ZA[511], ZA[508]) + ZA[519]) + ZR26(ZA[516]);
+    ZA[515] = ZMa(ZA[509], ZA[499], ZA[513]) + ZR30(ZA[513]);
+    ZA[480] = ZA[466] + 0x923f82a4U;
+    
+    ZA[524] = ZA[508] + ZA[480];
+    ZA[521] = ZA[499] + ZA[520];
+    ZA[517] = ZA[514] + ZA[515];
+    ZA[467] = ZA[446] + state7;
+    
+    ZA[525] = (ZCh(ZA[521], ZA[516], ZA[511]) + ZA[524]) + ZR26(ZA[521]);
+    ZA[522] = ZMa(ZA[513], ZA[509], ZA[517]) + ZR30(ZA[517]);
+    ZA[484] = ZA[467] + 0xab1c5ed5U;
+    
+    ZA[529] = ZA[511] + ZA[484];
+    ZA[526] = ZA[509] + ZA[525];
+    ZA[523] = ZA[520] + ZA[522];
+    
+    ZA[530] = (ZCh(ZA[526], ZA[521], ZA[516]) + ZA[529]) + ZR26(ZA[526]);
+    ZA[550] = ZMa(ZA[517], ZA[513], ZA[523]) + ZR30(ZA[523]);
+    
+    ZA[531] = ZA[513] + ZA[530];
+    ZA[533] = ZA[516] + 0x5807aa98U;
+    ZA[527] = ZA[550] + ZA[525];
+    
+    ZA[534] = (ZCh(ZA[531], ZA[526], ZA[521]) + ZA[533]) + ZR26(ZA[531]);
+    ZA[551] = ZMa(ZA[523], ZA[517], ZA[527]) + ZR30(ZA[527]);
+    
+    ZA[535] = ZA[517] + ZA[534];
+    ZA[538] = ZA[521] + 0x12835b01U;
+    ZA[532] = ZA[551] + ZA[530];
+    
+    ZA[539] = (ZCh(ZA[535], ZA[531], ZA[526]) + ZA[538]) + ZR26(ZA[535]);
+    ZA[552] = ZMa(ZA[527], ZA[523], ZA[532]) + ZR30(ZA[532]);
+    
+    ZA[540] = ZA[523] + ZA[539];
+    ZA[542] = ZA[526] + 0x243185beU;
+    ZA[536] = ZA[552] + ZA[534];
+    
+    ZA[543] = (ZCh(ZA[540], ZA[535], ZA[531]) + ZA[542]) + ZR26(ZA[540]);
+    ZA[553] = ZMa(ZA[532], ZA[527], ZA[536]) + ZR30(ZA[536]);
+    
+    ZA[544] = ZA[527] + ZA[543];
+    ZA[555] = ZA[531] + 0x550c7dc3U;
+    ZA[541] = ZA[553] + ZA[539];
+    
+    ZA[558] = (ZCh(ZA[544], ZA[540], ZA[535]) + ZA[555]) + ZR26(ZA[544]);
+    ZA[547] = ZMa(ZA[536], ZA[532], ZA[541]) + ZR30(ZA[541]);
+    
+    ZA[559] = ZA[532] + ZA[558];
+    ZA[556] = ZA[535] + 0x72be5d74U;
+    ZA[545] = ZA[547] + ZA[543];
+    
+    ZA[562] = (ZCh(ZA[559], ZA[544], ZA[540]) + ZA[556]) + ZR26(ZA[559]);
+    ZA[561] = ZMa(ZA[541], ZA[536], ZA[545]) + ZR30(ZA[545]);
+    
+    ZA[563] = ZA[536] + ZA[562];
+    ZA[560] = ZA[561] + ZA[558];
+    ZA[557] = ZA[540] + 0x80deb1feU;
+    
+    ZA[568] = (ZCh(ZA[563], ZA[559], ZA[544]) + ZA[557]) + ZR26(ZA[563]);
+    ZA[564] = ZMa(ZA[545], ZA[541], ZA[560]) + ZR30(ZA[560]);
+    
+    ZA[569] = ZA[541] + ZA[568];
+    ZA[572] = ZA[544] + 0x9bdc06a7U;
+    ZA[565] = ZA[562] + ZA[564];
+    
+    ZA[574] = (ZCh(ZA[569], ZA[563], ZA[559]) + ZA[572]) + ZR26(ZA[569]);
+    ZA[570] = ZMa(ZA[560], ZA[545], ZA[565]) + ZR30(ZA[565]);
+    ZA[468] = ZR25(ZA[461]);
+    
+    ZA[497] = ZA[468] + ZA[460];
+    ZA[575] = ZA[545] + ZA[574];
+    ZA[571] = ZA[568] + ZA[570];
+    ZA[573] = ZA[559] + 0xc19bf274U;
+    
+    ZA[578] = (ZCh(ZA[575], ZA[569], ZA[563]) + ZA[573]) + ZR26(ZA[575]);
+    ZA[576] = ZMa(ZA[565], ZA[560], ZA[571]) + ZR30(ZA[571]);
+    ZA[929] = ZR25(ZA[462]);
+    ZA[503] = ZA[497] + 0xe49b69c1U;
+    
+    ZA[471] = ZA[929] + ZA[461] + 0x00a00000U;
+    ZA[582] = ZA[563] + ZA[503];
+    ZA[579] = ZA[560] + ZA[578];
+    ZA[577] = ZA[574] + ZA[576];
+    
+    ZA[583] = (ZCh(ZA[579], ZA[575], ZA[569]) + ZA[582]) + ZR26(ZA[579]);
+    ZA[580] = ZMa(ZA[571], ZA[565], ZA[577]) + ZR30(ZA[577]);
+    ZA[488] = ZA[471] + 0xefbe4786U;
+    
+    ZA[472] = ZR25(ZA[463]) + ZA[462];
+    ZA[587] = ZA[569] + ZA[488];
+    ZA[584] = ZA[565] + ZA[583];
+    ZA[581] = ZA[578] + ZA[580];
+    
+    ZA[588] = (ZCh(ZA[584], ZA[579], ZA[575]) + ZA[587]) + ZR26(ZA[584]);
+    ZA[586] = ZMa(ZA[577], ZA[571], ZA[581]) + ZR30(ZA[581]);
+    ZA[501] = ZR15(ZA[497]) + ZA[472];
+    ZA[475] = ZR15(ZA[471]);
+    ZA[926] = ZA[575] + 0x0fc19dc6U;
+    
+    ZA[474] = ZA[475] + ZA[463] + ZR25(ZA[464]);
+    ZA[927] = ZA[926] + ZA[501];
+    ZA[589] = ZA[571] + ZA[588];
+    ZA[585] = ZA[583] + ZA[586];
+    
+    ZA[592] = (ZCh(ZA[589], ZA[584], ZA[579]) + ZA[927]) + ZR26(ZA[589]);
+    ZA[590] = ZMa(ZA[581], ZA[577], ZA[585]) + ZR30(ZA[585]);
+    ZA[477] = ZR25(ZA[465]) + ZA[464];
+    ZA[489] = ZA[474] + 0x240ca1ccU;
+    
+    ZA[518] = ZR15(ZA[501]) + ZA[477];
+    ZA[479] = ZR25(ZA[466]);
+    ZA[596] = ZA[579] + ZA[489];
+    ZA[593] = ZA[577] + ZA[592];
+    ZA[591] = ZA[588] + ZA[590];
+    
+    ZA[597] = (ZCh(ZA[593], ZA[589], ZA[584]) + ZA[596]) + ZR26(ZA[593]);
+    ZA[594] = ZMa(ZA[585], ZA[581], ZA[591]) + ZR30(ZA[591]);
+    ZA[481] = ZA[479] + ZA[465];
+    ZA[601] = ZA[518] + 0x2de92c6fU;
+    
+    ZA[482] = ZR15(ZA[474]) + ZA[481];
+    ZA[602] = ZA[584] + ZA[601];
+    ZA[598] = ZA[581] + ZA[597];
+    ZA[595] = ZA[592] + ZA[594];
+    
+    ZA[632] = (ZCh(ZA[598], ZA[593], ZA[589]) + ZA[602]) + ZR26(ZA[598]);
+    ZA[599] = ZMa(ZA[591], ZA[585], ZA[595]) + ZR30(ZA[595]);
+    ZA[483] = ZA[466] + 0x00000100U + ZR25(ZA[467]);
+    ZA[490] = ZA[482] + 0x4a7484aaU;
+    
+    ZA[528] = ZR15(ZA[518]) + ZA[483];
+    ZA[736] = ZA[585] + ZA[632];
+    ZA[605] = ZA[589] + ZA[490];
+    ZA[600] = ZA[597] + ZA[599];
+    ZA[485] = ZA[467] + 0x11002000U;
+    
+    ZA[738] = (ZCh(ZA[736], ZA[598], ZA[593]) + ZA[605]) + ZR26(ZA[736]);
+    ZA[744] = ZMa(ZA[595], ZA[591], ZA[600]) + ZR30(ZA[600]);
+    ZA[487] = ZR15(ZA[482]) + ZA[485];
+    ZA[603] = ZA[528] + 0x5cb0a9dcU;
+    
+    ZA[502] = ZA[497] + ZA[487];
+    ZA[739] = ZA[591] + ZA[738];
+    ZA[604] = ZA[593] + ZA[603];
+    ZA[737] = ZA[744] + ZA[632];
+    
+    ZA[741] = (ZCh(ZA[739], ZA[736], ZA[598]) + ZA[604]) + ZR26(ZA[739]);
+    ZA[745] = ZMa(ZA[600], ZA[595], ZA[737]) + ZR30(ZA[737]);
+    ZA[486] = ZA[471] + 0x80000000U;
+    ZA[606] = ZA[502] + 0x76f988daU;
+    
+    ZA[537] = ZR15(ZA[528]) + ZA[486];
+    ZA[742] = ZA[595] + ZA[741];
+    ZA[613] = ZA[598] + ZA[606];
+    ZA[740] = ZA[745] + ZA[738];
+    
+    ZA[747] = (ZCh(ZA[742], ZA[739], ZA[736]) + ZA[613]) + ZR26(ZA[742]);
+    ZA[746] = ZMa(ZA[737], ZA[600], ZA[740]) + ZR30(ZA[740]);
+    ZA[607] = ZA[537] + 0x983e5152U;
+    
+    ZA[546] = ZR15(ZA[502]) + ZA[501];
+    ZA[751] = ZA[736] + ZA[607];
+    ZA[748] = ZA[600] + ZA[747];
+    ZA[743] = ZA[746] + ZA[741];
+    
+    ZA[752] = (ZCh(ZA[748], ZA[742], ZA[739]) + ZA[751]) + ZR26(ZA[748]);
+    ZA[749] = ZMa(ZA[740], ZA[737], ZA[743]) + ZR30(ZA[743]);
+    ZA[608] = ZA[546] + 0xa831c66dU;
+    
+    ZA[554] = ZR15(ZA[537]) + ZA[474];
+    ZA[756] = ZA[739] + ZA[608];
+    ZA[753] = ZA[737] + ZA[752];
+    ZA[750] = ZA[747] + ZA[749];
+    
+    ZA[757] = (ZCh(ZA[753], ZA[748], ZA[742]) + ZA[756]) + ZR26(ZA[753]);
+    ZA[754] = ZMa(ZA[743], ZA[740], ZA[750]) + ZR30(ZA[750]);
+    ZA[609] = ZA[554] + 0xb00327c8U;
+    
+    ZA[566] = ZR15(ZA[546]) + ZA[518];
+    ZA[761] = ZA[742] + ZA[609];
+    ZA[758] = ZA[740] + ZA[757];
+    ZA[755] = ZA[752] + ZA[754];
+    
+    ZA[762] = (ZCh(ZA[758], ZA[753], ZA[748]) + ZA[761]) + ZR26(ZA[758]);
+    ZA[759] = ZMa(ZA[750], ZA[743], ZA[755]) + ZR30(ZA[755]);
+    ZA[610] = ZA[566] + 0xbf597fc7U;
+    
+    ZA[567] = ZR15(ZA[554]) + ZA[482];
+    ZA[766] = ZA[748] + ZA[610];
+    ZA[763] = ZA[743] + ZA[762];
+    ZA[760] = ZA[757] + ZA[759];
+    
+    ZA[767] = (ZCh(ZA[763], ZA[758], ZA[753]) + ZA[766]) + ZR26(ZA[763]);
+    ZA[764] = ZMa(ZA[755], ZA[750], ZA[760]) + ZR30(ZA[760]);
+    ZA[611] = ZA[567] + 0xc6e00bf3U;
+    
+    ZA[614] = ZR15(ZA[566]) + ZA[528];
+    ZA[771] = ZA[753] + ZA[611];
+    ZA[768] = ZA[750] + ZA[767];
+    ZA[765] = ZA[762] + ZA[764];
+    
+    ZA[772] = (ZCh(ZA[768], ZA[763], ZA[758]) + ZA[771]) + ZR26(ZA[768]);
+    ZA[769] = ZMa(ZA[760], ZA[755], ZA[765]) + ZR30(ZA[765]);
+    ZA[612] = ZA[502] + 0x00400022U;
+    ZA[615] = ZA[614] + 0xd5a79147U;
+    
+    ZA[616] = ZR15(ZA[567]) + ZA[612];
+    ZA[504] = ZR25(ZA[497]) + 0x00000100U;
+    ZA[776] = ZA[758] + ZA[615];
+    ZA[773] = ZA[755] + ZA[772];
+    ZA[770] = ZA[767] + ZA[769];
+    
+    ZA[777] = (ZCh(ZA[773], ZA[768], ZA[763]) + ZA[776]) + ZR26(ZA[773]);
+    ZA[774] = ZMa(ZA[765], ZA[760], ZA[770]) + ZR30(ZA[770]);
+    ZA[492] = ZR25(ZA[471]);
+    ZA[618] = ZA[537] + ZA[504];
+    ZA[617] = ZA[616] + 0x06ca6351U;
+    
+    ZA[619] = ZR15(ZA[614]) + ZA[618];
+    ZA[781] = ZA[763] + ZA[617];
+    ZA[778] = ZA[760] + ZA[777];
+    ZA[775] = ZA[772] + ZA[774];
+    ZA[505] = ZA[492] + ZA[497];
+    
+    ZA[782] = (ZCh(ZA[778], ZA[773], ZA[768]) + ZA[781]) + ZR26(ZA[778]);
+    ZA[779] = ZMa(ZA[770], ZA[765], ZA[775]) + ZR30(ZA[775]);
+    ZA[621] = ZA[505] + ZA[546];
+    ZA[620] = ZA[619] + 0x14292967U;
+    
+    ZA[622] = ZR15(ZA[616]) + ZA[621];
+    ZA[625] = ZR25(ZA[501]);
+    ZA[786] = ZA[768] + ZA[620];
+    ZA[783] = ZA[765] + ZA[782];
+    ZA[624] = ZA[554] + ZA[471];
+    ZA[780] = ZA[777] + ZA[779];
+    
+    ZA[787] = (ZCh(ZA[783], ZA[778], ZA[773]) + ZA[786]) + ZR26(ZA[783]);
+    ZA[784] = ZMa(ZA[775], ZA[770], ZA[780]) + ZR30(ZA[780]);
+    ZA[493] = ZR25(ZA[474]);
+    ZA[626] = ZA[625] + ZA[624];
+    ZA[623] = ZA[622] + 0x27b70a85U;
+    
+    ZA[627] = ZR15(ZA[619]) + ZA[626];
+    ZA[791] = ZA[773] + ZA[623];
+    ZA[788] = ZA[770] + ZA[787];
+    ZA[785] = ZA[782] + ZA[784];
+    ZA[629] = ZA[493] + ZA[501];
+    
+    ZA[792] = (ZCh(ZA[788], ZA[783], ZA[778]) + ZA[791]) + ZR26(ZA[788]);
+    ZA[789] = ZMa(ZA[780], ZA[775], ZA[785]) + ZR30(ZA[785]);
+    ZA[630] = ZA[566] + ZA[629];
+    ZA[628] = ZA[627] + 0x2e1b2138U;
+    
+    ZA[634] = ZR25(ZA[518]) + ZA[474];
+    ZA[631] = ZR15(ZA[622]) + ZA[630];
+    ZA[796] = ZA[778] + ZA[628];
+    ZA[793] = ZA[775] + ZA[792];
+    ZA[790] = ZA[787] + ZA[789];
+    
+    ZA[797] = (ZCh(ZA[793], ZA[788], ZA[783]) + ZA[796]) + ZR26(ZA[793]);
+    ZA[794] = ZMa(ZA[785], ZA[780], ZA[790]) + ZR30(ZA[790]);
+    ZA[491] = ZR25(ZA[482]);
+    ZA[635] = ZA[567] + ZA[634];
+    ZA[633] = ZA[631] + 0x4d2c6dfcU;
+    
+    ZA[636] = ZR15(ZA[627]) + ZA[635];
+    ZA[801] = ZA[783] + ZA[633];
+    ZA[798] = ZA[780] + ZA[797];
+    ZA[795] = ZA[792] + ZA[794];
+    ZA[638] = ZA[491] + ZA[518];
+    
+    ZA[802] = (ZCh(ZA[798], ZA[793], ZA[788]) + ZA[801]) + ZR26(ZA[798]);
+    ZA[799] = ZMa(ZA[790], ZA[785], ZA[795]) + ZR30(ZA[795]);
+    ZA[639] = ZA[638] + ZA[614];
+    ZA[637] = ZA[636] + 0x53380d13U;
+    
+    ZA[642] = ZR25(ZA[528]) + ZA[482];
+    ZA[640] = ZR15(ZA[631]) + ZA[639];
+    ZA[806] = ZA[788] + ZA[637];
+    ZA[803] = ZA[785] + ZA[802];
+    ZA[800] = ZA[797] + ZA[799];
+    
+    ZA[807] = (ZCh(ZA[803], ZA[798], ZA[793]) + ZA[806]) + ZR26(ZA[803]);
+    ZA[804] = ZMa(ZA[795], ZA[790], ZA[800]) + ZR30(ZA[800]);
+    ZA[643] = ZA[616] + ZA[642];
+    ZA[641] = ZA[640] + 0x650a7354U;
+    
+    ZA[646] = ZR25(ZA[502]) + ZA[528];
+    ZA[644] = ZR15(ZA[636]) + ZA[643];
+    ZA[811] = ZA[793] + ZA[641];
+    ZA[808] = ZA[790] + ZA[807];
+    ZA[805] = ZA[802] + ZA[804];
+    
+    ZA[812] = (ZCh(ZA[808], ZA[803], ZA[798]) + ZA[811]) + ZR26(ZA[808]);
+    ZA[809] = ZMa(ZA[800], ZA[795], ZA[805]) + ZR30(ZA[805]);
+    ZA[647] = ZA[619] + ZA[646];
+    ZA[645] = ZA[644] + 0x766a0abbU;
+    
+    ZA[650] = ZR25(ZA[537]) + ZA[502];
+    ZA[648] = ZR15(ZA[640]) + ZA[647];
+    ZA[816] = ZA[798] + ZA[645];
+    ZA[813] = ZA[795] + ZA[812];
+    ZA[810] = ZA[807] + ZA[809];
+    
+    ZA[817] = (ZCh(ZA[813], ZA[808], ZA[803]) + ZA[816]) + ZR26(ZA[813]);
+    ZA[814] = ZMa(ZA[805], ZA[800], ZA[810]) + ZR30(ZA[810]);
+    ZA[925] = ZA[622] + ZA[650];
+    ZA[649] = ZA[648] + 0x81c2c92eU;
+    
+    ZA[653] = ZR25(ZA[546]) + ZA[537];
+    ZA[651] = ZR15(ZA[644]) + ZA[925];
+    ZA[821] = ZA[803] + ZA[649];
+    ZA[818] = ZA[800] + ZA[817];
+    ZA[815] = ZA[812] + ZA[814];
+    
+    ZA[822] = (ZCh(ZA[818], ZA[813], ZA[808]) + ZA[821]) + ZR26(ZA[818]);
+    ZA[819] = ZMa(ZA[810], ZA[805], ZA[815]) + ZR30(ZA[815]);
+    ZA[654] = ZA[627] + ZA[653];
+    ZA[652] = ZA[651] + 0x92722c85U;
+    
+    ZA[657] = ZR25(ZA[554]) + ZA[546];
+    ZA[655] = ZR15(ZA[648]) + ZA[654];
+    ZA[826] = ZA[808] + ZA[652];
+    ZA[823] = ZA[805] + ZA[822];
+    ZA[820] = ZA[817] + ZA[819];
+    
+    ZA[827] = (ZCh(ZA[823], ZA[818], ZA[813]) + ZA[826]) + ZR26(ZA[823]);
+    ZA[824] = ZMa(ZA[815], ZA[810], ZA[820]) + ZR30(ZA[820]);
+    ZA[658] = ZA[631] + ZA[657];
+    ZA[656] = ZA[655] + 0xa2bfe8a1U;
+    
+    ZA[661] = ZR25(ZA[566]) + ZA[554];
+    ZA[659] = ZR15(ZA[651]) + ZA[658];
+    ZA[831] = ZA[813] + ZA[656];
+    ZA[828] = ZA[810] + ZA[827];
+    ZA[825] = ZA[822] + ZA[824];
+    
+    ZA[832] = (ZCh(ZA[828], ZA[823], ZA[818]) + ZA[831]) + ZR26(ZA[828]);
+    ZA[829] = ZMa(ZA[820], ZA[815], ZA[825]) + ZR30(ZA[825]);
+    ZA[662] = ZA[636] + ZA[661];
+    ZA[660] = ZA[659] + 0xa81a664bU;
+    
+    ZA[665] = ZR25(ZA[567]) + ZA[566];
+    ZA[663] = ZR15(ZA[655]) + ZA[662];
+    ZA[836] = ZA[818] + ZA[660];
+    ZA[833] = ZA[815] + ZA[832];
+    ZA[830] = ZA[827] + ZA[829];
+    
+    ZA[837] = (ZCh(ZA[833], ZA[828], ZA[823]) + ZA[836]) + ZR26(ZA[833]);
+    ZA[834] = ZMa(ZA[825], ZA[820], ZA[830]) + ZR30(ZA[830]);
+    ZA[666] = ZA[640] + ZA[665];
+    ZA[664] = ZA[663] + 0xc24b8b70U;
+    
+    ZA[669] = ZR25(ZA[614]) + ZA[567];
+    ZA[667] = ZR15(ZA[659]) + ZA[666];
+    ZA[841] = ZA[823] + ZA[664];
+    ZA[838] = ZA[820] + ZA[837];
+    ZA[835] = ZA[832] + ZA[834];
+    
+    ZA[842] = (ZCh(ZA[838], ZA[833], ZA[828]) + ZA[841]) + ZR26(ZA[838]);
+    ZA[839] = ZMa(ZA[830], ZA[825], ZA[835]) + ZR30(ZA[835]);
+    ZA[670] = ZA[644] + ZA[669];
+    ZA[668] = ZA[667] + 0xc76c51a3U;
+    
+    ZA[677] = ZR25(ZA[616]) + ZA[614];
+    ZA[671] = ZR15(ZA[663]) + ZA[670];
+    ZA[846] = ZA[828] + ZA[668];
+    ZA[843] = ZA[825] + ZA[842];
+    ZA[840] = ZA[837] + ZA[839];
+    
+    ZA[847] = (ZCh(ZA[843], ZA[838], ZA[833]) + ZA[846]) + ZR26(ZA[843]);
+    ZA[844] = ZMa(ZA[835], ZA[830], ZA[840]) + ZR30(ZA[840]);
+    ZA[678] = ZA[648] + ZA[677];
+    ZA[676] = ZA[671] + 0xd192e819U;
+    
+    ZA[682] = ZR25(ZA[619]) + ZA[616];
+    ZA[679] = ZR15(ZA[667]) + ZA[678];
+    ZA[851] = ZA[833] + ZA[676];
+    ZA[848] = ZA[830] + ZA[847];
+    ZA[845] = ZA[842] + ZA[844];
+    
+    ZA[852] = (ZCh(ZA[848], ZA[843], ZA[838]) + ZA[851]) + ZR26(ZA[848]);
+    ZA[849] = ZMa(ZA[840], ZA[835], ZA[845]) + ZR30(ZA[845]);
+    ZA[683] = ZA[651] + ZA[682];
+    ZA[680] = ZA[679] + 0xd6990624U;
+    
+    ZA[686] = ZR25(ZA[622]) + ZA[619];
+    ZA[684] = ZR15(ZA[671]) + ZA[683];
+    ZA[856] = ZA[838] + ZA[680];
+    ZA[853] = ZA[835] + ZA[852];
+    ZA[850] = ZA[847] + ZA[849];
+    
+    ZA[857] = (ZCh(ZA[853], ZA[848], ZA[843]) + ZA[856]) + ZR26(ZA[853]);
+    ZA[854] = ZMa(ZA[845], ZA[840], ZA[850]) + ZR30(ZA[850]);
+    ZA[687] = ZA[655] + ZA[686];
+    ZA[685] = ZA[684] + 0xf40e3585U;
+    
+    ZA[690] = ZR25(ZA[627]) + ZA[622];
+    ZA[688] = ZR15(ZA[679]) + ZA[687];
+    ZA[861] = ZA[843] + ZA[685];
+    ZA[858] = ZA[840] + ZA[857];
+    ZA[855] = ZA[852] + ZA[854];
+    
+    ZA[862] = (ZCh(ZA[858], ZA[853], ZA[848]) + ZA[861]) + ZR26(ZA[858]);
+    ZA[859] = ZMa(ZA[850], ZA[845], ZA[855]) + ZR30(ZA[855]);
+    ZA[691] = ZA[659] + ZA[690];
+    ZA[689] = ZA[688] + 0x106aa070U;
+    
+    ZA[694] = ZR25(ZA[631]) + ZA[627];
+    ZA[692] = ZR15(ZA[684]) + ZA[691];
+    ZA[866] = ZA[848] + ZA[689];
+    ZA[863] = ZA[845] + ZA[862];
+    ZA[860] = ZA[857] + ZA[859];
+    
+    ZA[867] = (ZCh(ZA[863], ZA[858], ZA[853]) + ZA[866]) + ZR26(ZA[863]);
+    ZA[864] = ZMa(ZA[855], ZA[850], ZA[860]) + ZR30(ZA[860]);
+    ZA[695] = ZA[663] + ZA[694];
+    ZA[693] = ZA[692] + 0x19a4c116U;
+    
+    ZA[698] = ZR25(ZA[636]) + ZA[631];
+    ZA[696] = ZR15(ZA[688]) + ZA[695];
+    ZA[871] = ZA[853] + ZA[693];
+    ZA[868] = ZA[850] + ZA[867];
+    ZA[865] = ZA[862] + ZA[864];
+    
+    ZA[873] = (ZCh(ZA[868], ZA[863], ZA[858]) + ZA[871]) + ZR26(ZA[868]);
+    ZA[869] = ZMa(ZA[860], ZA[855], ZA[865]) + ZR30(ZA[865]);
+    ZA[699] = ZA[667] + ZA[698];
+    ZA[697] = ZA[696] + 0x1e376c08U;
+    
+    ZA[702] = ZR25(ZA[640]) + ZA[636];
+    ZA[700] = ZR15(ZA[692]) + ZA[699];
+    ZA[877] = ZA[858] + ZA[697];
+    ZA[874] = ZA[855] + ZA[873];
+    ZA[870] = ZA[867] + ZA[869];
+    
+    ZA[878] = (ZCh(ZA[874], ZA[868], ZA[863]) + ZA[877]) + ZR26(ZA[874]);
+    ZA[875] = ZMa(ZA[865], ZA[860], ZA[870]) + ZR30(ZA[870]);
+    ZA[703] = ZA[671] + ZA[702];
+    ZA[701] = ZA[700] + 0x2748774cU;
+    
+    ZA[706] = ZR25(ZA[644]) + ZA[640];
+    ZA[704] = ZR15(ZA[696]) + ZA[703];
+    ZA[882] = ZA[863] + ZA[701];
+    ZA[879] = ZA[860] + ZA[878];
+    ZA[876] = ZA[873] + ZA[875];
+    
+    ZA[883] = (ZCh(ZA[879], ZA[874], ZA[868]) + ZA[882]) + ZR26(ZA[879]);
+    ZA[880] = ZMa(ZA[870], ZA[865], ZA[876]) + ZR30(ZA[876]);
+    ZA[707] = ZA[679] + ZA[706];
+    ZA[705] = ZA[704] + 0x34b0bcb5U;
+    
+    ZA[710] = ZR25(ZA[648]) + ZA[644];
+    ZA[708] = ZR15(ZA[700]) + ZA[707];
+    ZA[887] = ZA[868] + ZA[705];
+    ZA[884] = ZA[865] + ZA[883];
+    ZA[881] = ZA[878] + ZA[880];
+    
+    ZA[888] = (ZCh(ZA[884], ZA[879], ZA[874]) + ZA[887]) + ZR26(ZA[884]);
+    ZA[885] = ZMa(ZA[876], ZA[870], ZA[881]) + ZR30(ZA[881]);
+    ZA[711] = ZA[684] + ZA[710];
+    ZA[709] = ZA[708] + 0x391c0cb3U;
+    
+    ZA[714] = ZR25(ZA[651]) + ZA[648];
+    ZA[712] = ZR15(ZA[704]) + ZA[711];
+    ZA[892] = ZA[874] + ZA[709];
+    ZA[889] = ZA[870] + ZA[888];
+    ZA[886] = ZA[883] + ZA[885];
+    
+    ZA[893] = (ZCh(ZA[889], ZA[884], ZA[879]) + ZA[892]) + ZR26(ZA[889]);
+    ZA[890] = ZMa(ZA[881], ZA[876], ZA[886]) + ZR30(ZA[886]);
+    ZA[715] = ZA[688] + ZA[714];
+    ZA[713] = ZA[712] + 0x4ed8aa4aU;
+    
+    ZA[718] = ZR25(ZA[655]) + ZA[651];
+    ZA[716] = ZR15(ZA[708]) + ZA[715];
+    ZA[897] = ZA[879] + ZA[713];
+    ZA[894] = ZA[876] + ZA[893];
+    ZA[891] = ZA[888] + ZA[890];
+    
+    ZA[898] = (ZCh(ZA[894], ZA[889], ZA[884]) + ZA[897]) + ZR26(ZA[894]);
+    ZA[895] = ZMa(ZA[886], ZA[881], ZA[891]) + ZR30(ZA[891]);
+    ZA[719] = ZA[692] + ZA[718];
+    ZA[717] = ZA[716] + 0x5b9cca4fU;
+    
+    ZA[722] = ZR25(ZA[659]) + ZA[655];
+    ZA[720] = ZR15(ZA[712]) + ZA[719];
+    ZA[902] = ZA[884] + ZA[717];
+    ZA[899] = ZA[881] + ZA[898];
+    ZA[896] = ZA[893] + ZA[895];
+    
+    ZA[903] = (ZCh(ZA[899], ZA[894], ZA[889]) + ZA[902]) + ZR26(ZA[899]);
+    ZA[900] = ZMa(ZA[891], ZA[886], ZA[896]) + ZR30(ZA[896]);
+    ZA[723] = ZA[696] + ZA[722];
+    ZA[721] = ZA[720] + 0x682e6ff3U;
+    
+    ZA[672] = ZR25(ZA[663]) + ZA[659];
+    ZA[724] = ZR15(ZA[716]) + ZA[723];
+    ZA[907] = ZA[889] + ZA[721];
+    ZA[904] = ZA[886] + ZA[903];
+    ZA[901] = ZA[898] + ZA[900];
+    
+    ZA[908] = (ZCh(ZA[904], ZA[899], ZA[894]) + ZA[907]) + ZR26(ZA[904]);
+    ZA[905] = ZMa(ZA[896], ZA[891], ZA[901]) + ZR30(ZA[901]);
+    ZA[673] = ZR25(ZA[667]) + ZA[663];
+    ZA[726] = ZA[700] + ZA[672];
+    ZA[725] = ZA[724] + 0x748f82eeU;
+    
+    ZA[727] = ZR15(ZA[720]) + ZA[726];
+    ZA[912] = ZA[894] + ZA[725];
+    ZA[909] = ZA[891] + ZA[908];
+    ZA[906] = ZA[903] + ZA[905];
+    ZA[675] = ZA[667] + 0x8cc70208U;
+    ZA[729] = ZA[704] + ZA[673];
+    
+    ZA[913] = (ZCh(ZA[909], ZA[904], ZA[899]) + ZA[912]) + ZR26(ZA[909]);
+    ZA[910] = ZMa(ZA[901], ZA[896], ZA[906]) + ZR30(ZA[906]);
+    ZA[674] = ZR25(ZA[671]) + ZA[675];
+    ZA[730] = ZR15(ZA[724]) + ZA[729];
+    ZA[728] = ZA[727] + 0x78a5636fU;
+    
+    ZA[681] = ZR25(ZA[679]) + ZA[671];
+    ZA[917] = ZA[899] + ZA[901] + ZA[728];
+    ZA[914] = ZA[896] + ZA[913];
+    ZA[911] = ZA[908] + ZA[910];
+    ZA[732] = ZA[708] + ZA[674];
+    ZA[731] = ZA[730] + 0x84c87814U;
+    
+    ZA[918] = (ZCh(ZA[914], ZA[909], ZA[904]) + ZA[917]) + ZR26(ZA[914]);
+    ZA[915] = ZMa(ZA[906], ZA[901], ZA[911]) + ZR30(ZA[911]);
+    ZA[733] = ZR15(ZA[727]) + ZA[732];
+    ZA[919] = ZA[906] + ZA[904] + ZA[731];
+    ZA[734] = ZA[712] + ZA[681];
+    
+    ZA[920] = (ZCh(ZA[918], ZA[914], ZA[909]) + ZA[919]) + ZR26(ZA[918]);
+    ZA[735] = ZR15(ZA[730]) + ZA[734];
+    ZA[921] = ZA[911] + ZA[909] + ZA[733];
+    ZA[916] = ZA[913] + ZA[915];
+    
+    ZA[922] = (ZCh(ZA[920], ZA[918], ZA[914]) + ZA[921]) + ZR26(ZA[920]);
+    ZA[923] = ZA[916] + ZA[914] + ZA[735];
+    
+    ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]);
+    
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
+
+#if defined(VECTORS4)
+	bool result = any(ZA[924] == 0x136032EDU);
+
+	if (result) {
+		if (ZA[924].x == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.x] =  Znonce.x;
+		if (ZA[924].y == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.y] =  Znonce.y;
+		if (ZA[924].z == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.z] =  Znonce.z;
+		if (ZA[924].w == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.w] =  Znonce.w;
+	}
+#elif defined(VECTORS2)
+	bool result = any(ZA[924] == 0x136032EDU);
+
+	if (result) {
+		if (ZA[924].x == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.x] =  Znonce.x;
+		if (ZA[924].y == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.y] =  Znonce.y;
+	}
+#else
+	if (ZA[924] == 0x136032EDU)
+		output[FOUND] = output[NFLAG & Znonce] =  Znonce;
+#endif
+}

+ 587 - 0
diakgcn120724.cl

@@ -0,0 +1,587 @@
+// DiaKGCN 27-04-2012 - OpenCL kernel by Diapolo
+//
+// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3.
+// The kernel was rewritten by me (Diapolo) and is still public-domain!
+
+// `u` is the data type used for the kernel's working values. The build-time
+// defines VECTORS4 / VECTORS2 select a 4-wide or 2-wide uint vector; when
+// neither is defined a plain scalar uint is used.
+#if defined(VECTORS4)
+	typedef uint4 u;
+#elif defined(VECTORS2)
+	typedef uint2 u;
+#else
+	typedef uint u;
+#endif
+
+// Bitwise helper macros used by the round computations, chosen at build time:
+//   ch(x, y, z): per bit, y where x is set, otherwise z.
+//   ma(x, y, z): per bit, the value held by the majority of x, y and z.
+// Every argument is parenthesized so that an expression passed as an argument
+// cannot re-associate with the operators inside the expansion.
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#ifdef BFI_INT
+		// NOTE(review): amd_bytealign() is not a bit-select by itself; presumably
+		// the host rewrites these calls to BFI_INT in the compiled binary -- confirm.
+		#define ch(x, y, z) amd_bytealign((x), (y), (z))
+		#define ma(x, y, z) amd_bytealign((z) ^ (x), (y), (x))
+	#else
+		#define ch(x, y, z) bitselect((z), (y), (x))
+		// Where x and y agree the result is x; where they differ z breaks the tie.
+		#define ma(x, y, z) bitselect((x), (z), (x) ^ (y))
+	#endif
+#else
+	#define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+	#define ma(x, y, z) (((x) & (z)) | ((y) & ((x) | (z))))
+#endif
+
+// SHA-256 sigma functions. rotate() is OpenCL's rotate-LEFT, so on 32-bit
+// elements a left rotation by k equals a right rotation by (32 - k); the
+// number in each macro name is its first left-rotation amount.
+//   rotr15: ROTR17 ^ ROTR19 ^ SHR10  (FIPS 180-4 small sigma1)
+//   rotr25: ROTR7  ^ ROTR18 ^ SHR3   (FIPS 180-4 small sigma0)
+//   rotr26: ROTR6  ^ ROTR11 ^ ROTR25 (FIPS 180-4 big Sigma1)
+//   rotr30: ROTR2  ^ ROTR13 ^ ROTR22 (FIPS 180-4 big Sigma0)
+// `n` is parenthesized so that an expression argument (for example a sum or
+// an XOR) is shifted/rotated as a whole. Note that `n` is evaluated more
+// than once, so it must not have side effects.
+#define rotr15(n) (rotate((n), 15U) ^ rotate((n), 13U) ^ ((n) >> 10U))
+#define rotr25(n) (rotate((n), 25U) ^ rotate((n), 14U) ^ ((n) >> 3U))
+#define rotr26(n) (rotate((n), 26U) ^ rotate((n), 21U) ^ rotate((n), 7U))
+#define rotr30(n) (rotate((n), 30U) ^ rotate((n), 19U) ^ rotate((n), 10U))
+
+__kernel
+	__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+	void search(	
+#ifndef GOFFSET
+			const u base,
+#endif
+			const uint PreVal0, const uint PreVal4,
+			const uint H1, const uint D1A, const uint B1, const uint C1,
+			const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7,
+			const uint W16addK16, const uint W17addK17,
+			const uint PreW18, const uint PreW19,
+			const uint W16, const uint W17,
+			const uint PreW31, const uint PreW32,
+			const uint state0, const uint state1, const uint state2, const uint state3,
+			const uint state4, const uint state5, const uint state6, const uint state7,
+			const uint state0A, const uint state0B,
+			const uint state1A, const uint state2A, const uint state3A, const uint state4A,
+			const uint state5A, const uint state6A, const uint state7A,
+			__global uint * output)
+{
+	u V[8];
+	u W[16];
+
+#ifdef VECTORS4
+	const u nonce = (uint)(get_local_id(0)) * 4U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base;
+#elif defined VECTORS2
+	const u nonce = (uint)(get_local_id(0)) * 2U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base;
+#else
+	#ifdef GOFFSET
+		const u nonce = (uint)(get_global_id(0));
+	#else
+		const u nonce = (uint)(get_local_id(0)) + (uint)(get_group_id(0)) * (uint)(WORKSIZE) + base;
+	#endif
+#endif
+
+	V[0] = PreVal0 + nonce;
+	V[1] = B1;
+	V[2] = C1;
+	V[3] = D1A;
+	V[4] = PreVal4 + nonce;
+	V[5] = F1;
+	V[6] = G1;
+	V[7] = H1;
+
+	V[7] += V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+//----------------------------------------------------------------------------------
+
+#ifdef VECTORS4
+	 W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U, rotr25(nonce.x) ^ 0x4008000U, rotr25(nonce.x) ^ 0x600c000U);
+#elif defined VECTORS2
+	 W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U);
+#else
+	 W[0] = PreW18 + rotr25(nonce);
+#endif
+	 W[1] = PreW19 + nonce;
+	 W[2] = 0x80000000U + rotr15(W[0]);
+	 W[3] = rotr15(W[1]);
+	 W[4] = 0x00000280U + rotr15(W[2]);
+	 W[5] = W16 + rotr15(W[3]);
+	 W[6] = W17 + rotr15(W[4]);
+	 W[7] = W[0] + rotr15(W[5]);
+	 W[8] = W[1] + rotr15(W[6]);
+	 W[9] = W[2] + rotr15(W[7]);
+	W[10] = W[3] + rotr15(W[8]);
+	W[11] = W[4] + rotr15(W[9]);
+	W[12] = W[5] + 0x00a00055U + rotr15(W[10]);
+	W[13] = W[6] + PreW31 + rotr15(W[11]);
+	W[14] = W[7] + PreW32 + rotr15(W[12]);
+	W[15] = W[8] + W17 + rotr15(W[13]) + rotr25(W[0]);
+
+	V[1] += 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0];
+	V[5] =  0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0] + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+	W[13] = W[13] +  W[6] + rotr15(W[11]) + rotr25(W[14]);
+	W[14] = W[14] +  W[7] + rotr15(W[12]) + rotr25(W[15]);
+	W[15] = W[15] +  W[8] + rotr15(W[13]) + rotr25( W[0]);
+
+	V[1] += 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+	W[13] = W[13] +  W[6] + rotr15(W[11]) + rotr25(W[14]);
+
+	V[1] += 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] = state0 + V[0] + rotr25(state1 + V[1]);
+	 W[1] = state1 + V[1] + 0x00a00000U + rotr25(state2 + V[2]);
+	 W[2] = state2 + V[2] + rotr15(W[0]) + rotr25(state3 + V[3]);
+	 W[3] = state3 + V[3] + rotr15(W[1]) + rotr25(state4 + V[4]);
+	 W[4] = state4 + V[4] + rotr15(W[2]) + rotr25(state5 + V[5]);
+	 W[5] = state5 + V[5] + rotr15(W[3]) + rotr25(state6 + V[6]);
+	 W[6] = state6 + V[6] + 0x00000100U + rotr15(W[4]) + rotr25(state7 + V[7]);	
+	 W[7] = state7 + V[7] + W[0] + 0x11002000U + rotr15(W[5]);
+	 W[8] = W[1] + 0x80000000U + rotr15(W[6]);	
+	 W[9] = W[2] + rotr15(W[7]);
+	W[10] = W[3] + rotr15(W[8]);
+	W[11] = W[4] + rotr15(W[9]);
+	W[12] = W[5] + rotr15(W[10]);
+	W[13] = W[6] + rotr15(W[11]);
+	W[14] = W[7] + 0x00400022U + rotr15(W[12]);
+	W[15] = W[8] + 0x00000100U + rotr15(W[13]) + rotr25(W[0]);
+
+	// 0x71374491U + 0x1f83d9abU + state1
+	const u state1AaddV1 = state1A + V[1];
+	// 0xb5c0fbcfU + 0x9b05688cU + state2
+	const u state2AaddV2 = state2A + V[2];
+	// 0x510e527fU + 0xe9b5dba5U + state3
+	const u state3AaddV3 = state3A + V[3];
+	// 0x3956c25bU + state4
+	const u state4AaddV4 = state4A + V[4];
+	// 0x59f111f1U + state5
+	const u state5AaddV5 = state5A + V[5];
+	// 0x923f82a4U + state6
+	const u state6AaddV6 = state6A + V[6];
+	// 0xab1c5ed5U + state7
+	const u state7AaddV7 = state7A + V[7];
+
+	// 0x98c7e2a2U + state0	
+	V[3] = state0A + V[0];
+	// 0xfc08884dU + state0
+	V[7] = state0B + V[0];
+	V[0] = 0x6a09e667U;
+	V[1] = 0xbb67ae85U;
+	V[2] = 0x3c6ef372U;
+	V[4] = 0x510e527fU;
+	V[5] = 0x9b05688cU;
+	V[6] = 0x1f83d9abU;
+
+	V[2] += state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+	W[13] = W[13] +  W[6] + rotr15(W[11]) + rotr25(W[14]);
+	W[14] = W[14] +  W[7] + rotr15(W[12]) + rotr25(W[15]);
+	W[15] = W[15] +  W[8] + rotr15(W[13]) + rotr25( W[0]);
+
+	V[3] += 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+
+	V[3] += 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x78a5636fU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+
+	V[1] += 0x84c87814U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+
+	V[0] += 0x8cc70208U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+
+	V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
+
+#ifdef VECTORS4
+	if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU))
+		output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : ((V[7].y == 0x136032edU) ? nonce.y : ((V[7].z == 0x136032edU) ? nonce.z : nonce.w));
+#elif defined VECTORS2
+	if ((V[7].x == 0x136032edU) + (V[7].y == 0x136032edU))
+		output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : nonce.y;
+#else
+	if (V[7] == 0x136032edU)
+		output[FOUND] = output[NFLAG & nonce] = nonce;
+#endif
+}

+ 417 - 0
phatk120724.cl

@@ -0,0 +1,417 @@
+// This file is taken and modified from the public-domain poclbm project, and
+// I have therefore decided to keep it public-domain.
+// Modified version copyright 2011-2012 Con Kolivas
+
+#ifdef VECTORS4
+	typedef uint4 u; // u is the working type of base, W[] and Vals[] in search(): four lanes here
+#elif defined VECTORS2
+	typedef uint2 u; // two lanes
+#else
+	typedef uint u; // scalar fallback when neither VECTORS4 nor VECTORS2 is defined
+#endif
+
+__constant uint K[64] = { // SHA-256 round constants; the t1/t1W/t1C macros read them as K[(n) % 64]
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+__constant uint ConstW[128] = { // indexed by round number n (0..127) from t1C/P1C..P4C; only four slots are non-zero
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x80000000U, 0x00000000, 0x00000000, 0x00000000, // [4] = 0x80000000
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000280U, // [15] = 0x280 (640)
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+// Entries 64..127 serve the second pass, whose rounds are written as 64 + n in search().
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x80000000U, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000100U, // [72] = 0x80000000, [79] = 0x100 (256)
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
+0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
+__constant uint H[8] = { // SHA-256 initial hash values; search() reloads Vals[0..7] from these before its second pass
+	0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
+};
+
+
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y)) // rotate left by y, written as a right funnel shift by (32 - y)
+
+// This part is not from the stock poclbm kernel. It's part of an optimization
+// added in the Phoenix Miner.
+
+// Some AMD devices have a BFI_INT opcode, which behaves exactly like the
+// SHA-256 Ch function, but provides it in exactly one instruction. If
+// detected, use it for Ch. Otherwise, construct Ch out of simpler logical
+// primitives.
+
+ #ifdef BFI_INT
+	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
+	// OpenCL (or CAL IL for that matter) in any way. However, there is 
+	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
+	// amd_bytealign, takes the same inputs, and provides the same output. 
+	// We can use that as a placeholder for BFI_INT and have the application 
+	// patch it after compilation.
+	
+	// This is the BFI_INT function
+	#define Ch(x, y, z) amd_bytealign(x,y,z)
+	// Ma can also be implemented in terms of BFI_INT...
+	#define Ma(z, x, y) amd_bytealign(z^x,y,x)
+ #else // BFI_INT
+	// Later SDKs optimise this to BFI INT without patching and GCN
+	// actually fails if manually patched with BFI_INT
+	// Note: rotr is only defined on this path and on the non-BITALIGN path; nothing later in this file uses it.
+	#define Ch(x, y, z) bitselect((u)z, (u)y, (u)x)
+	#define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x)
+	#define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y)
+ #endif
+#else // BITALIGN
+	#define Ch(x, y, z) (z ^ (x & (y ^ z)))
+	#define Ma(x, y, z) ((x & z) | (y & (x | z)))
+	#define rot(x, y) rotate((u)x, (u)y) // OpenCL rotate() shifts left, matching the BITALIGN definition of rot
+	#define rotr(x, y) rotate((u)x, (u)(32-y)) // right rotation expressed as a left rotation by (32 - y)
+#endif
+
+
+
+//Various intermediate calculations for each SHA round
+#define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
+#define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))
+// Vals[] is used as a rotating register file: slot (k + 128 - n) % 8 plays role k in round n, so no values are moved between rounds.
+#define s1(n) (S1(Vals[(4 + 128 - (n)) % 8]))
+#define S1(n) (rot(n, 26u)^rot(n, 21u)^rot(n, 7u))
+
+#define ch(n) Ch(Vals[(4 + 128 - (n)) % 8],Vals[(5 + 128 - (n)) % 8],Vals[(6 + 128 - (n)) % 8])
+#define maj(n) Ma(Vals[(1 + 128 - (n)) % 8],Vals[(2 + 128 - (n)) % 8],Vals[(0 + 128 - (n)) % 8])
+
+//t1 calc when W is already calculated
+//(t1 is also the name of a variable in search(); only "t1(" followed by an argument list expands to this macro)
+#define t1(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W[(n)] + s1(n) + ch(n) 
+
+//t1 calc which calculates W
+#define t1W(n) K[(n) % 64] + Vals[(7 + 128 - (n)) % 8] +  W(n) + s1(n) + ch(n)
+
+//Used for constant W Values (the compiler optimizes out zeros)
+#define t1C(n) (K[(n) % 64]+ ConstW[(n)]) + Vals[(7 + 128 - (n)) % 8] + s1(n) + ch(n)
+
+//t2 Calc
+#define t2(n)  maj(n) + s0(n)
+// rotC: rotate left by n using plain shifts; its arguments are not parenthesized, so pass only simple operands (it is applied to ConstW entries in P1C/P2C).
+#define rotC(x,n) (x<<n | x >> (32-n))
+
+//W calculation used for SHA round
+//(W(n) with parentheses is this macro and stores into the array element W[n]; W[...] with brackets is the array itself)
+#define W(n) (W[n] = P4(n) + P3(n) + P2(n) + P1(n))
+
+
+
+//Partial W calculations (used for the beginning where only some values are nonzero)
+#define P1(n) ((rot(W[(n)-2],15u)^rot(W[(n)-2],13u)^((W[(n)-2])>>10U)))
+#define P2(n) ((rot(W[(n)-15],25u)^rot(W[(n)-15],14u)^((W[(n)-15])>>3U)))
+
+// p1/p2 apply the same mixes as P1/P2 to a value instead of a W[] index; search() does not reference them.
+#define p1(x) ((rot(x,15u)^rot(x,13u)^((x)>>10U)))
+#define p2(x) ((rot(x,25u)^rot(x,14u)^((x)>>3U)))
+
+// P3/P4 (and P3C/P4C) do not parenthesize their argument; the "64 + k" arguments used in search() are safe because only a subtraction follows.
+#define P3(n)  W[n-7]
+#define P4(n)  W[n-16]
+
+
+//Partial Calcs for constant W values
+#define P1C(n) ((rotC(ConstW[(n)-2],15)^rotC(ConstW[(n)-2],13)^((ConstW[(n)-2])>>10U)))
+#define P2C(n) ((rotC(ConstW[(n)-15],25)^rotC(ConstW[(n)-15],14)^((ConstW[(n)-15])>>3U)))
+#define P3C(x)  ConstW[x-7]
+#define P4C(x)  ConstW[x-16]
+// Each sharound* macro expands its t1 expression twice. The += in between writes role slot 3, which t1* never reads (it reads roles 4..7), so both expansions yield the same value.
+//SHA round with built in W calc
+#define sharoundW(n) Barrier1(n);  Vals[(3 + 128 - (n)) % 8] += t1W(n); Vals[(7 + 128 - (n)) % 8] = t1W(n) + t2(n);  
+
+//SHA round without W calc
+#define sharound(n)  Barrier2(n); Vals[(3 + 128 - (n)) % 8] += t1(n); Vals[(7 + 128 - (n)) % 8] = t1(n) + t2(n);
+
+//SHA round for constant W values
+#define sharoundC(n)  Barrier3(n); Vals[(3 + 128 - (n)) % 8] += t1C(n); Vals[(7 + 128 - (n)) % 8] = t1C(n) + t2(n);
+
+//The compiler is stupid... I put this in there only to stop the compiler from (de)optimizing the order
+//(each Barrier assigns a t1C value to the dummy variable t1 declared in search(); Barrier1 uses round n+1, the other two use round n)
+#define Barrier1(n) t1 = t1C((n+1))
+#define Barrier2(n) t1 = t1C((n))
+#define Barrier3(n) t1 = t1C((n))
+
+//#define WORKSIZE 256
+#define MAXBUFFERS (4095)
+/* search - runs the unrolled round sequence for one nonce per lane of u and
+ * records every lane whose final check value W[117] is zero.
+ *
+ * state0..state7: added to Vals[0..7] after round 63 to form W[64..71].
+ * B1, C1, D1, F1, G1, H1: starting values of Vals[1], [2], [3], [5], [6], [7].
+ * base: nonce offset; W[3] = base + (local id + group id * WORKSIZE), with the
+ *   work-item index scaled by the lane count in the vector builds.
+ * W16, W17: stored directly into W[16] and W[17].
+ * PreVal4, PreVal0: W[3] is added to these to give the starting Vals[4], Vals[0].
+ * PreW18, PreW19, PreW31, PreW32: partial sums completed here into W[18],
+ *   W[19], W[31], W[32] (presumably precomputed by the host; confirm there).
+ * output: on a hit the nonce is written to output[NFLAG & nonce] and to
+ *   output[FOUND].
+ *
+ * WORKSIZE is not defined in this file and has to be supplied at build time.
+ */
+__kernel 
+ __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+void search(	const uint state0, const uint state1, const uint state2, const uint state3,
+						const uint state4, const uint state5, const uint state6, const uint state7,
+						const uint B1, const uint C1, const uint D1,
+						const uint F1, const uint G1, const uint H1,
+						const u base,
+						const uint W16, const uint W17,
+						const uint PreVal4, const uint PreVal0,
+						const uint PreW18, const uint PreW19,
+						const uint PreW31, const uint PreW32,
+						
+						__global uint * output)
+{
+	// W[] is indexed by round number: 3 and 16..63 are written in the first pass, 64..123 in the second.
+
+	u W[124];
+	u Vals[8];
+
+//Dummy Variable to prevent compiler from reordering between rounds
+	u t1;
+	// Vals[0] and Vals[4] are assigned further down, once the nonce in W[3] is known.
+	//Vals[0]=state0;
+	Vals[1]=B1;
+	Vals[2]=C1;
+	Vals[3]=D1;
+	//Vals[4]=PreVal4;
+	Vals[5]=F1;
+	Vals[6]=G1;
+	Vals[7]=H1;
+
+	W[16] = W16;
+	W[17] = W17;
+
+#ifdef VECTORS4
+	//Less dependencies to get both the local id and group id and then add them
+	W[3] = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
+	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
+	//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
+	W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
+#elif defined VECTORS2
+	W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
+	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
+	W[18] = PreW18 + (u){r, r ^ 0x2004000U};
+#else
+	W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
+	u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
+	W[18] = PreW18 + r;
+#endif
+	//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions
+
+
+
+	Vals[4] = PreVal4 + W[3];
+	Vals[0] = PreVal0 + W[3];
+	// First pass: rounds 4..15 take their schedule word from ConstW (sharoundC); W[19..32] are filled in between them.
+	sharoundC(4);
+	W[19] = PreW19 + W[3];
+	sharoundC(5);
+	W[20] = P4C(20) + P1(20);
+	sharoundC(6);
+	W[21] = P1(21);
+	sharoundC(7);
+	W[22] = P3C(22) + P1(22);
+	sharoundC(8);
+	W[23] = W[16] + P1(23);
+	sharoundC(9);
+	W[24] = W[17] + P1(24);
+	sharoundC(10);
+	W[25] = P1(25) + P3(25);
+	W[26] = P1(26) + P3(26);
+	sharoundC(11);
+	W[27] = P1(27) + P3(27);
+	W[28] = P1(28) + P3(28);
+	sharoundC(12);
+	W[29] = P1(29) + P3(29);
+	sharoundC(13);
+	W[30] = P1(30) + P2C(30) + P3(30);
+	W[31] = PreW31 + (P1(31) + P3(31));
+	sharoundC(14);
+	W[32] = PreW32 + (P1(32) + P3(32));
+	sharoundC(15);
+	sharound(16);
+	sharound(17);
+	sharound(18);
+	sharound(19);
+	sharound(20);
+	sharound(21);
+	sharound(22);
+	sharound(23);
+	sharound(24);
+	sharound(25);
+	sharound(26);
+	sharound(27);
+	sharound(28);
+	sharound(29);
+	sharound(30);
+	sharound(31);
+	sharound(32);
+	sharoundW(33);
+	sharoundW(34);
+	sharoundW(35);
+	sharoundW(36);
+	sharoundW(37);	
+	sharoundW(38);
+	sharoundW(39);
+	sharoundW(40);
+	sharoundW(41);
+	sharoundW(42);
+	sharoundW(43);
+	sharoundW(44);
+	sharoundW(45);
+	sharoundW(46);
+	sharoundW(47);
+	sharoundW(48);
+	sharoundW(49);
+	sharoundW(50);
+	sharoundW(51);
+	sharoundW(52);
+	sharoundW(53);
+	sharoundW(54);
+	sharoundW(55);
+	sharoundW(56);
+	sharoundW(57);
+	sharoundW(58);
+	sharoundW(59);
+	sharoundW(60);
+	sharoundW(61);
+	sharoundW(62);
+	sharoundW(63);
+	// Add the caller-supplied state words to the round output; W[64..71] are the first eight schedule words of the second pass.
+	W[64]=state0+Vals[0];
+	W[65]=state1+Vals[1];
+	W[66]=state2+Vals[2];
+	W[67]=state3+Vals[3];
+	W[68]=state4+Vals[4];
+	W[69]=state5+Vals[5];
+	W[70]=state6+Vals[6];
+	W[71]=state7+Vals[7];
+	// Second pass starts from the H[] constants.
+	Vals[0]=H[0];
+	Vals[1]=H[1];
+	Vals[2]=H[2];
+	Vals[3]=H[3];
+	Vals[4]=H[4];
+	Vals[5]=H[5];
+	Vals[6]=H[6];
+	Vals[7]=H[7];
+
+	//sharound(64 + 0); -- written out by hand below; the two literals are presumably the constant parts of that round pre-folded (not derived in this file)
+	const u Temp = (0xb0edbdd0U + K[0]) +  W[64];
+	Vals[7] = Temp + 0x08909ae5U;
+	Vals[3] = 0xa54ff53aU + Temp;
+	
+#define P124(n) P2(n) + P1(n) + P4(n)
+
+
+	W[64 + 16] = + P2(64 + 16) + P4(64 + 16);
+	sharound(64 + 1);
+	W[64 + 17] = P1C(64 + 17) + P2(64 + 17) + P4(64 + 17);
+	sharound(64 + 2);
+	W[64 + 18] = P124(64 + 18);
+	sharound(64 + 3);
+	W[64 + 19] = P124(64 + 19);
+	sharound(64 + 4);
+	W[64 + 20] = P124(64 + 20);
+	sharound(64 + 5);
+	W[64 + 21] = P124(64 + 21);
+	sharound(64 + 6);
+	W[64 + 22] = P4(64 + 22) + P3C(64 + 22) + P2(64 + 22) + P1(64 + 22);
+	sharound(64 + 7);
+	W[64 + 23] = P4(64 + 23) + P3(64 + 23) + P2C(64 + 23) + P1(64 + 23);
+	sharoundC(64 + 8);
+	W[64 + 24] =   P1(64 + 24) + P4C(64 + 24) + P3(64 + 24);
+	sharoundC(64 + 9);
+	W[64 + 25] = P3(64 + 25) + P1(64 + 25);
+	sharoundC(64 + 10);
+	W[64 + 26] = P3(64 + 26) + P1(64 + 26);
+	sharoundC(64 + 11);
+	W[64 + 27] = P3(64 + 27) + P1(64 + 27);
+	sharoundC(64 + 12);
+	W[64 + 28] = P3(64 + 28) + P1(64 + 28);
+	sharoundC(64 + 13);
+	W[64 + 29] = P1(64 + 29) + P3(64 + 29);
+	W[64 + 30] = P3(64 + 30) + P2C(64 + 30) + P1(64 + 30);
+	sharoundC(64 + 14);
+	W[64 + 31] = P4C(64 + 31) + P3(64 + 31) + P2(64 + 31) + P1(64 + 31);
+	sharoundC(64 + 15);
+	sharound(64 + 16);
+	sharound(64 + 17);
+	sharound(64 + 18);
+	sharound(64 + 19);
+	sharound(64 + 20);
+	sharound(64 + 21);
+	sharound(64 + 22);
+	sharound(64 + 23);
+	sharound(64 + 24);
+	sharound(64 + 25);
+	sharound(64 + 26);
+	sharound(64 + 27);
+	sharound(64 + 28);
+	sharound(64 + 29);
+	sharound(64 + 30);
+	sharound(64 + 31);
+	sharoundW(64 + 32);
+	sharoundW(64 + 33);
+	sharoundW(64 + 34);
+	sharoundW(64 + 35);
+	sharoundW(64 + 36);
+	sharoundW(64 + 37);
+	sharoundW(64 + 38);
+	sharoundW(64 + 39);
+	sharoundW(64 + 40);
+	sharoundW(64 + 41);
+	sharoundW(64 + 42);
+	sharoundW(64 + 43);
+	sharoundW(64 + 44);
+	sharoundW(64 + 45);
+	sharoundW(64 + 46);
+	sharoundW(64 + 47);
+	sharoundW(64 + 48);
+	sharoundW(64 + 49);
+	sharoundW(64 + 50);
+	sharoundW(64 + 51);
+	sharoundW(64 + 52);
+	sharoundW(64 + 53);
+	sharoundW(64 + 54);
+	sharoundW(64 + 55);
+	sharoundW(64 + 56);
+	sharoundW(64 + 57);
+	sharoundW(64 + 58);
+	// The rounds after 64 + 58 are not run as sharound calls; what is still needed is folded into W[117], and a lane that ends up zero is reported as a hit below.
+	W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) -
+		(-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64)));
+
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
+// FOUND is the index of the slot written on every hit; NFLAG masks the nonce down to an index in 0..0x7FF.
+#ifdef VECTORS4
+	bool result = W[117].x & W[117].y & W[117].z & W[117].w; // non-zero AND means no lane is zero; a zero AND is only a hint, so each lane is re-checked
+	if (!result) {
+		if (!W[117].x)
+			output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
+		if (!W[117].y)
+			output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
+		if (!W[117].z)
+			output[FOUND] = output[NFLAG & W[3].z] = W[3].z;
+		if (!W[117].w)
+			output[FOUND] = output[NFLAG & W[3].w] = W[3].w;
+	}
+#elif defined VECTORS2
+	bool result = W[117].x & W[117].y; // same conservative pre-check as the four-lane build
+	if (!result) {
+		if (!W[117].x)
+			output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
+		if (!W[117].y)
+			output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
+	}
+#else
+	if (!W[117])
+		output[FOUND] = output[NFLAG & W[3]] = W[3];
+#endif
+}

+ 1353 - 0
poclbm120724.cl

@@ -0,0 +1,1353 @@
+// -ck modified kernel taken from Phoenix taken from poclbm, with aspects of
+// phatk and others.
+// Modified version copyright 2011-2012 Con Kolivas
+
+// This file is taken and modified from the public-domain poclbm project, and
+// we have therefore decided to keep it public-domain in Phoenix.
+
+// u is the word type used throughout this kernel: uint4 when VECTORS4 is
+// defined, uint2 when VECTORS2 is defined, and a plain uint otherwise.
+#if defined(VECTORS4)
+	typedef uint4 u;
+#elif defined(VECTORS2)
+	typedef uint2 u;
+#else
+	typedef uint u;
+#endif
+
+// The 64 SHA-256 round constants (FIPS 180-4, section 4.2.2), kept in
+// __constant memory. The search kernel below reads them as K[i], where i is
+// the round number.
+__constant uint K[64] = { 
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+
+// This part is not from the stock poclbm kernel. It's part of an optimization
+// added in the Phoenix Miner.
+
+// rotr(x, y): rotate every 32-bit lane of x right by y bits.
+//   BITALIGN:  amd_bitalign(x, x, y) shifts the 64-bit pair x:x right by y,
+//              which is a right rotate. x is expanded twice, so do not pass
+//              an expression with side effects.
+//   otherwise: rotate() is a LEFT rotate, so rotate by (32 - y) instead.
+// Every macro argument is parenthesized so that compound arguments such as
+// rotr(a ^ b, n + 1) expand with the intended grouping.
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#define rotr(x, y) amd_bitalign((u)(x), (u)(x), (u)(y))
+#else
+	#define rotr(x, y) rotate((u)(x), (u)(32 - (y)))
+#endif
+
+// Some AMD devices have a BFI_INT opcode, which behaves exactly like the
+// SHA-256 ch function, but provides it in exactly one instruction. If
+// detected, use it for ch. Otherwise, construct ch out of simpler logical
+// primitives.
+//
+// ch(x, y, z):  per bit, y where x is set, else z.
+// Ma(x, y, z):  per bit, the majority value of x, y and z (x where x == z,
+//               otherwise y).
+// Ma2(x, y, z): same result as Ma, but always built from bitselect.
+#ifdef BFI_INT
+	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
+	// OpenCL (or CAL IL for that matter) in any way. However, there is
+	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
+	// amd_bytealign, takes the same inputs, and provides the same output.
+	// We can use that as a placeholder for BFI_INT and have the application
+	// patch it after compilation.
+
+	// This is the BFI_INT function
+	#define ch(x, y, z) amd_bytealign(x, y, z)
+
+	// Ma can also be implemented in terms of BFI_INT...
+	#define Ma(x, y, z) amd_bytealign(((z) ^ (x)), (y), (x))
+
+	// AMD's KernelAnalyzer throws errors compiling the kernel if we use
+	// amd_bytealign on constants with vectors enabled, so we use this to avoid
+	// problems. (this is used 4 times, and likely optimized out by the compiler.)
+	#define Ma2(x, y, z) bitselect((u)(x), (u)(y), (u)(z) ^ (u)(x))
+#else // BFI_INT
+	// GCN actually fails if manually patched with BFI_INT
+
+	#define ch(x, y, z) bitselect((u)(z), (u)(y), (u)(x))
+	#define Ma(x, y, z) bitselect((u)(x), (u)(y), (u)(z) ^ (u)(x))
+	#define Ma2(x, y, z) Ma(x, y, z)
+#endif
+
+
+__kernel
+__attribute__((vec_type_hint(u)))
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+void search(const uint state0, const uint state1, const uint state2, const uint state3,
+	const uint state4, const uint state5, const uint state6, const uint state7,
+	const uint b1, const uint c1,
+	const uint f1, const uint g1, const uint h1,
+#ifndef GOFFSET
+	const u base,
+#endif
+	const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r,
+	const uint D1A, const uint C1addK5, const uint B1addK6,
+	const uint W16addK16, const uint W17addK17,
+	const uint PreVal4addT1, const uint Preval0,
+	__global uint * output)
+{
+	u Vals[24];
+	u *W = &Vals[8];
+
+#ifdef GOFFSET
+	const u nonce = (uint)(get_global_id(0));
+#else
+	const u nonce = base + (uint)(get_global_id(0));
+#endif
+
+Vals[5]=Preval0;
+Vals[5]+=nonce;
+
+Vals[0]=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],b1,c1);
+Vals[0]+=D1A;
+
+Vals[2]=Vals[0];
+Vals[2]+=h1;
+
+Vals[1]=PreVal4addT1;
+Vals[1]+=nonce;
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+
+Vals[6]=C1addK5;
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],b1);
+
+Vals[3]=Vals[6];
+Vals[3]+=g1;
+Vals[0]+=Ma2(g1,Vals[1],f1);
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma2(f1,Vals[0],Vals[1]);
+
+Vals[7]=B1addK6;
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+
+Vals[4]=Vals[7];
+Vals[4]+=f1;
+
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[7];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[8];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[9];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[10];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[11];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[12];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[13];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[14];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=0xC19BF3F4U;
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=W16addK16;
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=W17addK17;
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]=(rotr(nonce,7)^rotr(nonce,18)^(nonce>>3U));
+W[2]+=fw2;
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[18];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]=nonce;
+W[3]+=fw3;
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[19];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+W[4]+=0x80000000U;
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[20];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[21];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+W[6]+=0x00000280U;
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[22];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+W[7]+=fw0;
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[23];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+W[8]+=fw1;
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[24];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[25];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[26];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[27];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[28];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[29];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]=0x00a00055U;
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[30];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]=fw15;
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[31];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]=fw01r;
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[32];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]=fw1;
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[33];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[34];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[35];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[36];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[37];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[38];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[39];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[40];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[41];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[42];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+W[11]+=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[43];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+W[12]+=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[44];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U));
+W[13]+=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[45];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U));
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[46];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[47];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[48];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[49];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[50];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[51];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[52];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[53];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[54];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[55];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[56];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[57];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[58];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+W[11]+=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[59];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+W[12]+=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[60];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U));
+W[13]+=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[61];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=W[14];
+Vals[7]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U));
+Vals[7]+=W[7];
+Vals[7]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[62];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=W[15];
+Vals[5]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+Vals[5]+=W[8];
+Vals[5]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[63];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[5]+=state0;
+
+W[7]=state7;
+W[7]+=Vals[2];
+
+Vals[2]=0xF377ED68U;
+Vals[2]+=Vals[5];
+
+W[3]=state3;
+W[3]+=Vals[0];
+
+Vals[0]=0xa54ff53aU;
+Vals[0]+=Vals[2];
+Vals[2]+=0x08909ae5U;
+
+W[6]=state6;
+W[6]+=Vals[3];
+
+Vals[3]=0x90BB1E3CU;
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=(0x9b05688cU^(Vals[0]&0xca0b3af3U));
+
+Vals[7]+=state1;
+Vals[3]+=Vals[7];
+
+W[2]=state2;
+W[2]+=Vals[6];
+
+Vals[6]=0x3c6ef372U;
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma2(0xbb67ae85U,Vals[2],0x6a09e667U);
+
+W[5]=state5;
+W[5]+=Vals[4];
+
+Vals[4]=0x50C6645BU;
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],0x510e527fU);
+Vals[4]+=W[2];
+
+W[1]=Vals[7];
+Vals[7]=0xbb67ae85U;
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma2(0x6a09e667U,Vals[3],Vals[2]);
+
+W[4]=state4;
+W[4]+=Vals[1];
+
+Vals[1]=0x3AC42E24U;
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=W[3];
+
+W[0]=Vals[5];
+
+Vals[5]=Vals[1];
+Vals[5]+=0x6a09e667U;
+
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[4];
+Vals[0]+=W[4];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[5];
+Vals[6]+=W[5];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[6];
+Vals[7]+=W[6];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[7];
+Vals[5]+=W[7];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=0x5807AA98U;
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[9];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[10];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[11];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[12];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[13];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[14];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=0xC19BF274U;
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[16];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=0x00a00000U;
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[17];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[18];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[19];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[20];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[21];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=0x00000100U;
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[22];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=0x11002000U;
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[23];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]=0x80000000U;
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[24];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[25];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[26];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[27];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[28];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[29];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]=0x00400022U;
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[30];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]=0x00000100U;
+W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[31];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[32];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[33];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[34];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[35];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[36];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[37];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[38];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[39];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[40];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[41];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[42];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+W[11]+=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[43];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+W[12]+=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[44];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U));
+W[13]+=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[45];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U));
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[46];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[47];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[48];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[49];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[50];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[51];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[52];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[53];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[54];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[55];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[56];
+Vals[0]+=Vals[2];
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[57];
+Vals[3]+=Vals[6];
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[4]+=ch(Vals[3],Vals[0],Vals[1]);
+Vals[4]+=K[58];
+Vals[4]+=Vals[7];
+Vals[1]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[1]+=ch(Vals[4],Vals[3],Vals[0]);
+Vals[1]+=W[11];
+Vals[1]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+Vals[1]+=W[4];
+Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=K[59];
+Vals[1]+=Vals[5];
+
+#define FOUND (0x800)
+#define NFLAG (0x7FF)
+
+#if defined(VECTORS2) || defined(VECTORS4)
+	Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+	Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+	Vals[2]+=W[12];
+	Vals[2]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+	Vals[2]+=W[5];
+	Vals[2]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+	Vals[2]+=Vals[0];
+	Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+	Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+
+	if (any(Vals[2] == 0x136032edU)) {
+		if (Vals[2].x == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.x] = nonce.x;
+		if (Vals[2].y == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.y] = nonce.y;
+#if defined(VECTORS4)
+		if (Vals[2].z == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.z] = nonce.z;
+		if (Vals[2].w == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.w] = nonce.w;
+#endif
+	}
+#else
+	if ((Vals[2]+
+		Ma(Vals[6],Vals[5],Vals[7])+
+		(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22))+
+		W[12]+
+		(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U))+
+		W[5]+
+		(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U))+
+		Vals[0]+
+		(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25))+
+		ch(Vals[1],Vals[4],Vals[3])) == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce] =  nonce;
+#endif
+}

+ 757 - 0
scrypt120724.cl

@@ -0,0 +1,757 @@
+#define rotl(x,y) rotate(x,y) /* left-rotate via the OpenCL rotate() builtin */
+#define Ch(x,y,z) bitselect(z,y,x) /* per bit: y where x is set, otherwise z */
+#define Maj(x,y,z) Ch((x^z),y,z) /* per bit: y where x and z differ, otherwise z (i.e. the majority of x,y,z) */
+/* EndianSwap: reverses the four bytes of each 32-bit element. The argument is not parenthesised and is expanded twice, so pass a plain variable. */
+#define EndianSwap(n) (rotl(n&0x00FF00FF,24U)|rotl(n&0xFF00FF00,8U))
+/* Tr2/Tr1 xor three left-rotations (30,19,10 and 26,21,7); Wr2/Wr1 xor two left-rotations with a right shift (25,14,>>3 and 15,13,>>10). A left-rotate by r equals a right-rotate by 32-r. */
+#define Tr2(x)		(rotl(x, 30U) ^ rotl(x, 19U) ^ rotl(x, 10U))
+#define Tr1(x)		(rotl(x, 26U) ^ rotl(x, 21U) ^ rotl(x, 7U))
+#define Wr2(x)		(rotl(x, 25U) ^ rotl(x, 14U) ^ (x>>3U))
+#define Wr1(x)		(rotl(x, 15U) ^ rotl(x, 13U) ^ (x>>10U))
+/* RND: one round step on working words a..h with addend k (callers pass message word + round constant). Only d and h are written. It expands to three bare statements (no do/while wrapper), so use it only as a full statement, never as the unbraced body of an if/for. */
+#define RND(a, b, c, d, e, f, g, h, k)			\
+	h += Tr1(e) + Ch(e, f, g) + k;		\
+	d += h;					\
+	h += Tr2(a) + Maj(a, b, c);
+/*
+ * SHA256: runs 64 RND steps over one 16-word block and adds the result
+ * into the caller's state.
+ *
+ * state0/state1  in/out: the eight state words (state0 = words 0..3,
+ *                state1 = words 4..7). They are copied to locals, mixed,
+ *                and the mixed values are added back at the end.
+ * block0..block3 the 16 message words, four per vector, in .x .y .z .w order.
+ *
+ * Instead of shuffling eight variables after every step, each RND call
+ * rotates the order of its A..H arguments by one position; the pattern
+ * repeats every eight steps.
+ */
+void SHA256(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
+{
+	uint4 S0 = *state0;
+	uint4 S1 = *state1;
+	/* A..H alias the lanes of the local copies S0/S1; they are #undef'd before the final add. */
+#define A S0.x
+#define B S0.y
+#define C S0.z
+#define D S0.w
+#define E S1.x
+#define F S1.y
+#define G S1.z
+#define H S1.w
+	/* W is a 16-word window: W[i].x/.y/.z/.w hold words 4i .. 4i+3. */
+	uint4 W[4];
+
+	W[ 0].x = block0.x;
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x428a2f98U);
+	W[ 0].y = block0.y;
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x71374491U);
+	W[ 0].z = block0.z;
+	RND(G,H,A,B,C,D,E,F, W[0].z+0xb5c0fbcfU);
+	W[ 0].w = block0.w;
+	RND(F,G,H,A,B,C,D,E, W[0].w+0xe9b5dba5U);
+
+	W[ 1].x = block1.x;
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU);
+	W[ 1].y = block1.y;
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U);
+	W[ 1].z = block1.z;
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U);
+	W[ 1].w = block1.w;
+	RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U);
+
+	W[ 2].x = block2.x;
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U);
+	W[ 2].y = block2.y;
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U);
+	W[ 2].z = block2.z;
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU);
+	W[ 2].w = block2.w;
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U);
+
+	W[ 3].x = block3.x;
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U);
+	W[ 3].y = block3.y;
+	RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU);
+	W[ 3].z = block3.z;
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U);
+	W[ 3].w = block3.w;
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U);
+	/* Steps 16..63: each window slot is overwritten in place (old value + Wr1 of the word 2 back + the word 7 back + Wr2 of the word 15 back) just before it is consumed, so statement order matters. */
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U);
+
+	W[ 0].x += Wr1(W[ 3].z) + W[ 2].y + Wr2(W[ 0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U);
+
+	W[ 0].y += Wr1(W[ 3].w) + W[ 2].z + Wr2(W[ 0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U);
+
+	W[ 0].z += Wr1(W[ 0].x) + W[ 2].w + Wr2(W[ 0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU);
+
+	W[ 0].w += Wr1(W[ 0].y) + W[ 3].x + Wr2(W[ 1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U);
+
+	W[ 1].x += Wr1(W[ 0].z) + W[ 3].y + Wr2(W[ 1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U);
+
+	W[ 1].y += Wr1(W[ 0].w) + W[ 3].z + Wr2(W[ 1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU);
+
+	W[ 1].z += Wr1(W[ 1].x) + W[ 3].w + Wr2(W[ 1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU);
+
+	W[ 1].w += Wr1(W[ 1].y) + W[ 0].x + Wr2(W[ 2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U);
+
+	W[ 2].x += Wr1(W[ 1].z) + W[ 0].y + Wr2(W[ 2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU);
+
+	W[ 2].y += Wr1(W[ 1].w) + W[ 0].z + Wr2(W[ 2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU);
+
+	W[ 2].z += Wr1(W[ 2].x) + W[ 0].w + Wr2(W[ 2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U);
+
+	W[ 2].w += Wr1(W[ 2].y) + W[ 1].x + Wr2(W[ 3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U);
+
+	W[ 3].x += Wr1(W[ 2].z) + W[ 1].y + Wr2(W[ 3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU);
+
+	W[ 3].y += Wr1(W[ 2].w) + W[ 1].z + Wr2(W[ 3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU);
+
+	W[ 3].z += Wr1(W[ 3].x) + W[ 1].w + Wr2(W[ 3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U);
+
+	W[ 3].w += Wr1(W[ 3].y) + W[ 2].x + Wr2(W[ 0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U);
+	
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+	/* Feed-forward: add the mixed working words into the caller's state. */
+	*state0 += S0;
+	*state1 += S1;
+}
+/*
+ * SHA256_fresh: variant of SHA256() that does not read the incoming
+ * *state0 / *state1. The first four steps assign the working words
+ * directly from literal constants and block0, the remaining 60 steps use
+ * RND, and the final add uses the SHA-256 initial hash values as literals.
+ *
+ * state0/state1  out only: fully overwritten (they may be uninitialised on
+ *                entry). Here A..H alias the pointed-to state directly,
+ *                there is no local copy.
+ * block0..block3 the 16 message words, four per vector.
+ *
+ * NOTE(review): the literals in the first four steps look like rounds 0..3
+ * with the initial hash values folded in ahead of time - confirm against a
+ * reference implementation before changing any of them.
+ */
+void SHA256_fresh(uint4*restrict state0,uint4*restrict state1, const uint4 block0, const uint4 block1, const uint4 block2, const uint4 block3)
+{
+#define A (*state0).x
+#define B (*state0).y
+#define C (*state0).z
+#define D (*state0).w
+#define E (*state1).x
+#define F (*state1).y
+#define G (*state1).z
+#define H (*state1).w
+	/* W is a 16-word window: W[i].x/.y/.z/.w hold words 4i .. 4i+3. */
+	uint4 W[4];
+	/* Hand-written first four steps: D,H then C,G then B,F then A,E are assigned with '=' (not '+='), which is why the old state is never read. */
+	W[0].x = block0.x;
+	D=0x98c7e2a2U+W[0].x;
+	H=0xfc08884dU+W[0].x;
+
+	W[0].y = block0.y;
+	C=0xcd2a11aeU+Tr1(D)+Ch(D,0x510e527fU,0x9b05688cU)+W[0].y;
+	G=0xC3910C8EU+C+Tr2(H)+Ch(H,0xfb6feee7U,0x2a01a605U);
+
+	W[0].z = block0.z;
+	B=0x0c2e12e0U+Tr1(C)+Ch(C,D,0x510e527fU)+W[0].z;
+	F=0x4498517BU+B+Tr2(G)+Maj(G,H,0x6a09e667U);
+
+	W[0].w = block0.w;
+	A=0xa4ce148bU+Tr1(B)+Ch(B,C,D)+W[0].w; 
+	E=0x95F61999U+A+Tr2(F)+Maj(F,G,H);
+	/* From here on the steps use the generic RND macro with the rotating A..H argument order. */
+	W[1].x = block1.x;
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x3956c25bU);
+	W[1].y = block1.y;
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x59f111f1U);
+	W[1].z = block1.z;
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x923f82a4U);
+	W[1].w = block1.w;
+	RND(B,C,D,E,F,G,H,A, W[1].w+0xab1c5ed5U);
+	
+	W[2].x = block2.x;
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xd807aa98U);
+	W[2].y = block2.y;
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x12835b01U);
+	W[2].z = block2.z;
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x243185beU);
+	W[2].w = block2.w;
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x550c7dc3U);
+	
+	W[3].x = block3.x;
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x72be5d74U);
+	W[3].y = block3.y;
+	RND(D,E,F,G,H,A,B,C, W[3].y+0x80deb1feU);
+	W[3].z = block3.z;
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x9bdc06a7U);
+	W[3].w = block3.w;
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc19bf174U);
+	/* Steps 16..63: each window slot is overwritten in place just before it is consumed, so statement order matters. */
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0xe49b69c1U);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0xefbe4786U);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x0fc19dc6U);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x240ca1ccU);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x2de92c6fU);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4a7484aaU);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5cb0a9dcU);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x76f988daU);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x983e5152U);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa831c66dU);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xb00327c8U);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xbf597fc7U);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xc6e00bf3U);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd5a79147U);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0x06ca6351U);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x14292967U);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x27b70a85U);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x2e1b2138U);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x4d2c6dfcU);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x53380d13U);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x650a7354U);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x766a0abbU);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x81c2c92eU);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x92722c85U);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0xa2bfe8a1U);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0xa81a664bU);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0xc24b8b70U);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0xc76c51a3U);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0xd192e819U);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xd6990624U);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xf40e3585U);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0x106aa070U);
+
+	W[0].x += Wr1(W[3].z) + W[2].y + Wr2(W[0].y);
+	RND(A,B,C,D,E,F,G,H, W[0].x+0x19a4c116U);
+
+	W[0].y += Wr1(W[3].w) + W[2].z + Wr2(W[0].z);
+	RND(H,A,B,C,D,E,F,G, W[0].y+0x1e376c08U);
+
+	W[0].z += Wr1(W[0].x) + W[2].w + Wr2(W[0].w);
+	RND(G,H,A,B,C,D,E,F, W[0].z+0x2748774cU);
+
+	W[0].w += Wr1(W[0].y) + W[3].x + Wr2(W[1].x);
+	RND(F,G,H,A,B,C,D,E, W[0].w+0x34b0bcb5U);
+
+	W[1].x += Wr1(W[0].z) + W[3].y + Wr2(W[1].y);
+	RND(E,F,G,H,A,B,C,D, W[1].x+0x391c0cb3U);
+
+	W[1].y += Wr1(W[0].w) + W[3].z + Wr2(W[1].z);
+	RND(D,E,F,G,H,A,B,C, W[1].y+0x4ed8aa4aU);
+
+	W[1].z += Wr1(W[1].x) + W[3].w + Wr2(W[1].w);
+	RND(C,D,E,F,G,H,A,B, W[1].z+0x5b9cca4fU);
+
+	W[1].w += Wr1(W[1].y) + W[0].x + Wr2(W[2].x);
+	RND(B,C,D,E,F,G,H,A, W[1].w+0x682e6ff3U);
+
+	W[2].x += Wr1(W[1].z) + W[0].y + Wr2(W[2].y);
+	RND(A,B,C,D,E,F,G,H, W[2].x+0x748f82eeU);
+
+	W[2].y += Wr1(W[1].w) + W[0].z + Wr2(W[2].z);
+	RND(H,A,B,C,D,E,F,G, W[2].y+0x78a5636fU);
+
+	W[2].z += Wr1(W[2].x) + W[0].w + Wr2(W[2].w);
+	RND(G,H,A,B,C,D,E,F, W[2].z+0x84c87814U);
+
+	W[2].w += Wr1(W[2].y) + W[1].x + Wr2(W[3].x);
+	RND(F,G,H,A,B,C,D,E, W[2].w+0x8cc70208U);
+
+	W[3].x += Wr1(W[2].z) + W[1].y + Wr2(W[3].y);
+	RND(E,F,G,H,A,B,C,D, W[3].x+0x90befffaU);
+
+	W[3].y += Wr1(W[2].w) + W[1].z + Wr2(W[3].z);
+	RND(D,E,F,G,H,A,B,C, W[3].y+0xa4506cebU);
+
+	W[3].z += Wr1(W[3].x) + W[1].w + Wr2(W[3].w);
+	RND(C,D,E,F,G,H,A,B, W[3].z+0xbef9a3f7U);
+
+	W[3].w += Wr1(W[3].y) + W[2].x + Wr2(W[0].x);
+	RND(B,C,D,E,F,G,H,A, W[3].w+0xc67178f2U);
+	
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+	/* Feed-forward with the initial hash values as literals, since there was no prior state to add. */
+	*state0 += (uint4)(0x6A09E667U,0xBB67AE85U,0x3C6EF372U,0xA54FF53AU);
+	*state1 += (uint4)(0x510E527FU,0x9B05688CU,0x1F83D9ABU,0x5BE0CD19U);
+}
+/*
+ * fixedW: 64 per-step addends consumed by SHA256_fixed(), one per RND step.
+ * Entries 2..14 are identical to the round constants written inline in
+ * SHA256(); entries 0, 1 and 15 exceed them by 1, 0x80000000 and 0x620.
+ * NOTE(review): this looks like (round constant + expanded message word)
+ * precomputed for one constant 16-word block {1, 0x80000000, 0, ..., 0x620};
+ * entries 16..63 cannot be checked from this file alone - confirm before
+ * editing any value.
+ */
+__constant uint fixedW[64] =
+{
+	0x428a2f99,0xf1374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
+	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf794,
+	0xf59b89c2,0x73924787,0x23c6886e,0xa42ca65c,0x15ed3627,0x4d6edcbf,0xe28217fc,0xef02488f,
+	0xb707775c,0x0468c23f,0xe7e72b4c,0x49e1f1a2,0x4b99c816,0x926d1570,0xaa0fc072,0xadb36e2c,
+	0xad87a3ea,0xbcb1d3a3,0x7b993186,0x562b9420,0xbff3ca0c,0xda4b0c23,0x6cd8711a,0x8f337caa,
+	0xc91b1417,0xc359dce1,0xa83253a7,0x3b13c12d,0x9d3d725d,0xd9031a84,0xb1a03340,0x16f58012,
+	0xe64fb6a2,0xe84d923a,0xe93a5730,0x09837686,0x078ff753,0x29833341,0xd5de0b7e,0x6948ccf4,
+	0xe0a1adbe,0x7c728e11,0x511c78e4,0x315b45bd,0xfca71413,0xea28f96a,0x79703128,0x4e1ef848,
+};
+
+/*
+ * SHA256_fixed: applies 64 RND steps to a copy of the state, taking the
+ * per-step addend from fixedW[0..63] instead of computing a message
+ * schedule, then adds the mixed words back into *state0 / *state1.
+ *
+ * The A..H argument order of RND repeats every eight steps, so the 64
+ * steps are written as eight passes over one group of eight.
+ */
+void SHA256_fixed(uint4*restrict state0,uint4*restrict state1)
+{
+	uint4 lo = *state0;
+	uint4 hi = *state1;
+
+#define A lo.x
+#define B lo.y
+#define C lo.z
+#define D lo.w
+#define E hi.x
+#define F hi.y
+#define G hi.z
+#define H hi.w
+
+#pragma unroll
+	for(uint base=0; base<64; base+=8)
+	{
+		RND(A,B,C,D,E,F,G,H, fixedW[base]);
+		RND(H,A,B,C,D,E,F,G, fixedW[base+1]);
+		RND(G,H,A,B,C,D,E,F, fixedW[base+2]);
+		RND(F,G,H,A,B,C,D,E, fixedW[base+3]);
+		RND(E,F,G,H,A,B,C,D, fixedW[base+4]);
+		RND(D,E,F,G,H,A,B,C, fixedW[base+5]);
+		RND(C,D,E,F,G,H,A,B, fixedW[base+6]);
+		RND(B,C,D,E,F,G,H,A, fixedW[base+7]);
+	}
+
+#undef A
+#undef B
+#undef C
+#undef D
+#undef E
+#undef F
+#undef G
+#undef H
+	/* Feed-forward into the caller's state. */
+	*state0 += lo;
+	*state1 += hi;
+}
+
+/*
+ * shittify: in-place lane shuffle plus per-word byte swap of the eight
+ * vectors in B, done independently on B[0..3] and B[4..7]. Output vector k
+ * of a half takes .x/.y/.z from inputs (k+1)%4, (k+2)%4, (k+3)%4 and .w
+ * from input k. unshittify() applies the inverse shuffle.
+ */
+void shittify(uint4 B[8])
+{
+	/* Gather every shuffled vector of a half before storing any of them:
+	 * each one reads lanes from all four inputs of that half. */
+	const uint4 lo0 = (uint4)(B[1].x,B[2].y,B[3].z,B[0].w);
+	const uint4 lo1 = (uint4)(B[2].x,B[3].y,B[0].z,B[1].w);
+	const uint4 lo2 = (uint4)(B[3].x,B[0].y,B[1].z,B[2].w);
+	const uint4 lo3 = (uint4)(B[0].x,B[1].y,B[2].z,B[3].w);
+
+	B[0] = EndianSwap(lo0);
+	B[1] = EndianSwap(lo1);
+	B[2] = EndianSwap(lo2);
+	B[3] = EndianSwap(lo3);
+
+	const uint4 hi0 = (uint4)(B[5].x,B[6].y,B[7].z,B[4].w);
+	const uint4 hi1 = (uint4)(B[6].x,B[7].y,B[4].z,B[5].w);
+	const uint4 hi2 = (uint4)(B[7].x,B[4].y,B[5].z,B[6].w);
+	const uint4 hi3 = (uint4)(B[4].x,B[5].y,B[6].z,B[7].w);
+
+	B[4] = EndianSwap(hi0);
+	B[5] = EndianSwap(hi1);
+	B[6] = EndianSwap(hi2);
+	B[7] = EndianSwap(hi3);
+}
+
+/*
+ * unshittify: inverse of shittify(). Independently on B[0..3] and B[4..7],
+ * output vector k takes .x/.y/.z from inputs (k+3)%4, (k+2)%4, (k+1)%4 and
+ * .w from input k, then every word is byte-swapped again. Applying it after
+ * shittify() restores the original lane placement and byte order.
+ */
+void unshittify(uint4 B[8])
+{
+	/* Gather every shuffled vector of a half before storing any of them:
+	 * each one reads lanes from all four inputs of that half. */
+	const uint4 lo0 = (uint4)(B[3].x,B[2].y,B[1].z,B[0].w);
+	const uint4 lo1 = (uint4)(B[0].x,B[3].y,B[2].z,B[1].w);
+	const uint4 lo2 = (uint4)(B[1].x,B[0].y,B[3].z,B[2].w);
+	const uint4 lo3 = (uint4)(B[2].x,B[1].y,B[0].z,B[3].w);
+
+	B[0] = EndianSwap(lo0);
+	B[1] = EndianSwap(lo1);
+	B[2] = EndianSwap(lo2);
+	B[3] = EndianSwap(lo3);
+
+	const uint4 hi0 = (uint4)(B[7].x,B[6].y,B[5].z,B[4].w);
+	const uint4 hi1 = (uint4)(B[4].x,B[7].y,B[6].z,B[5].w);
+	const uint4 hi2 = (uint4)(B[5].x,B[4].y,B[7].z,B[6].w);
+	const uint4 hi3 = (uint4)(B[6].x,B[5].y,B[4].z,B[7].w);
+
+	B[4] = EndianSwap(hi0);
+	B[5] = EndianSwap(hi1);
+	B[6] = EndianSwap(hi2);
+	B[7] = EndianSwap(hi3);
+}
+
+/*
+ * salsa_rounds: four iterations of eight add-rotate-xor updates on the
+ * working vectors w[0..3]. The first four updates of an iteration combine
+ * whole vectors; the last four combine lane-rotated (.wxyz / .zwxy) copies.
+ * Each update reads results of the previous ones, so order is significant.
+ */
+void salsa_rounds(uint4 w[4])
+{
+#pragma unroll
+	for(uint r=0; r<4; ++r)
+	{
+		w[0] ^= rotl(w[3]     +w[2]     , 7U);
+		w[1] ^= rotl(w[0]     +w[3]     , 9U);
+		w[2] ^= rotl(w[1]     +w[0]     ,13U);
+		w[3] ^= rotl(w[2]     +w[1]     ,18U);
+		w[2] ^= rotl(w[3].wxyz+w[0].zwxy, 7U);
+		w[1] ^= rotl(w[2].wxyz+w[3].zwxy, 9U);
+		w[0] ^= rotl(w[1].wxyz+w[2].zwxy,13U);
+		w[3] ^= rotl(w[0].wxyz+w[1].zwxy,18U);
+	}
+}
+
+/*
+ * salsa: mixes the eight vectors of B in place, in two passes.
+ *   pass 1: B[0..3] ^= B[4..7]; run the rounds on a copy; add the copy back
+ *           into B[0..3].
+ *   pass 2: B[4..7] ^= B[0..3]; run the rounds on a copy; add the copy back
+ *           into B[4..7].
+ * NOTE(review): the name and the scrypt context suggest B is expected in the
+ * lane layout produced by shittify() - confirm against the callers.
+ */
+void salsa(uint4 B[8])
+{
+	uint4 w[4];
+
+	/* Pass 1: fold the upper half into the lower half and snapshot it. */
+#pragma unroll
+	for(uint k=0; k<4; ++k)
+	{
+		B[k] ^= B[k+4];
+		w[k] = B[k];
+	}
+	salsa_rounds(w);
+
+	/* Finish pass 1 (feed-forward into the lower half), then start pass 2
+	 * by folding the updated lower half into the upper half. */
+#pragma unroll
+	for(uint k=0; k<4; ++k)
+	{
+		B[k] += w[k];
+		B[k+4] ^= B[k];
+		w[k] = B[k+4];
+	}
+	salsa_rounds(w);
+
+	/* Finish pass 2: feed-forward into the upper half. */
+#pragma unroll
+	for(uint k=0; k<4; ++k)
+		B[k+4] += w[k];
+}
+/* Coord(x,y,z) flattens a 3-D index with its first argument varying fastest. It token-pastes SIZE onto the names of its first two arguments, so variables called <name>SIZE must be in scope at the point of use. CO is Coord(z,x,y), i.e. z + x*zSIZE + y*xSIZE*zSIZE. Neither expansion is parenthesised; in this file CO is only used as an array subscript. */
+#define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
+#define CO Coord(z,x,y)
+
+/*
+ * scrypt_core: the memory-hard mixing stage, operating in place on X.
+ *
+ * X       in/out: eight working vectors. They are shuffled with shittify()
+ *         on entry and restored with unshittify() on exit.
+ * lookup  global scratch buffer shared by all work-items. It is addressed
+ *         through CO as [y][x][z]: z (0..7) selects the vector, x is this
+ *         work-item's slot (global id modulo CONCURRENT_THREADS) and y is
+ *         the stored row.
+ *
+ * Phase 1 walks 1024 salsa() steps and stores a snapshot of X only every
+ * LOOKUP_GAP steps (ySIZE rows in total). Phase 2 performs 1024 data-
+ * dependent reads: row j/LOOKUP_GAP is loaded and, when j is not a multiple
+ * of LOOKUP_GAP, advanced by j%LOOKUP_GAP extra salsa() calls to rebuild the
+ * snapshot that was not stored.
+ *
+ * LOOKUP_GAP and CONCURRENT_THREADS are compile-time definitions supplied
+ * from outside this file.
+ */
+void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
+{
+	shittify(X);
+	const uint zSIZE = 8;
+	/* Number of stored rows: ceil(1024 / LOOKUP_GAP). Not used by CO itself;
+	 * it records how many rows the layout accounts for. */
+	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
+	const uint xSIZE = CONCURRENT_THREADS;
+	uint x = get_global_id(0)%xSIZE;
+
+	/* Phase 1: store one snapshot, then advance LOOKUP_GAP steps. */
+	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
+	{
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			lookup[CO] = X[z];
+		for(uint i=0; i<LOOKUP_GAP; ++i) 
+			salsa(X);
+	}
+	/* A trailing partial row exists only when LOOKUP_GAP does not divide
+	 * 1024. The guard used to exclude just 1, 2, 4 and 8, so other exact
+	 * divisors (16, 32, ...) still stored a row at y == ySIZE, one row past
+	 * the last row the layout accounts for; that row is never read back
+	 * because phase 2 only uses y = j/LOOKUP_GAP <= ySIZE-1. */
+#if (1024 % LOOKUP_GAP) != 0
+	{
+		uint y = (1024/LOOKUP_GAP);
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			lookup[CO] = X[z];
+		for(uint i=0; i<1024%LOOKUP_GAP; ++i)
+			salsa(X); 
+	}
+#endif
+	/* Phase 2: 1024 lookups whose index comes from the current state. */
+	for (uint i=0; i<1024; ++i) 
+	{
+		uint4 V[8];
+		uint j = X[7].x & 0x3FF;	/* low 10 bits: 0..1023 */
+		uint y = (j/LOOKUP_GAP);
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			V[z] = lookup[CO];
+
+		/* Rebuild snapshot j from the stored row by replaying the skipped
+		 * salsa() steps; nothing to replay when every step was stored. */
+#if (LOOKUP_GAP == 1)
+#elif (LOOKUP_GAP == 2)
+		if (j&1)
+			salsa(V);
+#else
+		uint val = j%LOOKUP_GAP;
+		for (uint z=0; z<val; ++z) 
+			salsa(V);
+#endif
+
+#pragma unroll
+		for(uint z=0; z<zSIZE; ++z)
+			X[z] ^= V[z];
+		salsa(X);
+	}
+	unshittify(X);
+}
+
+/* Index in the output buffer that is written whenever a work item reports a hit. */
+#define FOUND (0x800)
+/* Mask applied to the global id to choose the per-hit slot (0 .. 0x7FF). */
+#define NFLAG (0x7FF)
+
+/*
+ * search: one work item per candidate; the candidate value is the global id.
+ *
+ * input      - five uint4 words; input[0..3] are hashed as one block and
+ *              input[4].xyz plus the global id form the final data word.
+ * output     - result buffer; on a hit the global id is written both to
+ *              output[NFLAG & id] and to output[FOUND].
+ * padcache   - scratch buffer handed to scrypt_core().
+ * midstate0,
+ * midstate16 - starting state for the first SHA256() call.
+ * target     - a hit is reported when EndianSwap(final state .w) <= target.
+ *
+ * NOTE(review): the 0x5C5C5C5C / 0x36363636 XOR constants look like the HMAC
+ * outer/inner pads, which would make this a PBKDF2-HMAC-SHA256 wrapper around
+ * scrypt_core() -- confirm against the SHA256 helpers defined earlier in the
+ * file.
+ */
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(__global const uint4 * restrict input,
+__global uint*restrict output, __global uint4*restrict padcache,
+const uint4 midstate0, const uint4 midstate16, const uint target)
+{
+	uint work_id = get_global_id(0);
+	uint4 X[8];
+	uint4 inner0, inner1, outer0, outer1, saved0, saved1;
+	uint4 last_words = (uint4)(input[4].x,input[4].y,input[4].z,work_id);
+	uint4 h0 = midstate0;
+	uint4 h1 = midstate16;
+
+	/* Finish hashing the input from the supplied midstate. */
+	SHA256(&h0,&h1, last_words, (uint4)(0x80000000U,0,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x280));
+
+	/* Derive the two keyed starting states from that digest. */
+	SHA256_fresh(&outer0,&outer1, h0^0x5C5C5C5CU, h1^0x5C5C5C5CU, 0x5C5C5C5CU, 0x5C5C5C5CU);
+	SHA256_fresh(&inner0,&inner1, h0^0x36363636U, h1^0x36363636U, 0x36363636U, 0x36363636U);
+
+	/* Keep the untouched inner state for the final stage below. */
+	saved0 = inner0;
+	saved1 = inner1;
+	SHA256(&inner0, &inner1, input[0],input[1],input[2],input[3]);
+
+	/* Produce the eight words of X, two per iteration. */
+#pragma unroll
+	for (uint blk=0; blk<4; blk++)
+	{
+		const uint lo = blk*2;
+
+		h0 = inner0;
+		h1 = inner1;
+		X[lo] = outer0;
+		X[lo+1] = outer1;
+
+		SHA256(&h0,&h1, last_words, (uint4)(blk+1,0x80000000U,0,0), (uint4)(0,0,0,0), (uint4)(0,0,0,0x4a0U));
+		SHA256(&X[lo],&X[lo+1], h0, h1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U));
+	}
+
+	scrypt_core(X,padcache);
+
+	/* Hash the mixed X with the saved inner state, then with the outer state. */
+	SHA256(&saved0,&saved1, X[0], X[1], X[2], X[3]);
+	SHA256(&saved0,&saved1, X[4], X[5], X[6], X[7]);
+	SHA256_fixed(&saved0,&saved1);
+	SHA256(&outer0,&outer1, saved0, saved1, (uint4)(0x80000000U, 0U, 0U, 0U), (uint4)(0U, 0U, 0U, 0x300U));
+
+	if (EndianSwap(outer1.w) <= target)
+	{
+		output[NFLAG & work_id] = work_id;
+		output[FOUND] = work_id;
+	}
+}
+
+/*-
+ * Copyright 2009 Colin Percival, 2011 ArtForz, 2011 pooler, 2012 mtrlt,
+ * 2012 Con Kolivas.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * This file was originally written by Colin Percival as part of the Tarsnap
+ * online backup system.
+ */