  // File: diakgcn120208.cl (26 KB)
  // DiaKGCN 04-02-2012 - OpenCL kernel by Diapolo
  //
  // Parts of and/or ideas for this kernel are based upon the public-domain poclbm project,
  // the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3.
  // The kernel was rewritten by me (Diapolo) and is still public-domain!
  // ---------------------------------------------------------------------------
  // Working type: "u" is the type every round variable is computed in.
  // VECTORS4 / VECTORS2 select a 4- or 2-wide vector, otherwise a scalar uint.
  // ---------------------------------------------------------------------------
  #if defined VECTORS4
  typedef uint4 u;
  #elif defined VECTORS2
  typedef uint2 u;
  #else
  typedef uint u;
  #endif

  // ---------------------------------------------------------------------------
  // Ch / Ma: the two three-input bitwise mixing functions used by every round.
  // Macro arguments are parenthesized so that compound expressions can be
  // passed safely.
  // ---------------------------------------------------------------------------
  #ifdef BFI_INT
  #pragma OPENCL EXTENSION cl_amd_media_ops : enable
  // NOTE(review): amd_bytealign is not a bit-select by itself; presumably the
  // host patches the compiled binary so these become BFI_INT - confirm against
  // the host code.
  #define Ch(x, y, z) amd_bytealign(x, y, z)
  #define Ma(x, y, z) amd_bytealign((z) ^ (x), y, x)
  #else
  #define Ch(x, y, z) bitselect(z, y, x)
  #if defined(VECTORS2) || defined(VECTORS4) || defined(VECTORS8)
  // GCN - VEC2 or VEC4
  #define Ma(z, x, y) bitselect(z, y, (z) ^ (x))
  #else
  // GCN - no VEC
  #define Ma(z, x, y) Ch((z) ^ (x), y, x)
  #endif
  #endif

  // ---------------------------------------------------------------------------
  // "uu" is the type of the kernel's "base" argument: always a scalar when
  // GOFFSET is defined, otherwise the same width as "u".
  // ---------------------------------------------------------------------------
  #ifdef GOFFSET
  typedef uint uu;
  #else
  #if defined VECTORS4
  typedef uint4 uu;
  #elif defined VECTORS2
  typedef uint2 uu;
  #else
  typedef uint uu;
  #endif
  #endif

  // ch(n) / ma(n): Ch / Ma applied to the state array V[], with the indices
  // shifted down by one (mod 8) for every increment of the round number n.
  // The "+ 128" keeps the left operand of % non-negative for n <= 128.
  #define ch(n) Ch(V[(4 + 128 - (n)) % 8], V[(5 + 128 - (n)) % 8], V[(6 + 128 - (n)) % 8])
  #define ma(n) Ma(V[(1 + 128 - (n)) % 8], V[(2 + 128 - (n)) % 8], V[(0 + 128 - (n)) % 8])

  // Rotate/shift mixers. rotate() is a LEFT rotate, so in right-rotate terms:
  //   rot15 = ROTR17 ^ ROTR19 ^ SHR10   (SHA-256 small sigma1)
  //   rot25 = ROTR7  ^ ROTR18 ^ SHR3    (SHA-256 small sigma0)
  //   rot26 = ROTR6  ^ ROTR11 ^ ROTR25  (SHA-256 big Sigma1)
  //   rot30 = ROTR2  ^ ROTR13 ^ ROTR22  (SHA-256 big Sigma0)
  #define rot15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ ((n) >> 10U))
  #define rot25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ ((n) >> 3U))
  #define rot26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U))
  #define rot30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U))
  // ---------------------------------------------------------------------------
  // search - test one nonce (or 2 / 4 nonces when VECTORS2 / VECTORS4 is set)
  // per work-item and report every nonce whose final check word matches.
  //
  // Parameters
  //   base            first nonce of this launch. Scalar when GOFFSET is defined;
  //                   otherwise the same width as the working type "u"
  //                   (presumably already carrying the per-lane offsets - confirm
  //                   against the host code).
  //   PreVal4 ... PreW32
  //                   values that do not depend on the nonce; they are only read
  //                   here (presumably precomputed on the host from the first
  //                   part of the hashed data - confirm against the host code).
  //   state0..state7  added to the result of the first block of rounds.
  //   state0A/state0B variants of state0 with constants already folded in (see
  //                   the comments where they are used).
  //   output          result buffer. On a hit the nonce is stored both at
  //                   output[NFLAG & nonce] and at output[FOUND].
  //
  // The rounds are fully unrolled. V[] holds the eight working variables, W[]
  // the message-schedule words; ch(n) / ma(n) select the proper V[] entries for
  // round n, so the state never has to be shifted.
  // ---------------------------------------------------------------------------
  __kernel
  __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  void search( const uu base, const uint PreVal4,
      const uint H1, const uint D1, const uint PreVal0, const uint B1, const uint C1,
      const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7,
      const uint W16addK16, const uint W17addK17,
      const uint PreW18, const uint PreW19,
      const uint W16, const uint W17,
      const uint PreW31, const uint PreW32,
      const uint state0, const uint state1, const uint state2, const uint state3,
      const uint state4, const uint state5, const uint state6, const uint state7,
      const uint state0A, const uint state0B,
      __global ulong * output)
  {
      u W[17];
      u V[8];

      // Derive the nonce(s) handled by this work-item.
      // In the vector GOFFSET paths the global id is cast to uint first: mixing a
      // size_t scalar with a uint vector is rejected by OpenCL C when size_t is
      // 64 bits wide. The value is unchanged modulo 2^32.
  #if defined VECTORS4
  #ifdef GOFFSET
      u nonce = base + ((uint)get_global_id(0) << 2) + (uint4)(0, 1, 2, 3);
  #else
      u nonce = base + (uint)(get_local_id(0)) * 4u + (uint)(get_group_id(0)) * (WORKSIZE * 4u);
  #endif
  #elif defined VECTORS2
  #ifdef GOFFSET
      u nonce = base + ((uint)get_global_id(0) << 1) + (uint2)(0, 1);
  #else
      u nonce = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
  #endif
  #else
  #ifdef GOFFSET
      u nonce = base + get_global_id(0);
  #else
      u nonce = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
  #endif
  #endif

      // First rounds that depend on the nonce. Some Ch / Ma operands are still
      // kernel arguments here, so the calls are written out instead of using
      // ch(n) / ma(n).
      V[4] = PreVal4 + nonce;
      V[7] = H1 + (V[3] = D1 + Ch((PreVal0 + nonce), B1, C1) + rot26(PreVal0 + nonce));
      V[3] += rot30(V[4]) + Ma(F1, G1, V[4]);
      V[6] = G1 + (V[2] = C1addK5 + Ch(V[7], (PreVal0 + nonce), B1) + rot26(V[7]));
      V[2] += rot30(V[3]) + Ma(V[4], F1, V[3]);
      V[5] = F1 + (V[1] = B1addK6 + Ch(V[6], V[7], (PreVal0 + nonce)) + rot26(V[6]));
      V[1] += rot30(V[2]) + Ma(V[3], V[4], V[2]);
      V[4] += nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]);
      V[0] = nonce + PreVal0addK7 + Ch(V[5], V[6], V[7]) + rot26(V[5]) + rot30(V[1]) + Ma(V[2], V[3], V[1]);
      V[3] += 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]);
      V[7] = 0xd807aa98 + V[7] + Ch(V[4], V[5], V[6]) + rot26(V[4]) + rot30(V[0]) + Ma(V[1], V[2], V[0]);
      V[2] += 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]);
      V[6] = 0x12835b01 + V[6] + Ch(V[3], V[4], V[5]) + rot26(V[3]) + rot30(V[7]) + Ma(V[0], V[1], V[7]);
      V[1] += 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]);
      V[5] = 0x243185be + V[5] + Ch(V[2], V[3], V[4]) + rot26(V[2]) + rot30(V[6]) + Ma(V[7], V[0], V[6]);
      V[0] += 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]);
      V[4] = 0x550c7dc3 + V[4] + Ch(V[1], V[2], V[3]) + rot26(V[1]) + rot30(V[5]) + Ma(V[6], V[7], V[5]);

      //--------------- ch() + ma() replaced above ---------------

      // From here on each round is a pair of statements: the first updates the
      // "d + T1" variable, the second computes "T1 + T2" into the freed slot.
      V[7] += 0x72be5d74 + V[3] + ch(12) + rot26(V[0]);
      V[3] = 0x72be5d74 + V[3] + ch(12) + rot26(V[0]) + rot30(V[4]) + ma(12);
      V[6] += 0x80deb1fe + V[2] + ch(13) + rot26(V[7]);
      V[2] = 0x80deb1fe + V[2] + ch(13) + rot26(V[7]) + rot30(V[3]) + ma(13);
      V[5] += 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]);
      V[1] = 0x9bdc06a7 + V[1] + ch(14) + rot26(V[6]) + rot30(V[2]) + ma(14);
      V[4] += 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]);
      V[0] = 0xc19bf3f4 + V[0] + ch(15) + rot26(V[5]) + rot30(V[1]) + ma(15);
      V[3] += W16addK16 + V[7] + ch(16) + rot26(V[4]);
      V[7] = W16addK16 + V[7] + ch(16) + rot26(V[4]) + rot30(V[0]) + ma(16);
      V[2] += W17addK17 + V[6] + ch(17) + rot26(V[3]);
      V[6] = W17addK17 + V[6] + ch(17) + rot26(V[3]) + rot30(V[7]) + ma(17);

      //----------------------------------------------------------------------------------

      // Message schedule for the following rounds; W[0] / W[1] are the first words
      // that depend on the nonce.
      // NOTE(review): the VECTORS8 branch below cannot compile as written, because
      // "u" is never typedef'd to an 8-wide type in this file.
  #ifdef VECTORS8
      W[0] = PreW18 + (u)(rot25(nonce.s0), rot25(nonce.s0) ^ 0x2004000, rot25(nonce.s0) ^ 0x4008000, rot25(nonce.s0) ^ 0x600C000,
                          rot25(nonce.s0) ^ 0x8010000, rot25(nonce.s0) ^ 0xa014000, rot25(nonce.s0) ^ 0xc018000, rot25(nonce.s0) ^ 0xe01c000);
  #elif defined VECTORS4
      W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000, rot25(nonce.x) ^ 0x4008000, rot25(nonce.x) ^ 0x600C000);
  #elif defined VECTORS2
      W[0] = PreW18 + (u)(rot25(nonce.x), rot25(nonce.x) ^ 0x2004000);
  #else
      W[0] = PreW18 + rot25(nonce);
  #endif
      W[1] = PreW19 + nonce;
      W[2] = 0x80000000 + rot15(W[0]);
      W[3] = rot15(W[1]);
      W[4] = 0x00000280 + rot15(W[2]);
      W[5] = W16 + rot15(W[3]);
      W[6] = W17 + rot15(W[4]);
      W[7] = W[0] + rot15(W[5]);
      W[8] = W[1] + rot15(W[6]);
      W[9] = W[2] + rot15(W[7]);
      W[10] = W[3] + rot15(W[8]);
      W[11] = W[4] + rot15(W[9]);
      W[12] = 0x00a00055 + W[5] + rot15(W[10]);
      W[13] = PreW31 + W[6] + rot15(W[11]);
      W[14] = PreW32 + W[7] + rot15(W[12]);
      W[15] = W17 + W[8] + rot15(W[13]) + rot25(W[0]);
      W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]);

      V[1] += 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]);
      V[5] = 0x0fc19dc6 + V[5] + W[0] + ch(18) + rot26(V[2]) + rot30(V[6]) + ma(18);
      V[0] += 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]);
      V[4] = 0x240ca1cc + V[4] + W[1] + ch(19) + rot26(V[1]) + rot30(V[5]) + ma(19);
      V[7] += 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]);
      V[3] = 0x2de92c6f + V[3] + W[2] + ch(20) + rot26(V[0]) + rot30(V[4]) + ma(20);
      V[6] += 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]);
      V[2] = 0x4a7484aa + V[2] + W[3] + ch(21) + rot26(V[7]) + rot30(V[3]) + ma(21);
      V[5] += 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]);
      V[1] = 0x5cb0a9dc + V[1] + W[4] + ch(22) + rot26(V[6]) + rot30(V[2]) + ma(22);
      V[4] += 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]);
      V[0] = 0x76f988da + V[0] + W[5] + ch(23) + rot26(V[5]) + rot30(V[1]) + ma(23);
      V[3] += 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]);
      V[7] = 0x983e5152 + V[7] + W[6] + ch(24) + rot26(V[4]) + rot30(V[0]) + ma(24);
      V[2] += 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]);
      V[6] = 0xa831c66d + V[6] + W[7] + ch(25) + rot26(V[3]) + rot30(V[7]) + ma(25);
      V[1] += 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]);
      V[5] = 0xb00327c8 + V[5] + W[8] + ch(26) + rot26(V[2]) + rot30(V[6]) + ma(26);
      V[0] += 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]);
      V[4] = 0xbf597fc7 + V[4] + W[9] + ch(27) + rot26(V[1]) + rot30(V[5]) + ma(27);
      V[7] += 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]);
      V[3] = 0xc6e00bf3 + V[3] + W[10] + ch(28) + rot26(V[0]) + rot30(V[4]) + ma(28);
      V[6] += 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]);
      V[2] = 0xd5a79147 + V[2] + W[11] + ch(29) + rot26(V[7]) + rot30(V[3]) + ma(29);
      V[5] += 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]);
      V[1] = 0x06ca6351 + V[1] + W[12] + ch(30) + rot26(V[6]) + rot30(V[2]) + ma(30);
      V[4] += 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]);
      V[0] = 0x14292967 + V[0] + W[13] + ch(31) + rot26(V[5]) + rot30(V[1]) + ma(31);
      V[3] += 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]);
      V[7] = 0x27b70a85 + V[7] + W[14] + ch(32) + rot26(V[4]) + rot30(V[0]) + ma(32);
      V[2] += 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]);
      V[6] = 0x2e1b2138 + V[6] + W[15] + ch(33) + rot26(V[3]) + rot30(V[7]) + ma(33);
      V[1] += 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]);
      V[5] = 0x4d2c6dfc + V[5] + W[16] + ch(34) + rot26(V[2]) + rot30(V[6]) + ma(34);

      //----------------------------------------------------------------------------------

      // Next 17 schedule words, computed in place over the previous window.
      W[0] = W[1] + W[10] + rot15(W[15]) + rot25(W[2]);
      W[1] = W[2] + W[11] + rot15(W[16]) + rot25(W[3]);
      W[2] = W[3] + W[12] + rot15(W[0]) + rot25(W[4]);
      W[3] = W[4] + W[13] + rot15(W[1]) + rot25(W[5]);
      W[4] = W[5] + W[14] + rot15(W[2]) + rot25(W[6]);
      W[5] = W[6] + W[15] + rot15(W[3]) + rot25(W[7]);
      W[6] = W[7] + W[16] + rot15(W[4]) + rot25(W[8]);
      W[7] = W[8] + W[0] + rot15(W[5]) + rot25(W[9]);
      W[8] = W[9] + W[1] + rot15(W[6]) + rot25(W[10]);
      W[9] = W[10] + W[2] + rot15(W[7]) + rot25(W[11]);
      W[10] = W[11] + W[3] + rot15(W[8]) + rot25(W[12]);
      W[11] = W[12] + W[4] + rot15(W[9]) + rot25(W[13]);
      W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]);
      W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]);
      W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]);
      W[15] = W[16] + W[8] + rot15(W[13]) + rot25(W[0]);
      W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]);

      V[0] += 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]);
      V[4] = 0x53380d13 + V[4] + W[0] + ch(35) + rot26(V[1]) + rot30(V[5]) + ma(35);
      V[7] += 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]);
      V[3] = 0x650a7354 + V[3] + W[1] + ch(36) + rot26(V[0]) + rot30(V[4]) + ma(36);
      V[6] += 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]);
      V[2] = 0x766a0abb + V[2] + W[2] + ch(37) + rot26(V[7]) + rot30(V[3]) + ma(37);
      V[5] += 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]);
      V[1] = 0x81c2c92e + V[1] + W[3] + ch(38) + rot26(V[6]) + rot30(V[2]) + ma(38);
      V[4] += 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]);
      V[0] = 0x92722c85 + V[0] + W[4] + ch(39) + rot26(V[5]) + rot30(V[1]) + ma(39);
      V[3] += 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]);
      V[7] = 0xa2bfe8a1 + V[7] + W[5] + ch(40) + rot26(V[4]) + rot30(V[0]) + ma(40);
      V[2] += 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]);
      V[6] = 0xa81a664b + V[6] + W[6] + ch(41) + rot26(V[3]) + rot30(V[7]) + ma(41);
      V[1] += 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]);
      V[5] = 0xc24b8b70 + V[5] + W[7] + ch(42) + rot26(V[2]) + rot30(V[6]) + ma(42);
      V[0] += 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]);
      V[4] = 0xc76c51a3 + V[4] + W[8] + ch(43) + rot26(V[1]) + rot30(V[5]) + ma(43);
      V[7] += 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]);
      V[3] = 0xd192e819 + V[3] + W[9] + ch(44) + rot26(V[0]) + rot30(V[4]) + ma(44);
      V[6] += 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]);
      V[2] = 0xd6990624 + V[2] + W[10] + ch(45) + rot26(V[7]) + rot30(V[3]) + ma(45);
      V[5] += 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]);
      V[1] = 0xf40e3585 + V[1] + W[11] + ch(46) + rot26(V[6]) + rot30(V[2]) + ma(46);
      V[4] += 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]);
      V[0] = 0x106aa070 + V[0] + W[12] + ch(47) + rot26(V[5]) + rot30(V[1]) + ma(47);
      V[3] += 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]);
      V[7] = 0x19a4c116 + V[7] + W[13] + ch(48) + rot26(V[4]) + rot30(V[0]) + ma(48);
      V[2] += 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]);
      V[6] = 0x1e376c08 + V[6] + W[14] + ch(49) + rot26(V[3]) + rot30(V[7]) + ma(49);
      V[1] += 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]);
      V[5] = 0x2748774c + V[5] + W[15] + ch(50) + rot26(V[2]) + rot30(V[6]) + ma(50);
      V[0] += 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]);
      V[4] = 0x34b0bcb5 + V[4] + W[16] + ch(51) + rot26(V[1]) + rot30(V[5]) + ma(51);

      //----------------------------------------------------------------------------------

      // Only 12 more schedule words are needed to finish this block of rounds.
      W[0] = W[1] + W[10] + rot15(W[15]) + rot25(W[2]);
      W[1] = W[2] + W[11] + rot15(W[16]) + rot25(W[3]);
      W[2] = W[3] + W[12] + rot15(W[0]) + rot25(W[4]);
      W[3] = W[4] + W[13] + rot15(W[1]) + rot25(W[5]);
      W[4] = W[5] + W[14] + rot15(W[2]) + rot25(W[6]);
      W[5] = W[6] + W[15] + rot15(W[3]) + rot25(W[7]);
      W[6] = W[7] + W[16] + rot15(W[4]) + rot25(W[8]);
      W[7] = W[8] + W[0] + rot15(W[5]) + rot25(W[9]);
      W[8] = W[9] + W[1] + rot15(W[6]) + rot25(W[10]);
      W[9] = W[10] + W[2] + rot15(W[7]) + rot25(W[11]);
      W[10] = W[11] + W[3] + rot15(W[8]) + rot25(W[12]);
      W[11] = W[12] + W[4] + rot15(W[9]) + rot25(W[13]);

      V[7] += 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]);
      V[3] = 0x391c0cb3 + V[3] + W[0] + ch(52) + rot26(V[0]) + rot30(V[4]) + ma(52);
      V[6] += 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]);
      V[2] = 0x4ed8aa4a + V[2] + W[1] + ch(53) + rot26(V[7]) + rot30(V[3]) + ma(53);
      V[5] += 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]);
      V[1] = 0x5b9cca4f + V[1] + W[2] + ch(54) + rot26(V[6]) + rot30(V[2]) + ma(54);
      V[4] += 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]);
      V[0] = 0x682e6ff3 + V[0] + W[3] + ch(55) + rot26(V[5]) + rot30(V[1]) + ma(55);
      V[3] += 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]);
      V[7] = 0x748f82ee + V[7] + W[4] + ch(56) + rot26(V[4]) + rot30(V[0]) + ma(56);
      V[2] += 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]);
      V[6] = 0x78a5636f + V[6] + W[5] + ch(57) + rot26(V[3]) + rot30(V[7]) + ma(57);
      V[1] += 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]);
      V[5] = 0x84c87814 + V[5] + W[6] + ch(58) + rot26(V[2]) + rot30(V[6]) + ma(58);
      V[0] += 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]);
      V[4] = 0x8cc70208 + V[4] + W[7] + ch(59) + rot26(V[1]) + rot30(V[5]) + ma(59);
      V[7] += 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]);
      V[3] = 0x90befffa + V[3] + W[8] + ch(60) + rot26(V[0]) + rot30(V[4]) + ma(60);
      V[6] += 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]);
      V[2] = 0xa4506ceb + V[2] + W[9] + ch(61) + rot26(V[7]) + rot30(V[3]) + ma(61);
      V[5] += 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]);
      V[1] = 0xbef9a3f7 + V[1] + W[10] + ch(62) + rot26(V[6]) + rot30(V[2]) + ma(62);
      V[4] += 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]);
      V[0] = 0xc67178f2 + V[0] + W[11] + ch(63) + rot26(V[5]) + rot30(V[1]) + ma(63);

      //----------------------------------------------------------------------------------

      // Add the incoming state to the round output; the eight sums become the
      // first message words of the second block of rounds.
      W[0] = state0 + V[0];
      W[1] = state1 + V[1];
      W[2] = state2 + V[2];
      W[3] = state3 + V[3];
      W[4] = state4 + V[4];
      W[5] = state5 + V[5];
      W[6] = state6 + V[6];
      W[7] = state7 + V[7];

      // 0x98c7e2a2 + W[0]
      u state0AaddV0 = state0A + V[0];
      // 0xfc08884d + W[0]
      u state0BaddV0 = state0B + V[0];

      // First rounds of the second block: several operands are still compile-time
      // constants, so Ch / Ma are written out explicitly again.
      V[2] = 0x3c6ef372 + (V[6] = 0x90bb1e3c + W[1] + Ch(state0AaddV0, 0x510e527fU, 0x9b05688cU) + rot26(state0AaddV0));
      V[6] += rot30(state0BaddV0) + Ma(0x6a09e667U, 0xbb67ae85U, state0BaddV0);
      V[1] = 0xbb67ae85 + (V[5] = 0x50c6645b + W[2] + Ch(V[2], state0AaddV0, 0x510e527fU) + rot26(V[2]));
      V[5] += rot30(V[6]) + Ma(state0BaddV0, 0x6a09e667U, V[6]);
      V[0] = 0x6a09e667 + (V[4] = 0x3ac42e24 + W[3] + Ch(V[1], V[2], state0AaddV0) + rot26(V[1]));
      V[4] += rot30(V[5]) + Ma(V[6], state0BaddV0, V[5]);
      V[7] = (state0BaddV0) + (V[3] = 0x3956c25b + state0AaddV0 + W[4] + Ch(V[0], V[1], V[2]) + rot26(V[0]));
      V[3] += rot30(V[4]) + Ma(V[5], V[6], V[4]);

      //--------------- ch() + ma() replaced above ---------------

      V[6] += 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]);
      V[2] = 0x59f111f1 + V[2] + W[5] + ch(69) + rot26(V[7]) + rot30(V[3]) + ma(69);
      V[5] += 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]);
      V[1] = 0x923f82a4 + V[1] + W[6] + ch(70) + rot26(V[6]) + rot30(V[2]) + ma(70);
      V[4] += 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]);
      V[0] = 0xab1c5ed5 + V[0] + W[7] + ch(71) + rot26(V[5]) + rot30(V[1]) + ma(71);
      // The next rounds add no W[] term; the literals differ from those of the
      // first block where a fixed message word appears to have been folded in
      // (e.g. 0x5807aa98 == 0xd807aa98 + 0x80000000 mod 2^32).
      V[3] += 0x5807aa98 + V[7] + ch(72) + rot26(V[4]);
      V[7] = 0x5807aa98 + V[7] + ch(72) + rot26(V[4]) + rot30(V[0]) + ma(72);
      V[2] += 0x12835b01 + V[6] + ch(73) + rot26(V[3]);
      V[6] = 0x12835b01 + V[6] + ch(73) + rot26(V[3]) + rot30(V[7]) + ma(73);
      V[1] += 0x243185be + V[5] + ch(74) + rot26(V[2]);
      V[5] = 0x243185be + V[5] + ch(74) + rot26(V[2]) + rot30(V[6]) + ma(74);
      V[0] += 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]);
      V[4] = 0x550c7dc3 + V[4] + ch(75) + rot26(V[1]) + rot30(V[5]) + ma(75);
      V[7] += 0x72be5d74 + V[3] + ch(76) + rot26(V[0]);
      V[3] = 0x72be5d74 + V[3] + ch(76) + rot26(V[0]) + rot30(V[4]) + ma(76);
      V[6] += 0x80deb1fe + V[2] + ch(77) + rot26(V[7]);
      V[2] = 0x80deb1fe + V[2] + ch(77) + rot26(V[7]) + rot30(V[3]) + ma(77);
      V[5] += 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]);
      V[1] = 0x9bdc06a7 + V[1] + ch(78) + rot26(V[6]) + rot30(V[2]) + ma(78);
      V[4] += 0xc19bf274 + V[0] + ch(79) + rot26(V[5]);
      V[0] = 0xc19bf274 + V[0] + ch(79) + rot26(V[5]) + rot30(V[1]) + ma(79);

      //----------------------------------------------------------------------------------

      // Message schedule of the second block; the literal terms stand in for
      // message words that are constant here.
      W[0] = W[0] + rot25(W[1]);
      W[1] = 0x00a00000 + W[1] + rot25(W[2]);
      W[2] = W[2] + rot15(W[0]) + rot25(W[3]);
      W[3] = W[3] + rot15(W[1]) + rot25(W[4]);
      W[4] = W[4] + rot15(W[2]) + rot25(W[5]);
      W[5] = W[5] + rot15(W[3]) + rot25(W[6]);
      W[6] = 0x00000100 + W[6] + rot15(W[4]) + rot25(W[7]);
      W[7] = 0x11002000 + W[7] + W[0] + rot15(W[5]);
      W[8] = 0x80000000 + W[1] + rot15(W[6]);
      W[9] = W[2] + rot15(W[7]);
      W[10] = W[3] + rot15(W[8]);
      W[11] = W[4] + rot15(W[9]);
      W[12] = W[5] + rot15(W[10]);
      W[13] = W[6] + rot15(W[11]);
      W[14] = 0x00400022 + W[7] + rot15(W[12]);
      W[15] = 0x00000100 + W[8] + rot15(W[13]) + rot25(W[0]);
      W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]);

      V[3] += 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]);
      V[7] = 0xe49b69c1 + V[7] + W[0] + ch(80) + rot26(V[4]) + rot30(V[0]) + ma(80);
      V[2] += 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]);
      V[6] = 0xefbe4786 + V[6] + W[1] + ch(81) + rot26(V[3]) + rot30(V[7]) + ma(81);
      V[1] += 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]);
      V[5] = 0x0fc19dc6 + V[5] + W[2] + ch(82) + rot26(V[2]) + rot30(V[6]) + ma(82);
      V[0] += 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]);
      V[4] = 0x240ca1cc + V[4] + W[3] + ch(83) + rot26(V[1]) + rot30(V[5]) + ma(83);
      V[7] += 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]);
      V[3] = 0x2de92c6f + V[3] + W[4] + ch(84) + rot26(V[0]) + rot30(V[4]) + ma(84);
      V[6] += 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]);
      V[2] = 0x4a7484aa + V[2] + W[5] + ch(85) + rot26(V[7]) + rot30(V[3]) + ma(85);
      V[5] += 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]);
      V[1] = 0x5cb0a9dc + V[1] + W[6] + ch(86) + rot26(V[6]) + rot30(V[2]) + ma(86);
      V[4] += 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]);
      V[0] = 0x76f988da + V[0] + W[7] + ch(87) + rot26(V[5]) + rot30(V[1]) + ma(87);
      V[3] += 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]);
      V[7] = 0x983e5152 + V[7] + W[8] + ch(88) + rot26(V[4]) + rot30(V[0]) + ma(88);
      V[2] += 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]);
      V[6] = 0xa831c66d + V[6] + W[9] + ch(89) + rot26(V[3]) + rot30(V[7]) + ma(89);
      V[1] += 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]);
      V[5] = 0xb00327c8 + V[5] + W[10] + ch(90) + rot26(V[2]) + rot30(V[6]) + ma(90);
      V[0] += 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]);
      V[4] = 0xbf597fc7 + V[4] + W[11] + ch(91) + rot26(V[1]) + rot30(V[5]) + ma(91);
      V[7] += 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]);
      V[3] = 0xc6e00bf3 + V[3] + W[12] + ch(92) + rot26(V[0]) + rot30(V[4]) + ma(92);
      V[6] += 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]);
      V[2] = 0xd5a79147 + V[2] + W[13] + ch(93) + rot26(V[7]) + rot30(V[3]) + ma(93);
      V[5] += 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]);
      V[1] = 0x06ca6351 + V[1] + W[14] + ch(94) + rot26(V[6]) + rot30(V[2]) + ma(94);
      V[4] += 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]);
      V[0] = 0x14292967 + V[0] + W[15] + ch(95) + rot26(V[5]) + rot30(V[1]) + ma(95);
      V[3] += 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]);
      V[7] = 0x27b70a85 + V[7] + W[16] + ch(96) + rot26(V[4]) + rot30(V[0]) + ma(96);

      //----------------------------------------------------------------------------------

      W[0] = W[1] + W[10] + rot15(W[15]) + rot25(W[2]);
      W[1] = W[2] + W[11] + rot15(W[16]) + rot25(W[3]);
      W[2] = W[3] + W[12] + rot15(W[0]) + rot25(W[4]);
      W[3] = W[4] + W[13] + rot15(W[1]) + rot25(W[5]);
      W[4] = W[5] + W[14] + rot15(W[2]) + rot25(W[6]);
      W[5] = W[6] + W[15] + rot15(W[3]) + rot25(W[7]);
      W[6] = W[7] + W[16] + rot15(W[4]) + rot25(W[8]);
      W[7] = W[8] + W[0] + rot15(W[5]) + rot25(W[9]);
      W[8] = W[9] + W[1] + rot15(W[6]) + rot25(W[10]);
      W[9] = W[10] + W[2] + rot15(W[7]) + rot25(W[11]);
      W[10] = W[11] + W[3] + rot15(W[8]) + rot25(W[12]);
      W[11] = W[12] + W[4] + rot15(W[9]) + rot25(W[13]);
      W[12] = W[13] + W[5] + rot15(W[10]) + rot25(W[14]);
      W[13] = W[14] + W[6] + rot15(W[11]) + rot25(W[15]);
      W[14] = W[15] + W[7] + rot15(W[12]) + rot25(W[16]);
      W[15] = W[16] + W[8] + rot15(W[13]) + rot25(W[0]);
      W[16] = W[0] + W[9] + rot15(W[14]) + rot25(W[1]);

      V[2] += 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]);
      V[6] = 0x2e1b2138 + V[6] + W[0] + ch(97) + rot26(V[3]) + rot30(V[7]) + ma(97);
      V[1] += 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]);
      V[5] = 0x4d2c6dfc + V[5] + W[1] + ch(98) + rot26(V[2]) + rot30(V[6]) + ma(98);
      V[0] += 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]);
      V[4] = 0x53380d13 + V[4] + W[2] + ch(99) + rot26(V[1]) + rot30(V[5]) + ma(99);
      V[7] += 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]);
      V[3] = 0x650a7354 + V[3] + W[3] + ch(100) + rot26(V[0]) + rot30(V[4]) + ma(100);
      V[6] += 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]);
      V[2] = 0x766a0abb + V[2] + W[4] + ch(101) + rot26(V[7]) + rot30(V[3]) + ma(101);
      V[5] += 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]);
      V[1] = 0x81c2c92e + V[1] + W[5] + ch(102) + rot26(V[6]) + rot30(V[2]) + ma(102);
      V[4] += 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]);
      V[0] = 0x92722c85 + V[0] + W[6] + ch(103) + rot26(V[5]) + rot30(V[1]) + ma(103);
      V[3] += 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]);
      V[7] = 0xa2bfe8a1 + V[7] + W[7] + ch(104) + rot26(V[4]) + rot30(V[0]) + ma(104);
      V[2] += 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]);
      V[6] = 0xa81a664b + V[6] + W[8] + ch(105) + rot26(V[3]) + rot30(V[7]) + ma(105);
      V[1] += 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]);
      V[5] = 0xc24b8b70 + V[5] + W[9] + ch(106) + rot26(V[2]) + rot30(V[6]) + ma(106);
      V[0] += 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]);
      V[4] = 0xc76c51a3 + V[4] + W[10] + ch(107) + rot26(V[1]) + rot30(V[5]) + ma(107);
      V[7] += 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]);
      V[3] = 0xd192e819 + V[3] + W[11] + ch(108) + rot26(V[0]) + rot30(V[4]) + ma(108);
      V[6] += 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]);
      V[2] = 0xd6990624 + V[2] + W[12] + ch(109) + rot26(V[7]) + rot30(V[3]) + ma(109);
      V[5] += 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]);
      V[1] = 0xf40e3585 + V[1] + W[13] + ch(110) + rot26(V[6]) + rot30(V[2]) + ma(110);
      V[4] += 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]);
      V[0] = 0x106aa070 + V[0] + W[14] + ch(111) + rot26(V[5]) + rot30(V[1]) + ma(111);
      V[3] += 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]);
      V[7] = 0x19a4c116 + V[7] + W[15] + ch(112) + rot26(V[4]) + rot30(V[0]) + ma(112);
      V[2] += 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]);
      V[6] = 0x1e376c08 + V[6] + W[16] + ch(113) + rot26(V[3]) + rot30(V[7]) + ma(113);

      //----------------------------------------------------------------------------------

      // Last schedule words that are still needed (W[0]..W[10]).
      W[0] = W[1] + W[10] + rot15(W[15]) + rot25(W[2]);
      W[1] = W[2] + W[11] + rot15(W[16]) + rot25(W[3]);
      W[2] = W[3] + W[12] + rot15(W[0]) + rot25(W[4]);
      W[3] = W[4] + W[13] + rot15(W[1]) + rot25(W[5]);
      W[4] = W[5] + W[14] + rot15(W[2]) + rot25(W[6]);
      W[5] = W[6] + W[15] + rot15(W[3]) + rot25(W[7]);
      W[6] = W[7] + W[16] + rot15(W[4]) + rot25(W[8]);
      W[7] = W[8] + W[0] + rot15(W[5]) + rot25(W[9]);
      W[8] = W[9] + W[1] + rot15(W[6]) + rot25(W[10]);
      W[9] = W[10] + W[2] + rot15(W[7]) + rot25(W[11]);
      W[10] = W[11] + W[3] + rot15(W[8]) + rot25(W[12]);

      V[1] += 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]);
      V[5] = 0x2748774c + V[5] + W[0] + ch(114) + rot26(V[2]) + rot30(V[6]) + ma(114);
      V[0] += 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]);
      V[4] = 0x34b0bcb5 + V[4] + W[1] + ch(115) + rot26(V[1]) + rot30(V[5]) + ma(115);
      V[7] += 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]);
      V[3] = 0x391c0cb3 + V[3] + W[2] + ch(116) + rot26(V[0]) + rot30(V[4]) + ma(116);
      V[6] += 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]);
      V[2] = 0x4ed8aa4a + V[2] + W[3] + ch(117) + rot26(V[7]) + rot30(V[3]) + ma(117);
      V[5] += 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]);
      V[1] = 0x5b9cca4f + V[1] + W[4] + ch(118) + rot26(V[6]) + rot30(V[2]) + ma(118);
      V[4] += 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]);
      V[0] = 0x682e6ff3 + V[0] + W[5] + ch(119) + rot26(V[5]) + rot30(V[1]) + ma(119);
      V[3] += 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]);
      V[7] = 0x748f82ee + V[7] + W[6] + ch(120) + rot26(V[4]) + rot30(V[0]) + ma(120);
      // Final rounds: only the "d + T1" half is computed, because only V[7] is
      // inspected afterwards. The last statement adds no literal constant; it
      // appears to be folded into the comparison value 0x136032ed instead.
      V[2] += 0x78a5636f + V[6] + W[7] + ch(121) + rot26(V[3]);
      V[1] += 0x84c87814 + V[5] + W[8] + ch(122) + rot26(V[2]);
      V[0] += 0x8cc70208 + V[4] + W[9] + ch(123) + rot26(V[1]);
      V[7] += V[3] + W[10] + ch(124) + rot26(V[0]);

      // Result reporting. A hit stores the nonce twice: in the slot selected by
      // its low 7 bits and in the dedicated output[FOUND] slot.
      // The nonce itself is written here; W[3] holds a message-schedule word at
      // this point and does not identify the input that produced the hit.
  #define FOUND (0x80)
  #define NFLAG (0x7F)

  #ifdef VECTORS4
      // After the XOR a lane is zero exactly when it matched.
      V[7] ^= 0x136032ed;

      // Cheap pre-filter: the AND of all lanes is zero whenever any lane is zero.
      // It can also be zero without a matching lane, hence the per-lane re-check.
      bool result = V[7].x & V[7].y & V[7].z & V[7].w;

      if (!result) {
          if (!V[7].x)
              output[FOUND] = output[NFLAG & nonce.x] = nonce.x;
          if (!V[7].y)
              output[FOUND] = output[NFLAG & nonce.y] = nonce.y;
          if (!V[7].z)
              output[FOUND] = output[NFLAG & nonce.z] = nonce.z;
          if (!V[7].w)
              output[FOUND] = output[NFLAG & nonce.w] = nonce.w;
      }
  #else
  #ifdef VECTORS2
      // Same scheme as the 4-wide path, for two lanes.
      V[7] ^= 0x136032ed;

      bool result = V[7].x & V[7].y;

      if (!result) {
          if (!V[7].x)
              output[FOUND] = output[NFLAG & nonce.x] = nonce.x;
          if (!V[7].y)
              output[FOUND] = output[NFLAG & nonce.y] = nonce.y;
      }
  #else
      if (V[7] == 0x136032ed)
          output[FOUND] = output[NFLAG & nonce] = nonce;
  #endif
  #endif
  }