diakgcn120216.cl 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656
  1. // DiaKGCN 18-02-2012 - OpenCL kernel by Diapolo
  2. //
  3. // Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3.
  4. // The kernel was rewritten by me (Diapolo) and is still public-domain!
  5. #ifdef VECTORS8
  6. typedef uint8 u;
  7. #elif defined VECTORS4
  8. typedef uint4 u;
  9. #elif defined VECTORS2
  10. typedef uint2 u;
  11. #else
  12. typedef uint u;
  13. #endif
  // ch(x, y, z) - bitwise "choose": every result bit comes from y where the
  //               matching bit of x is set, and from z where it is clear.
  // ma(a, b, c) - bitwise "majority": every result bit is the value held by
  //               at least two of the three inputs (argument order is
  //               irrelevant, which is why the parameter names differ
  //               between the variants below).
  //
  // Three implementations are selected at build time:
  //   BITALIGN + BFI_INT : both are spelled with amd_bytealign() from the
  //                        cl_amd_media_ops extension.
  //                        NOTE(review): amd_bytealign() does not compute
  //                        choose/majority by itself; presumably the host
  //                        patches the compiled binary so these become
  //                        BFI_INT instructions - confirm against the host code.
  //   BITALIGN only      : built on the OpenCL bitselect(a, b, c) built-in,
  //                        which takes each bit from b where c is set and
  //                        from a otherwise.
  //   neither            : plain AND / OR / XOR arithmetic.
  //
  // Every macro argument is parenthesized at each use so that compound
  // arguments (e.g. `a ^ b`) are not regrouped by operator precedence.
  #ifdef BITALIGN
  #pragma OPENCL EXTENSION cl_amd_media_ops : enable
  #ifdef BFI_INT
  #define ch(x, y, z) amd_bytealign((x), (y), (z))
  #define ma(x, y, z) amd_bytealign((z) ^ (x), (y), (x))
  #else
  #define ch(x, y, z) bitselect((z), (y), (x))
  #if defined(VECTORS2) || defined(VECTORS4) || defined(VECTORS8)
  // GCN - VEC2 or VEC4 or VEC8
  // Where z and x agree the majority is z; where they differ y decides.
  #define ma(z, x, y) bitselect((z), (y), (z) ^ (x))
  #else
  // GCN - no VEC
  // Same selection as above, expressed through ch(): expands to
  // bitselect(x, y, z ^ x).
  #define ma(z, x, y) ch((z) ^ (x), (y), (x))
  #endif
  #endif
  #else //BITALIGN
  #define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
  #define ma(x, y, z) (((x) & (z)) | ((y) & ((x) | (z))))
  #endif
  // Mixing functions built from rotate(), the OpenCL built-in that rotates
  // LEFT. The number in each macro name is the first left-rotation amount;
  // a left rotation by k on a 32-bit value equals a right rotation by 32 - k.
  //   rotr15(n): rotl 15 ^ rotl 13 ^ shr 10   (= rotr 17 ^ rotr 19 ^ shr 10)
  //   rotr25(n): rotl 25 ^ rotl 14 ^ shr 3    (= rotr 7  ^ rotr 18 ^ shr 3)
  //   rotr26(n): rotl 26 ^ rotl 21 ^ rotl 7   (= rotr 6  ^ rotr 11 ^ rotr 25)
  //   rotr30(n): rotl 30 ^ rotl 19 ^ rotl 10  (= rotr 2  ^ rotr 13 ^ rotr 22)
  // These correspond to the SHA-256 small sigma1, small sigma0, big Sigma1
  // and big Sigma0 functions respectively.
  //
  // `n` is parenthesized at every use: the kernel passes compound expressions
  // such as `state1 + V[1]`, and an unparenthesized `n >> 3U` would regroup
  // any argument whose operator binds looser than `>>` (e.g. `a ^ b`).
  #define rotr15(n) (rotate((n), 15U) ^ rotate((n), 13U) ^ ((n) >> 10U))
  #define rotr25(n) (rotate((n), 25U) ^ rotate((n), 14U) ^ ((n) >> 3U))
  #define rotr26(n) (rotate((n), 26U) ^ rotate((n), 21U) ^ rotate((n), 7U))
  #define rotr30(n) (rotate((n), 30U) ^ rotate((n), 19U) ^ rotate((n), 10U))
  37. __kernel
  38. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  39. void search(
  40. #ifndef GOFFSET
  41. const u base,
  42. #endif
  43. const uint PreVal0, const uint PreVal4,
  44. const uint H1, const uint D1A, const uint B1, const uint C1,
  45. const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7,
  46. const uint W16addK16, const uint W17addK17,
  47. const uint PreW18, const uint PreW19,
  48. const uint W16, const uint W17,
  49. const uint PreW31, const uint PreW32,
  50. const uint state0, const uint state1, const uint state2, const uint state3,
  51. const uint state4, const uint state5, const uint state6, const uint state7,
  52. const uint state0A, const uint state0B,
  53. const uint state1A, const uint state2A, const uint state3A, const uint state4A,
  54. const uint state5A, const uint state6A, const uint state7A,
  55. __global uint * output)
  56. {
  57. u W[16];
  58. u V[8];
  59. #ifdef VECTORS8
  60. #ifdef GOFFSET
  61. const u nonce = ((uint)get_global_id(0) << 3) + (u)(0, 1, 2, 3, 4, 5, 6, 7);
  62. #else
  63. const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) << 3) + ((uint)get_local_id(0) << 3) + base;
  64. #endif
  65. #elif defined VECTORS4
  66. #ifdef GOFFSET
  67. const u nonce = ((uint)get_global_id(0) << 2) + (u)(0, 1, 2, 3);
  68. #else
  69. const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) << 2) + ((uint)get_local_id(0) << 2) + base;
  70. #endif
  71. #elif defined VECTORS2
  72. #ifdef GOFFSET
  73. const u nonce = ((uint)get_global_id(0) << 1) + (u)(0, 1);
  74. #else
  75. const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0) << 1) + ((uint)get_local_id(0) << 1) + base;
  76. #endif
  77. #else
  78. #ifdef GOFFSET
  79. const u nonce = (uint)get_global_id(0);
  80. #else
  81. const u nonce = ((uint)get_group_id(0) * (uint)get_local_size(0)) + (uint)get_local_id(0) + base;
  82. #endif
  83. #endif
  84. V[0] = PreVal0 + nonce;
  85. V[1] = B1;
  86. V[2] = C1;
  87. V[3] = D1A;
  88. V[4] = PreVal4 + nonce;
  89. V[5] = F1;
  90. V[6] = G1;
  91. V[7] = H1;
  92. V[7] += V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  93. V[3] = V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  94. V[6] += C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  95. V[2] = C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  96. V[5] += B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  97. V[1] = B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  98. V[4] += PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  99. V[0] = PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  100. V[3] += 0xd807aa98 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  101. V[7] = 0xd807aa98 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  102. V[2] += 0x12835b01 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  103. V[6] = 0x12835b01 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  104. V[1] += 0x243185be + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  105. V[5] = 0x243185be + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  106. V[0] += 0x550c7dc3 + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  107. V[4] = 0x550c7dc3 + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  108. V[7] += 0x72be5d74 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  109. V[3] = 0x72be5d74 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  110. V[6] += 0x80deb1fe + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  111. V[2] = 0x80deb1fe + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  112. V[5] += 0x9bdc06a7 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  113. V[1] = 0x9bdc06a7 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  114. V[4] += 0xc19bf3f4 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  115. V[0] = 0xc19bf3f4 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  116. V[3] += W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  117. V[7] = W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  118. V[2] += W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  119. V[6] = W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  120. //----------------------------------------------------------------------------------
  121. #ifdef VECTORS8
  122. W[0] = PreW18 + (u)( rotr25(nonce.s0), rotr25(nonce.s0) ^ 0x2004000, rotr25(nonce.s0) ^ 0x4008000, rotr25(nonce.s0) ^ 0x600C000,
  123. rotr25(nonce.s0) ^ 0x8010000, rotr25(nonce.s0) ^ 0xa014000, rotr25(nonce.s0) ^ 0xc018000, rotr25(nonce.s0) ^ 0xe01c000);
  124. #elif defined VECTORS4
  125. W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000, rotr25(nonce.x) ^ 0x4008000, rotr25(nonce.x) ^ 0x600C000);
  126. #elif defined VECTORS2
  127. W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000);
  128. #else
  129. W[0] = PreW18 + rotr25(nonce);
  130. #endif
  131. W[1] = PreW19 + nonce;
  132. W[2] = 0x80000000 + rotr15(W[0]);
  133. W[3] = rotr15(W[1]);
  134. W[4] = 0x00000280 + rotr15(W[2]);
  135. W[5] = W16 + rotr15(W[3]);
  136. W[6] = W17 + rotr15(W[4]);
  137. W[7] = W[0] + rotr15(W[5]);
  138. W[8] = W[1] + rotr15(W[6]);
  139. W[9] = W[2] + rotr15(W[7]);
  140. W[10] = W[3] + rotr15(W[8]);
  141. W[11] = W[4] + rotr15(W[9]);
  142. W[12] = W[5] + 0x00a00055 + rotr15(W[10]);
  143. W[13] = W[6] + PreW31 + rotr15(W[11]);
  144. W[14] = W[7] + PreW32 + rotr15(W[12]);
  145. W[15] = W[8] + W17 + rotr15(W[13]) + rotr25(W[0]);
  146. V[1] += 0x0fc19dc6 + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  147. V[5] = 0x0fc19dc6 + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  148. V[0] += 0x240ca1cc + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  149. V[4] = 0x240ca1cc + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  150. V[7] += 0x2de92c6f + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  151. V[3] = 0x2de92c6f + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  152. V[6] += 0x4a7484aa + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  153. V[2] = 0x4a7484aa + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  154. V[5] += 0x5cb0a9dc + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  155. V[1] = 0x5cb0a9dc + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  156. V[4] += 0x76f988da + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  157. V[0] = 0x76f988da + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  158. V[3] += 0x983e5152 + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  159. V[7] = 0x983e5152 + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  160. V[2] += 0xa831c66d + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  161. V[6] = 0xa831c66d + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  162. V[1] += 0xb00327c8 + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  163. V[5] = 0xb00327c8 + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  164. V[0] += 0xbf597fc7 + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  165. V[4] = 0xbf597fc7 + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  166. V[7] += 0xc6e00bf3 + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  167. V[3] = 0xc6e00bf3 + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  168. V[6] += 0xd5a79147 + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  169. V[2] = 0xd5a79147 + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  170. V[5] += 0x06ca6351 + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  171. V[1] = 0x06ca6351 + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  172. V[4] += 0x14292967 + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  173. V[0] = 0x14292967 + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  174. V[3] += 0x27b70a85 + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  175. V[7] = 0x27b70a85 + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  176. V[2] += 0x2e1b2138 + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  177. V[6] = 0x2e1b2138 + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  178. //----------------------------------------------------------------------------------
  179. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  180. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  181. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  182. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  183. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  184. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  185. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  186. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  187. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  188. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  189. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  190. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  191. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  192. W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]);
  193. W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]);
  194. W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]);
  195. V[1] += 0x4d2c6dfc + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  196. V[5] = 0x4d2c6dfc + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  197. V[0] += 0x53380d13 + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  198. V[4] = 0x53380d13 + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  199. V[7] += 0x650a7354 + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  200. V[3] = 0x650a7354 + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  201. V[6] += 0x766a0abb + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  202. V[2] = 0x766a0abb + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  203. V[5] += 0x81c2c92e + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  204. V[1] = 0x81c2c92e + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  205. V[4] += 0x92722c85 + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  206. V[0] = 0x92722c85 + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  207. V[3] += 0xa2bfe8a1 + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  208. V[7] = 0xa2bfe8a1 + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  209. V[2] += 0xa81a664b + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  210. V[6] = 0xa81a664b + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  211. V[1] += 0xc24b8b70 + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  212. V[5] = 0xc24b8b70 + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  213. V[0] += 0xc76c51a3 + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  214. V[4] = 0xc76c51a3 + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  215. V[7] += 0xd192e819 + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  216. V[3] = 0xd192e819 + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  217. V[6] += 0xd6990624 + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  218. V[2] = 0xd6990624 + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  219. V[5] += 0xf40e3585 + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  220. V[1] = 0xf40e3585 + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  221. V[4] += 0x106aa070 + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  222. V[0] = 0x106aa070 + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  223. V[3] += 0x19a4c116 + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  224. V[7] = 0x19a4c116 + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  225. V[2] += 0x1e376c08 + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  226. V[6] = 0x1e376c08 + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  227. //----------------------------------------------------------------------------------
  228. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  229. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  230. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  231. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  232. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  233. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  234. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  235. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  236. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  237. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  238. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  239. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  240. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  241. W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]);
  242. V[1] += 0x2748774c + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  243. V[5] = 0x2748774c + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  244. V[0] += 0x34b0bcb5 + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  245. V[4] = 0x34b0bcb5 + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  246. V[7] += 0x391c0cb3 + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  247. V[3] = 0x391c0cb3 + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  248. V[6] += 0x4ed8aa4a + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  249. V[2] = 0x4ed8aa4a + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  250. V[5] += 0x5b9cca4f + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  251. V[1] = 0x5b9cca4f + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  252. V[4] += 0x682e6ff3 + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  253. V[0] = 0x682e6ff3 + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  254. V[3] += 0x748f82ee + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  255. V[7] = 0x748f82ee + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  256. V[2] += 0x78a5636f + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  257. V[6] = 0x78a5636f + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  258. V[1] += 0x84c87814 + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  259. V[5] = 0x84c87814 + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  260. V[0] += 0x8cc70208 + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  261. V[4] = 0x8cc70208 + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  262. V[7] += 0x90befffa + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  263. V[3] = 0x90befffa + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  264. V[6] += 0xa4506ceb + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  265. V[2] = 0xa4506ceb + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  266. V[5] += 0xbef9a3f7 + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  267. V[1] = 0xbef9a3f7 + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  268. V[4] += 0xc67178f2 + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  269. V[0] = 0xc67178f2 + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  270. //----------------------------------------------------------------------------------
  271. W[0] = state0 + V[0] + rotr25(state1 + V[1]);
  272. W[1] = state1 + V[1] + 0x00a00000 + rotr25(state2 + V[2]);
  273. W[2] = state2 + V[2] + rotr15(W[0]) + rotr25(state3 + V[3]);
  274. W[3] = state3 + V[3] + rotr15(W[1]) + rotr25(state4 + V[4]);
  275. W[4] = state4 + V[4] + rotr15(W[2]) + rotr25(state5 + V[5]);
  276. W[5] = state5 + V[5] + rotr15(W[3]) + rotr25(state6 + V[6]);
  277. W[6] = state6 + V[6] + 0x00000100 + rotr15(W[4]) + rotr25(state7 + V[7]);
  278. W[7] = state7 + V[7] + W[0] + 0x11002000 + rotr15(W[5]);
  279. W[8] = W[1] + 0x80000000 + rotr15(W[6]);
  280. W[9] = W[2] + rotr15(W[7]);
  281. W[10] = W[3] + rotr15(W[8]);
  282. W[11] = W[4] + rotr15(W[9]);
  283. W[12] = W[5] + rotr15(W[10]);
  284. W[13] = W[6] + rotr15(W[11]);
  285. W[14] = W[7] + 0x00400022 + rotr15(W[12]);
  286. W[15] = W[8] + 0x00000100 + rotr15(W[13]) + rotr25(W[0]);
  287. // 0x71374491 + 0x1f83d9ab + state1
  288. const u state1AaddV1 = state1A + V[1];
  289. // 0xb5c0fbcf + 0x9b05688c + state2
  290. const u state2AaddV2 = state2A + V[2];
  291. // 0x510e527f + 0xe9b5dba5 + state3
  292. const u state3AaddV3 = state3A + V[3];
  293. // 0x3956c25b + state4
  294. const u state4AaddV4 = state4A + V[4];
  295. // 0x59f111f1 + state5
  296. const u state5AaddV5 = state5A + V[5];
  297. // 0x923f82a4 + state6
  298. const u state6AaddV6 = state6A + V[6];
  299. // 0xab1c5ed5 + state7
  300. const u state7AaddV7 = state7A + V[7];
  301. // 0x98c7e2a2 + state0
  302. V[3] = state0A + V[0];
  303. // 0xfc08884d + state0
  304. V[7] = state0B + V[0];
  305. V[0] = 0x6a09e667;
  306. V[1] = 0xbb67ae85;
  307. V[2] = 0x3c6ef372;
  308. V[4] = 0x510e527f;
  309. V[5] = 0x9b05688c;
  310. V[6] = 0x1f83d9ab;
  311. V[2] += state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  312. V[6] = state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  313. V[1] += state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  314. V[5] = state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  315. V[0] += state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  316. V[4] = state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  317. V[7] += state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  318. V[3] = state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  319. V[6] += state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  320. V[2] = state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  321. V[5] += state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  322. V[1] = state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  323. V[4] += state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  324. V[0] = state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  325. V[3] += 0x5807aa98 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  326. V[7] = 0x5807aa98 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  327. V[2] += 0x12835b01 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  328. V[6] = 0x12835b01 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  329. V[1] += 0x243185be + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  330. V[5] = 0x243185be + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  331. V[0] += 0x550c7dc3 + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  332. V[4] = 0x550c7dc3 + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  333. V[7] += 0x72be5d74 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  334. V[3] = 0x72be5d74 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  335. V[6] += 0x80deb1fe + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  336. V[2] = 0x80deb1fe + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  337. V[5] += 0x9bdc06a7 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  338. V[1] = 0x9bdc06a7 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  339. V[4] += 0xc19bf274 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  340. V[0] = 0xc19bf274 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  341. V[3] += 0xe49b69c1 + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  342. V[7] = 0xe49b69c1 + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  343. V[2] += 0xefbe4786 + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  344. V[6] = 0xefbe4786 + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  345. V[1] += 0x0fc19dc6 + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  346. V[5] = 0x0fc19dc6 + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  347. V[0] += 0x240ca1cc + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  348. V[4] = 0x240ca1cc + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  349. V[7] += 0x2de92c6f + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  350. V[3] = 0x2de92c6f + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  351. V[6] += 0x4a7484aa + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  352. V[2] = 0x4a7484aa + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  353. V[5] += 0x5cb0a9dc + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  354. V[1] = 0x5cb0a9dc + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  355. V[4] += 0x76f988da + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  356. V[0] = 0x76f988da + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  357. V[3] += 0x983e5152 + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  358. V[7] = 0x983e5152 + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  359. V[2] += 0xa831c66d + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  360. V[6] = 0xa831c66d + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  361. V[1] += 0xb00327c8 + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  362. V[5] = 0xb00327c8 + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  363. V[0] += 0xbf597fc7 + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  364. V[4] = 0xbf597fc7 + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  365. V[7] += 0xc6e00bf3 + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  366. V[3] = 0xc6e00bf3 + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  367. V[6] += 0xd5a79147 + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  368. V[2] = 0xd5a79147 + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  369. V[5] += 0x06ca6351 + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  370. V[1] = 0x06ca6351 + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  371. V[4] += 0x14292967 + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  372. V[0] = 0x14292967 + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  373. //----------------------------------------------------------------------------------
  374. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  375. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  376. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  377. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  378. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  379. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  380. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  381. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  382. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  383. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  384. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  385. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  386. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  387. W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]);
  388. W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]);
  389. W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]);
  390. V[3] += 0x27b70a85 + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  391. V[7] = 0x27b70a85 + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  392. V[2] += 0x2e1b2138 + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  393. V[6] = 0x2e1b2138 + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  394. V[1] += 0x4d2c6dfc + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  395. V[5] = 0x4d2c6dfc + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  396. V[0] += 0x53380d13 + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  397. V[4] = 0x53380d13 + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  398. V[7] += 0x650a7354 + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  399. V[3] = 0x650a7354 + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  400. V[6] += 0x766a0abb + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  401. V[2] = 0x766a0abb + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  402. V[5] += 0x81c2c92e + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  403. V[1] = 0x81c2c92e + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  404. V[4] += 0x92722c85 + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  405. V[0] = 0x92722c85 + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  406. V[3] += 0xa2bfe8a1 + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  407. V[7] = 0xa2bfe8a1 + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  408. V[2] += 0xa81a664b + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  409. V[6] = 0xa81a664b + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  410. V[1] += 0xc24b8b70 + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  411. V[5] = 0xc24b8b70 + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  412. V[0] += 0xc76c51a3 + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  413. V[4] = 0xc76c51a3 + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  414. V[7] += 0xd192e819 + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  415. V[3] = 0xd192e819 + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  416. V[6] += 0xd6990624 + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  417. V[2] = 0xd6990624 + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  418. V[5] += 0xf40e3585 + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  419. V[1] = 0xf40e3585 + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  420. V[4] += 0x106aa070 + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  421. V[0] = 0x106aa070 + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  422. //----------------------------------------------------------------------------------
  423. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  424. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  425. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  426. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  427. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  428. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  429. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  430. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  431. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  432. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  433. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  434. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  435. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  436. V[3] += 0x19a4c116 + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  437. V[7] = 0x19a4c116 + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  438. V[2] += 0x1e376c08 + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  439. V[6] = 0x1e376c08 + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  440. V[1] += 0x2748774c + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  441. V[5] = 0x2748774c + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  442. V[0] += 0x34b0bcb5 + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  443. V[4] = 0x34b0bcb5 + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  444. V[7] += 0x391c0cb3 + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  445. V[3] = 0x391c0cb3 + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  446. V[6] += 0x4ed8aa4a + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  447. V[2] = 0x4ed8aa4a + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  448. V[5] += 0x5b9cca4f + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  449. V[1] = 0x5b9cca4f + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  450. V[4] += 0x682e6ff3 + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  451. V[0] = 0x682e6ff3 + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  452. V[3] += 0x748f82ee + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  453. V[7] = 0x748f82ee + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  454. V[2] += 0x78a5636f + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  455. V[1] += 0x84c87814 + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  456. V[0] += 0x8cc70208 + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  457. V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  458. #define FOUND (0x80)
  459. #define NFLAG (0x7F)
  460. #ifdef VECTORS8
  461. V[7] ^= 0x136032ed;
  462. bool result = V[7].s0 & V[7].s1 & V[7].s2 & V[7].s3 & V[7].s4 & V[7].s5 & V[7].s6 & V[7].s7;
  463. if (!result) {
  464. if (!V[7].s0)
  465. output[FOUND] = output[NFLAG & nonce.s0] = nonce.s0;
  466. if (!V[7].s1)
  467. output[FOUND] = output[NFLAG & nonce.s1] = nonce.s1;
  468. if (!V[7].s2)
  469. output[FOUND] = output[NFLAG & nonce.s2] = nonce.s2;
  470. if (!V[7].s3)
  471. output[FOUND] = output[NFLAG & nonce.s3] = nonce.s3;
  472. if (!V[7].s4)
  473. output[FOUND] = output[NFLAG & nonce.s4] = nonce.s4;
  474. if (!V[7].s5)
  475. output[FOUND] = output[NFLAG & nonce.s5] = nonce.s5;
  476. if (!V[7].s6)
  477. output[FOUND] = output[NFLAG & nonce.s6] = nonce.s6;
  478. if (!V[7].s7)
  479. output[FOUND] = output[NFLAG & nonce.s7] = nonce.s7;
  480. }
  481. #elif defined VECTORS4
  482. V[7] ^= 0x136032ed;
  483. bool result = V[7].x & V[7].y & V[7].z & V[7].w;
  484. if (!result) {
  485. if (!V[7].x)
  486. output[FOUND] = output[NFLAG & nonce.x] = nonce.x;
  487. if (!V[7].y)
  488. output[FOUND] = output[NFLAG & nonce.y] = nonce.y;
  489. if (!V[7].z)
  490. output[FOUND] = output[NFLAG & nonce.z] = nonce.z;
  491. if (!V[7].w)
  492. output[FOUND] = output[NFLAG & nonce.w] = nonce.w;
  493. }
  494. #elif defined VECTORS2
  495. V[7] ^= 0x136032ed;
  496. bool result = V[7].x & V[7].y;
  497. if (!result) {
  498. if (!V[7].x)
  499. output[FOUND] = output[NFLAG & nonce.x] = nonce.x;
  500. if (!V[7].y)
  501. output[FOUND] = output[NFLAG & nonce.y] = nonce.y;
  502. }
  503. #else
  504. if (V[7] == 0x136032ed)
  505. output[FOUND] = output[NFLAG & nonce] = nonce;
  506. #endif
  507. }