diakgcn120223.cl 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. // DiaKGCN 16-03-2012 - OpenCL kernel by Diapolo
  2. //
  3. // Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3.
  4. // The kernel was rewritten by me (Diapolo) and is still public-domain!
  5. #ifdef VECTORS4
  6. typedef uint4 u;
  7. #elif defined VECTORS2
  8. typedef uint2 u;
  9. #else
  10. typedef uint u;
  11. #endif
  12. #ifdef BITALIGN
  13. #pragma OPENCL EXTENSION cl_amd_media_ops : enable
  14. #ifdef BFI_INT
  15. #define ch(x, y, z) amd_bytealign(x, y, z)
  16. #define ma(x, y, z) amd_bytealign(z ^ x, y, x)
  17. #else
  18. #define ch(x, y, z) bitselect(z, y, x)
  19. #define ma(z, x, y) bitselect(z, y, z ^ x)
  20. #endif
  21. #else
  22. #define ch(x, y, z) (z ^ (x & (y ^ z)))
  23. #define ma(x, y, z) ((x & z) | (y & (x | z)))
  24. #endif
  25. #define rotr15(n) (rotate(n, 15U) ^ rotate(n, 13U) ^ (n >> 10U))
  26. #define rotr25(n) (rotate(n, 25U) ^ rotate(n, 14U) ^ (n >> 3U))
  27. #define rotr26(n) (rotate(n, 26U) ^ rotate(n, 21U) ^ rotate(n, 7U))
  28. #define rotr30(n) (rotate(n, 30U) ^ rotate(n, 19U) ^ rotate(n, 10U))
  29. __kernel
  30. __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
  31. void search(
  32. const u base,
  33. const uint PreVal0, const uint PreVal4,
  34. const uint H1, const uint D1A, const uint B1, const uint C1,
  35. const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7,
  36. const uint W16addK16, const uint W17addK17,
  37. const uint PreW18, const uint PreW19,
  38. const uint W16, const uint W17,
  39. const uint PreW31, const uint PreW32,
  40. const uint state0, const uint state1, const uint state2, const uint state3,
  41. const uint state4, const uint state5, const uint state6, const uint state7,
  42. const uint state0A, const uint state0B,
  43. const uint state1A, const uint state2A, const uint state3A, const uint state4A,
  44. const uint state5A, const uint state6A, const uint state7A,
  45. __global uint * output)
  46. {
  47. u V[8];
  48. u W[16];
  49. #ifdef VECTORS4
  50. const u nonce = (uint)(get_local_id(0)) * 4U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base;
  51. #elif defined VECTORS2
  52. const u nonce = (uint)(get_local_id(0)) * 2U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base;
  53. #else
  54. const u nonce = (uint)(get_local_id(0)) + (uint)(get_group_id(0)) * (uint)(WORKSIZE) + base;
  55. #endif
  56. V[0] = PreVal0 + nonce;
  57. V[1] = B1;
  58. V[2] = C1;
  59. V[3] = D1A;
  60. V[4] = PreVal4 + nonce;
  61. V[5] = F1;
  62. V[6] = G1;
  63. V[7] = H1;
  64. V[7] += V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  65. V[3] = V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  66. V[6] += C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  67. V[2] = C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  68. V[5] += B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  69. V[1] = B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  70. V[4] += PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  71. V[0] = PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  72. V[3] += 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  73. V[7] = 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  74. V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  75. V[6] = 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  76. V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  77. V[5] = 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  78. V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  79. V[4] = 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  80. V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  81. V[3] = 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  82. V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  83. V[2] = 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  84. V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  85. V[1] = 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  86. V[4] += 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  87. V[0] = 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  88. V[3] += W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  89. V[7] = W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  90. V[2] += W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  91. V[6] = W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  92. //----------------------------------------------------------------------------------
  93. #ifdef VECTORS4
  94. W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U, rotr25(nonce.x) ^ 0x4008000U, rotr25(nonce.x) ^ 0x600c000U);
  95. #elif defined VECTORS2
  96. W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U);
  97. #else
  98. W[0] = PreW18 + rotr25(nonce);
  99. #endif
  100. W[1] = PreW19 + nonce;
  101. W[2] = 0x80000000U + rotr15(W[0]);
  102. W[3] = rotr15(W[1]);
  103. W[4] = 0x00000280U + rotr15(W[2]);
  104. W[5] = W16 + rotr15(W[3]);
  105. W[6] = W17 + rotr15(W[4]);
  106. W[7] = W[0] + rotr15(W[5]);
  107. W[8] = W[1] + rotr15(W[6]);
  108. W[9] = W[2] + rotr15(W[7]);
  109. W[10] = W[3] + rotr15(W[8]);
  110. W[11] = W[4] + rotr15(W[9]);
  111. W[12] = W[5] + 0x00a00055U + rotr15(W[10]);
  112. W[13] = W[6] + PreW31 + rotr15(W[11]);
  113. W[14] = W[7] + PreW32 + rotr15(W[12]);
  114. W[15] = W[8] + W17 + rotr15(W[13]) + rotr25(W[0]);
  115. V[1] += 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0];
  116. V[5] = 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0] + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  117. V[0] += 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  118. V[4] = 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  119. V[7] += 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  120. V[3] = 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  121. V[6] += 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  122. V[2] = 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  123. V[5] += 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  124. V[1] = 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  125. V[4] += 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  126. V[0] = 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  127. V[3] += 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  128. V[7] = 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  129. V[2] += 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  130. V[6] = 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  131. V[1] += 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  132. V[5] = 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  133. V[0] += 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  134. V[4] = 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  135. V[7] += 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  136. V[3] = 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  137. V[6] += 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  138. V[2] = 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  139. V[5] += 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  140. V[1] = 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  141. V[4] += 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  142. V[0] = 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  143. V[3] += 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  144. V[7] = 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  145. V[2] += 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  146. V[6] = 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  147. //----------------------------------------------------------------------------------
  148. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  149. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  150. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  151. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  152. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  153. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  154. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  155. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  156. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  157. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  158. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  159. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  160. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  161. W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]);
  162. W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]);
  163. W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]);
  164. V[1] += 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  165. V[5] = 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  166. V[0] += 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  167. V[4] = 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  168. V[7] += 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  169. V[3] = 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  170. V[6] += 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  171. V[2] = 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  172. V[5] += 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  173. V[1] = 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  174. V[4] += 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  175. V[0] = 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  176. V[3] += 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  177. V[7] = 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  178. V[2] += 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  179. V[6] = 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  180. V[1] += 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  181. V[5] = 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  182. V[0] += 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  183. V[4] = 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  184. V[7] += 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  185. V[3] = 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  186. V[6] += 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  187. V[2] = 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  188. V[5] += 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  189. V[1] = 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  190. V[4] += 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  191. V[0] = 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  192. V[3] += 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  193. V[7] = 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  194. V[2] += 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  195. V[6] = 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  196. //----------------------------------------------------------------------------------
  197. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  198. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  199. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  200. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  201. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  202. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  203. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  204. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  205. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  206. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  207. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  208. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  209. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  210. W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]);
  211. V[1] += 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  212. V[5] = 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  213. V[0] += 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  214. V[4] = 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  215. V[7] += 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  216. V[3] = 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  217. V[6] += 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  218. V[2] = 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  219. V[5] += 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  220. V[1] = 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  221. V[4] += 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  222. V[0] = 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  223. V[3] += 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  224. V[7] = 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  225. V[2] += 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  226. V[6] = 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  227. V[1] += 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  228. V[5] = 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  229. V[0] += 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  230. V[4] = 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  231. V[7] += 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  232. V[3] = 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  233. V[6] += 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  234. V[2] = 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  235. V[5] += 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  236. V[1] = 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  237. V[4] += 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  238. V[0] = 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  239. //----------------------------------------------------------------------------------
  240. W[0] = state0 + V[0] + rotr25(state1 + V[1]);
  241. W[1] = state1 + V[1] + 0x00a00000U + rotr25(state2 + V[2]);
  242. W[2] = state2 + V[2] + rotr15(W[0]) + rotr25(state3 + V[3]);
  243. W[3] = state3 + V[3] + rotr15(W[1]) + rotr25(state4 + V[4]);
  244. W[4] = state4 + V[4] + rotr15(W[2]) + rotr25(state5 + V[5]);
  245. W[5] = state5 + V[5] + rotr15(W[3]) + rotr25(state6 + V[6]);
  246. W[6] = state6 + V[6] + 0x00000100U + rotr15(W[4]) + rotr25(state7 + V[7]);
  247. W[7] = state7 + V[7] + W[0] + 0x11002000U + rotr15(W[5]);
  248. W[8] = W[1] + 0x80000000U + rotr15(W[6]);
  249. W[9] = W[2] + rotr15(W[7]);
  250. W[10] = W[3] + rotr15(W[8]);
  251. W[11] = W[4] + rotr15(W[9]);
  252. W[12] = W[5] + rotr15(W[10]);
  253. W[13] = W[6] + rotr15(W[11]);
  254. W[14] = W[7] + 0x00400022U + rotr15(W[12]);
  255. W[15] = W[8] + 0x00000100U + rotr15(W[13]) + rotr25(W[0]);
  256. // 0x71374491U + 0x1f83d9abU + state1
  257. const u state1AaddV1 = state1A + V[1];
  258. // 0xb5c0fbcfU + 0x9b05688cU + state2
  259. const u state2AaddV2 = state2A + V[2];
  260. // 0x510e527fU + 0xe9b5dba5U + state3
  261. const u state3AaddV3 = state3A + V[3];
  262. // 0x3956c25bU + state4
  263. const u state4AaddV4 = state4A + V[4];
  264. // 0x59f111f1U + state5
  265. const u state5AaddV5 = state5A + V[5];
  266. // 0x923f82a4U + state6
  267. const u state6AaddV6 = state6A + V[6];
  268. // 0xab1c5ed5U + state7
  269. const u state7AaddV7 = state7A + V[7];
  270. // 0x98c7e2a2U + state0
  271. V[3] = state0A + V[0];
  272. // 0xfc08884dU + state0
  273. V[7] = state0B + V[0];
  274. V[0] = 0x6a09e667U;
  275. V[1] = 0xbb67ae85U;
  276. V[2] = 0x3c6ef372U;
  277. V[4] = 0x510e527fU;
  278. V[5] = 0x9b05688cU;
  279. V[6] = 0x1f83d9abU;
  280. V[2] += state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  281. V[6] = state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  282. V[1] += state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  283. V[5] = state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  284. V[0] += state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  285. V[4] = state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  286. V[7] += state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  287. V[3] = state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  288. V[6] += state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  289. V[2] = state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  290. V[5] += state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  291. V[1] = state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  292. V[4] += state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  293. V[0] = state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  294. V[3] += 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  295. V[7] = 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  296. V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  297. V[6] = 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  298. V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  299. V[5] = 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  300. V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  301. V[4] = 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  302. V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  303. V[3] = 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  304. V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  305. V[2] = 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  306. V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  307. V[1] = 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  308. V[4] += 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  309. V[0] = 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  310. V[3] += 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  311. V[7] = 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  312. V[2] += 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  313. V[6] = 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  314. V[1] += 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  315. V[5] = 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  316. V[0] += 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  317. V[4] = 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  318. V[7] += 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  319. V[3] = 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  320. V[6] += 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  321. V[2] = 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  322. V[5] += 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  323. V[1] = 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  324. V[4] += 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  325. V[0] = 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  326. V[3] += 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  327. V[7] = 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  328. V[2] += 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  329. V[6] = 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  330. V[1] += 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  331. V[5] = 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  332. V[0] += 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  333. V[4] = 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  334. V[7] += 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  335. V[3] = 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  336. V[6] += 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  337. V[2] = 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  338. V[5] += 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  339. V[1] = 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  340. V[4] += 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  341. V[0] = 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  342. //----------------------------------------------------------------------------------
  343. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  344. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  345. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  346. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  347. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  348. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  349. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  350. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  351. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  352. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  353. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  354. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  355. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  356. W[13] = W[13] + W[6] + rotr15(W[11]) + rotr25(W[14]);
  357. W[14] = W[14] + W[7] + rotr15(W[12]) + rotr25(W[15]);
  358. W[15] = W[15] + W[8] + rotr15(W[13]) + rotr25( W[0]);
  359. V[3] += 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  360. V[7] = 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  361. V[2] += 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  362. V[6] = 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  363. V[1] += 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  364. V[5] = 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  365. V[0] += 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  366. V[4] = 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  367. V[7] += 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  368. V[3] = 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  369. V[6] += 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  370. V[2] = 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  371. V[5] += 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  372. V[1] = 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  373. V[4] += 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  374. V[0] = 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  375. V[3] += 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  376. V[7] = 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  377. V[2] += 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  378. V[6] = 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  379. V[1] += 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  380. V[5] = 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  381. V[0] += 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  382. V[4] = 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  383. V[7] += 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  384. V[3] = 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  385. V[6] += 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  386. V[2] = 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  387. V[5] += 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  388. V[1] = 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  389. V[4] += 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  390. V[0] = 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  391. //----------------------------------------------------------------------------------
  392. W[0] = W[0] + W[9] + rotr15(W[14]) + rotr25( W[1]);
  393. W[1] = W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
  394. W[2] = W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
  395. W[3] = W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
  396. W[4] = W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
  397. W[5] = W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
  398. W[6] = W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
  399. W[7] = W[7] + W[0] + rotr15( W[5]) + rotr25( W[8]);
  400. W[8] = W[8] + W[1] + rotr15( W[6]) + rotr25( W[9]);
  401. W[9] = W[9] + W[2] + rotr15( W[7]) + rotr25(W[10]);
  402. W[10] = W[10] + W[3] + rotr15( W[8]) + rotr25(W[11]);
  403. W[11] = W[11] + W[4] + rotr15( W[9]) + rotr25(W[12]);
  404. W[12] = W[12] + W[5] + rotr15(W[10]) + rotr25(W[13]);
  405. V[3] += 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  406. V[7] = 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  407. V[2] += 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  408. V[6] = 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
  409. V[1] += 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  410. V[5] = 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
  411. V[0] += 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  412. V[4] = 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
  413. V[7] += 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  414. V[3] = 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
  415. V[6] += 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
  416. V[2] = 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
  417. V[5] += 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
  418. V[1] = 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
  419. V[4] += 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
  420. V[0] = 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
  421. V[3] += 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
  422. V[7] = 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
  423. V[2] += 0x78a5636fU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
  424. V[1] += 0x84c87814U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
  425. V[0] += 0x8cc70208U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
  426. V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
  427. #define FOUND (0x80)
  428. #define NFLAG (0x7F)
  429. #ifdef VECTORS4
  430. if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU))
  431. output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : ((V[7].y == 0x136032edU) ? nonce.y : ((V[7].z == 0x136032edU) ? nonce.z : nonce.w));
  432. #elif defined VECTORS2
  433. if ((V[7].x == 0x136032edU) + (V[7].y == 0x136032edU))
  434. output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : nonce.y;
  435. #else
  436. if (V[7] == 0x136032edU)
  437. output[FOUND] = output[NFLAG & nonce] = nonce;
  438. #endif
  439. }