phatk110722.cl 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. // This file is taken and modified from the public-domain poclbm project, and
  2. // we have therefore decided to keep it public-domain in Phoenix.
  3. // 2011-07-12: further modified by Diapolo and still public-domain
  4. // -ck version to be compatible with cgminer
  5. // 2011-07-14: shorter code
  6. #define VECTORSX
  7. #ifdef VECTORS4
  8. typedef uint4 u;
  9. #elif defined VECTORS2
  10. typedef uint2 u;
  11. #else
  12. typedef uint u;
  13. #endif
  14. __constant uint K[64] = {
  15. 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  16. 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  17. 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  18. 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  19. 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  20. 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  21. 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  22. 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  23. };
  24. // H[6] = 0x08909ae5U + 0xb0edbdd0 + K[0] == 0xfc08884d
  25. // H[7] = -0x5be0cd19 - (0x90befffa) K[60] == -0xec9fcd13
  26. __constant uint H[8] = {
  27. 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0xfc08884d, 0xec9fcd13
  28. };
  29. // L = 0xa54ff53a + 0xb0edbdd0 + K[0] == 0x198c7e2a2
  30. __constant ulong L = 0x198c7e2a2;
  31. #define BFI_INTX
  32. #define BITALIGNX
  33. #define O 15
  34. #ifdef BITALIGN
  35. #pragma OPENCL EXTENSION cl_amd_media_ops : enable
  36. #define rot(x, y) amd_bitalign(x, x, (u)(32 - y))
  37. #else
  38. #define rot(x, y) rotate(x, (u)y)
  39. #endif
  40. #ifdef BFI_INT
  41. #define Ch(x, y, z) amd_bytealign(x, y, z)
  42. #else
  43. #define Ch(x, y, z) bitselect(z, y, x)
  44. #endif
  45. // Ma now uses the Ch function, if BFI_INT is enabled, the optimized Ch version is used
  46. #define Ma(x, y, z) Ch((z ^ x), y, x)
  47. // Various intermediate calculations for each SHA round
  48. #define s0(n) (rot(Vals[(128 - n) % 8], 30) ^ rot(Vals[(128 - n) % 8], 19) ^ rot(Vals[(128 - n) % 8], 10))
  49. #define s1(n) (rot(Vals[(132 - n) % 8], 26) ^ rot(Vals[(132 - n) % 8], 21) ^ rot(Vals[(132 - n) % 8], 7))
  50. #define ch(n) (Ch(Vals[(132 - n) % 8], Vals[(133 - n) % 8], Vals[(134 - n) % 8]))
  51. #define ma(n) (Ma(Vals[(129 - n) % 8], Vals[(130 - n) % 8], Vals[(128 - n) % 8]))
  52. #define t1(n) (K[n % 64] + Vals[(135 - n) % 8] + W[n - O] + s1(n) + ch(n))
  53. #define t1_no_W(n) (K[n % 64] + Vals[(135 - n) % 8] + s1(n) + ch(n))
  54. // intermediate W calculations
  55. #define P1(x) (rot(W[x - 2 - O], 15) ^ rot(W[x - 2 - O], 13) ^ (W[x - 2 - O] >> 10U))
  56. #define P2(x) (rot(W[x - 15 - O], 25) ^ rot(W[x - 15 - O], 14) ^ (W[x - 15 - O] >> 3U))
  57. #define P3(x) W[x - 7 - O]
  58. #define P4(x) W[x - 16 - O]
  59. // full W calculation
  60. #define W(x) (W[x - O] = P4(x) + P3(x) + P2(x) + P1(x))
  61. // SHA round without W calc
  62. #define sharound(n) { Vals[(131 - n) % 8] += t1(n); Vals[(135 - n) % 8] = t1(n) + s0(n) + ma(n); }
  63. #define sharound_no_W(n) { Vals[(131 - n) % 8] += t1_no_W(n); Vals[(135 - n) % 8] = t1_no_W(n) + s0(n) + ma(n); }
  64. __kernel void search( const uint state0, const uint state1, const uint state2, const uint state3,
  65. const uint state4, const uint state5, const uint state6, const uint state7,
  66. const uint B1, const uint C1, const uint C1addK5, const uint D1,
  67. const uint F1, const uint G1, const uint H1,
  68. const uint base,
  69. const uint W2,
  70. const uint W16, const uint W17, const uint W17_2,
  71. const uint PreVal4addT1, const uint T1substate0,
  72. __global uint * output)
  73. {
  74. u W[124 - O];
  75. u Vals[8];
  76. #ifdef VECTORS4
  77. u W_3 = base + (get_global_id(0) << 2) + (uint4)(0, 1, 2, 3);
  78. #elif defined VECTORS2
  79. u W_3 = base + (get_global_id(0) << 1) + (uint2)(0, 1);
  80. #else
  81. u W_3 = base + get_global_id(0);
  82. #endif
  83. u Temp;
  84. Vals[0] = W_3 + PreVal4addT1 + T1substate0;
  85. Vals[1] = B1;
  86. Vals[2] = C1;
  87. Vals[4] = W_3 + PreVal4addT1;
  88. Vals[5] = F1;
  89. Vals[6] = G1;
  90. // used in: P2(19) == 285220864 (0x11002000), P4(20)
  91. // W[4] = 0x80000000U;
  92. // P1(x) is 0 for x == 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
  93. // P2(x) is 0 for x == 20, 21, 22, 23, 24, 25, 26, 27, 28, 29
  94. // P3(x) is 0 for x == 12, 13, 14, 15, 16, 17, 18, 19, 20, 21
  95. // P4(x) is 0 for x == 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
  96. // W[x] in sharound(x) is 0 for x == 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
  97. // W[14] = W[13] = W[12] = W[11] = W[10] = W[9] = W[8] = W[7] = W[6] = W[5] = 0x00000000U;
  98. // used in: P2(30) == 10485845 (0xA00055), P3(22), P4(31)
  99. // K[15] + W[15] == 0xc19bf174 + 0x00000280U = 0xc19bf3f4
  100. W[15 - O] = 0x00000280U;
  101. W[16 - O] = W16;
  102. W[17 - O] = W17;
  103. W[18 - O] = W2 + (rot(W_3, 25) ^ rot(W_3, 14) ^ (W_3 >> 3U));
  104. W[19 - O] = W_3 + W17_2;
  105. W[20 - O] = (u)0x80000000U + P1(20);
  106. W[21 - O] = P1(21);
  107. W[22 - O] = P1(22) + P3(22);
  108. W[23 - O] = P1(23) + P3(23);
  109. W[24 - O] = P1(24) + P3(24);
  110. W[25 - O] = P1(25) + P3(25);
  111. W[26 - O] = P1(26) + P3(26);
  112. W[27 - O] = P1(27) + P3(27);
  113. W[28 - O] = P1(28) + P3(28);
  114. W[29 - O] = P1(29) + P3(29);
  115. W[30 - O] = (u)0xA00055 + P1(30) + P3(30);
  116. // Round 4
  117. Temp = D1 + ch(4) + s1(4);
  118. Vals[7] = Temp + H1;
  119. Vals[3] = Temp + ma(4) + s0(4);
  120. // Round 5
  121. Temp = C1addK5 + ch(5) + s1(5);
  122. Vals[6] = Temp + G1;
  123. Vals[2] = Temp + ma(5) + s0(5);
  124. // W[6] to W[14] are 0, so no need to add them!
  125. sharound_no_W(6);
  126. sharound_no_W(7);
  127. sharound_no_W(8);
  128. sharound_no_W(9);
  129. sharound_no_W(10);
  130. sharound_no_W(11);
  131. sharound_no_W(12);
  132. sharound_no_W(13);
  133. sharound_no_W(14);
  134. // #define sharound(n) { Vals[(131 - n) % 8] += t1(n); Vals[(135 - n) % 8] = t1(n) + s0(n) + ma(n); }
  135. // #define t1(n) (K[n % 64] + Vals[(135 - n) % 8] + W[n] + s1(n) + ch(n))
  136. // Vals[(131 - 15) % 8] += (Vals[(135 - 15) % 8] = (u)0xc19bf3f4 + Vals[(135 - 15) % 8] + s1(15) + ch(15));
  137. // Vals[(135 - 15) % 8] += s0(15) + ma(15);
  138. sharound(15);
  139. sharound(16);
  140. sharound(17);
  141. sharound(18);
  142. sharound(19);
  143. sharound(20);
  144. sharound(21);
  145. sharound(22);
  146. sharound(23);
  147. sharound(24);
  148. sharound(25);
  149. sharound(26);
  150. sharound(27);
  151. sharound(28);
  152. sharound(29);
  153. sharound(30);
  154. W(31);
  155. sharound(31);
  156. W(32);
  157. sharound(32);
  158. W(33);
  159. sharound(33);
  160. W(34);
  161. sharound(34);
  162. W(35);
  163. sharound(35);
  164. W(36);
  165. sharound(36);
  166. W(37);
  167. sharound(37);
  168. W(38);
  169. sharound(38);
  170. W(39);
  171. sharound(39);
  172. W(40);
  173. sharound(40);
  174. W(41);
  175. sharound(41);
  176. W(42);
  177. sharound(42);
  178. W(43);
  179. sharound(43);
  180. W(44);
  181. sharound(44);
  182. W(45);
  183. sharound(45);
  184. W(46);
  185. sharound(46);
  186. W(47);
  187. sharound(47);
  188. W(48);
  189. sharound(48);
  190. W(49);
  191. sharound(49);
  192. W(50);
  193. sharound(50);
  194. W(51);
  195. sharound(51);
  196. W(52);
  197. sharound(52);
  198. W(53);
  199. sharound(53);
  200. W(54);
  201. sharound(54);
  202. W(55);
  203. sharound(55);
  204. W(56);
  205. sharound(56);
  206. W(57);
  207. sharound(57);
  208. W(58);
  209. sharound(58);
  210. W(59);
  211. sharound(59);
  212. W(60);
  213. sharound(60);
  214. W(61);
  215. sharound(61);
  216. W(62);
  217. sharound(62);
  218. W(63);
  219. sharound(63);
  220. W[64 - O] = state0 + Vals[0];
  221. W[65 - O] = state1 + Vals[1];
  222. W[66 - O] = state2 + Vals[2];
  223. W[67 - O] = state3 + Vals[3];
  224. W[68 - O] = state4 + Vals[4];
  225. W[69 - O] = state5 + Vals[5];
  226. W[70 - O] = state6 + Vals[6];
  227. W[71 - O] = state7 + Vals[7];
  228. // used in: P2(87) = 285220864 (0x11002000), P4(88)
  229. // K[72] + W[72] ==
  230. W[72 - O] = 0x80000000U;
  231. // P1(x) is 0 for x == 75, 76, 77, 78, 79, 80
  232. // P2(x) is 0 for x == 88, 89, 90, 91, 92, 93
  233. // P3(x) is 0 for x == 80, 81, 82, 83, 84, 85
  234. // P4(x) is 0 for x == 89, 90, 91, 92, 93, 94
  235. // W[x] in sharound(x) is 0 for x == 73, 74, 75, 76, 77, 78
  236. // W[78] = W[77] = W[76] = W[75] = W[74] = W[73] = 0x00000000U;
  237. // used in: P1(81) = 10485760 (0xA00000), P2(94) = 4194338 (0x400022), P3(86), P4(95)
  238. // K[79] + W[79] ==
  239. W[79 - O] = 0x00000100U;
  240. Vals[0] = H[0];
  241. Vals[1] = H[1];
  242. Vals[2] = H[2];
  243. Vals[3] = (u)L + W[64 - O];
  244. Vals[4] = H[3];
  245. Vals[5] = H[4];
  246. Vals[6] = H[5];
  247. Vals[7] = H[6] + W[64 - O];
  248. sharound(65);
  249. sharound(66);
  250. sharound(67);
  251. sharound(68);
  252. sharound(69);
  253. sharound(70);
  254. sharound(71);
  255. sharound(72);
  256. // W is also zero for these rounds
  257. sharound_no_W(73);
  258. sharound_no_W(74);
  259. sharound_no_W(75);
  260. sharound_no_W(76);
  261. sharound_no_W(77);
  262. sharound_no_W(78);
  263. sharound(79);
  264. W[80 - O] = P2(80) + P4(80);
  265. W[81 - O] = (u)0xA00000 + P4(81) + P2(81);
  266. W[82 - O] = P4(82) + P2(82) + P1(82);
  267. W[83 - O] = P4(83) + P2(83) + P1(83);
  268. W[84 - O] = P4(84) + P2(84) + P1(84);
  269. W[85 - O] = P4(85) + P2(85) + P1(85);
  270. W(86);
  271. sharound(80);
  272. sharound(81);
  273. sharound(82);
  274. sharound(83);
  275. sharound(84);
  276. sharound(85);
  277. sharound(86);
  278. W[87 - O] = (u)0x11002000 + P4(87) + P3(87) + P1(87);
  279. sharound(87);
  280. W[88 - O] = (u)0x80000000U + P3(88) + P1(88);
  281. sharound(88);
  282. W[89 - O] = P3(89) + P1(89);
  283. sharound(89);
  284. W[90 - O] = P3(90) + P1(90);
  285. sharound(90);
  286. W[91 - O] = P3(91) + P1(91);
  287. sharound(91);
  288. W[92 - O] = P3(92) + P1(92);
  289. sharound(92);
  290. W[93 - O] = P3(93) + P1(93);
  291. sharound(93);
  292. W[94 - O] = (u)0x400022 + P3(94) + P1(94);
  293. sharound(94);
  294. W[95 - O] = (u)0x00000100U + P3(95) + P2(95) + P1(95);
  295. sharound(95);
  296. W(96);
  297. sharound(96);
  298. W(97);
  299. sharound(97);
  300. W(98);
  301. sharound(98);
  302. W(99);
  303. sharound(99);
  304. W(100);
  305. sharound(100);
  306. W(101);
  307. sharound(101);
  308. W(102);
  309. sharound(102);
  310. W(103);
  311. sharound(103);
  312. W(104);
  313. sharound(104);
  314. W(105);
  315. sharound(105);
  316. W(106);
  317. sharound(106);
  318. W(107);
  319. sharound(107);
  320. W(108);
  321. sharound(108);
  322. W(109);
  323. sharound(109);
  324. W(110);
  325. sharound(110);
  326. W(111);
  327. sharound(111);
  328. W(112);
  329. sharound(112);
  330. W(113);
  331. sharound(113);
  332. W(114);
  333. sharound(114);
  334. W(115);
  335. sharound(115);
  336. W(116);
  337. sharound(116);
  338. W(117);
  339. sharound(117);
  340. W(118);
  341. sharound(118);
  342. W(119);
  343. sharound(119);
  344. W(120);
  345. sharound(120);
  346. W(121);
  347. sharound(121);
  348. W(122);
  349. sharound(122);
  350. W(123);
  351. sharound(123);
  352. // Round 124
  353. Vals[7] += Vals[3] + P4(124) + P3(124) + P2(124) + P1(124) + s1(124) + ch(124);
  354. #define MAXBUFFERS (4095)
  355. #define NFLAG (0xFFEUL)
  356. #if defined(VECTORS4) || defined(VECTORS2)
  357. if (Vals[7].x == -H[7])
  358. {
  359. output[MAXBUFFERS] = output[NFLAG & W[3].x] = W_3.x;
  360. }
  361. if (Vals[7].y == -H[7])
  362. {
  363. output[MAXBUFFERS] = output[NFLAG & W[3].y] = W_3.y;
  364. }
  365. #ifdef VECTORS4
  366. if (Vals[7].z == -H[7])
  367. {
  368. output[MAXBUFFERS] = output[NFLAG & W[3].z] = W_3.z;
  369. }
  370. if (Vals[7].w == -H[7])
  371. {
  372. output[MAXBUFFERS] = output[NFLAG & W[3].w] = W_3.w;
  373. }
  374. #endif
  375. #else
  376. if (Vals[7] == -H[7])
  377. {
  378. output[MAXBUFFERS] = output[NFLAG & W[3]] = W_3;
  379. }
  380. #endif
  381. }