Browse Source

Merge branch 'master' into align_temps

Conflicts:
	cgminer.c
Luke Dashjr 13 years ago
parent
commit
0d92069760
46 changed files with 10178 additions and 2986 deletions
  1. 2 1
      ADL_SDK/readme.txt
  2. 1 1
      AUTHORS
  3. 623 289
      COPYING
  4. 1 1
      LICENSE
  5. 42 32
      Makefile.am
  6. 523 2
      NEWS
  7. 228 77
      README
  8. 318 102
      adl.c
  9. 4 2
      adl.h
  10. 1 1
      api-example.c
  11. 573 69
      api.c
  12. 93 978
      cgminer.c
  13. 10 1
      compat.h
  14. 98 19
      configure.ac
  15. 1274 0
      diablo120328.cl
  16. 581 0
      diakgcn120223.cl
  17. 164 58
      driver-bitforce.c
  18. 842 0
      driver-cpu.c
  19. 59 0
      driver-cpu.h
  20. 341 0
      driver-icarus.c
  21. 1447 0
      driver-opencl.c
  22. 29 0
      driver-opencl.h
  23. 1 3
      example.conf
  24. 30 22
      findnonce.c
  25. 176 0
      logging.c
  26. 38 0
      logging.h
  27. 110 130
      miner.h
  28. 535 92
      miner.php
  29. 277 227
      ocl.c
  30. 8 3
      ocl.h
  31. 45 44
      phatk120223.cl
  32. 0 650
      poclbm110817.cl
  33. 1353 0
      poclbm120327.cl
  34. 11 11
      sha2.c
  35. 4 2
      sha2.h
  36. 1 2
      sha256_4way.c
  37. 1 2
      sha256_altivec_4way.c
  38. 1 3
      sha256_sse2_amd64.c
  39. 1 3
      sha256_sse2_i386.c
  40. 1 3
      sha256_sse4_amd64.c
  41. 1 1
      sha256_via.c
  42. 85 155
      util.c
  43. 224 0
      windows-build.txt
  44. 7 0
      x86_32/sha256_xmm.asm
  45. 7 0
      x86_64/sha256_sse4_amd64.asm
  46. 7 0
      x86_64/sha256_xmm_amd64.asm

+ 2 - 1
ADL_SDK/readme.txt

@@ -1,2 +1,3 @@
 Please insert AMD ADL files adl_defines.h adl_sdk.h adl_structures.h here.
-(http://developer.amd.com/gpu/adlsdk/Pages/default.aspx)
+They can be found here:
+http://developer.amd.com/sdks/ADLSDK/Pages/default.aspx

+ 1 - 1
AUTHORS

@@ -1,4 +1,4 @@
 Original CPU mining software: Jeff Garzik <jgarzik@pobox.com>
 GPU mining and rewrite: Con Kolivas <kernel@kolivas.org> 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ
 BitFORCE FPGA mining and refactor: Luke Dashjr <luke-jr+cgminer@utopios.org> 1NbRmS6a4dniwHHoSS9v3tEYUpP1Z5VVdL
-API+: Andrew Smith <kanoi@kano-kun.net> 1Jjk2LmktEQKnv8r2cZ9MvLiZwZ9gxabKm
+API+: Andrew Smith <kanoi2@kano-kun.net> 1Jjk2LmktEQKnv8r2cZ9MvLiZwZ9gxabKm

+ 623 - 289
COPYING

@@ -1,285 +1,626 @@
-		    GNU GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 3, 29 June 2007
 
- Copyright (C) 1989, 1991 Free Software Foundation, Inc.
-     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  Everyone is permitted to copy and distribute verbatim copies
  of this license document, but changing it is not allowed.
 
-			    Preamble
+                            Preamble
 
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-License is intended to guarantee your freedom to share and change free
-software--to make sure the software is free for all its users.  This
-General Public License applies to most of the Free Software
-Foundation's software and to any other program whose authors commit to
-using it.  (Some other Free Software Foundation software is covered by
-the GNU Library General Public License instead.)  You can apply it to
+  The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+  The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works.  By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users.  We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors.  You can apply it to
 your programs, too.
 
   When we speak of free software, we are referring to freedom, not
 price.  Our General Public Licenses are designed to make sure that you
 have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
 
-  To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if you
-distribute copies of the software, or if you modify it.
+  To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights.  Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
 
   For example, if you distribute copies of such a program, whether
-gratis or for a fee, you must give the recipients all the rights that
-you have.  You must make sure that they, too, receive or can get the
-source code.  And you must show them these terms so they know their
-rights.
-
-  We protect your rights with two steps: (1) copyright the software, and
-(2) offer you this license which gives you legal permission to copy,
-distribute and/or modify the software.
-
-  Also, for each author's protection and ours, we want to make certain
-that everyone understands that there is no warranty for this free
-software.  If the software is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original, so
-that any problems introduced by others will not reflect on the original
-authors' reputations.
-
-  Finally, any free program is threatened constantly by software
-patents.  We wish to avoid the danger that redistributors of a free
-program will individually obtain patent licenses, in effect making the
-program proprietary.  To prevent this, we have made it clear that any
-patent must be licensed for everyone's free use or not licensed at all.
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received.  You must make sure that they, too, receive
+or can get the source code.  And you must show them these terms so they
+know their rights.
+
+  Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+  For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software.  For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+  Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so.  This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software.  The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable.  Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products.  If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+  Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary.  To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
 
   The precise terms and conditions for copying, distribution and
 modification follow.
-
-		    GNU GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License applies to any program or other work which contains
-a notice placed by the copyright holder saying it may be distributed
-under the terms of this General Public License.  The "Program", below,
-refers to any such program or work, and a "work based on the Program"
-means either the Program or any derivative work under copyright law:
-that is to say, a work containing the Program or a portion of it,
-either verbatim or with modifications and/or translated into another
-language.  (Hereinafter, translation is included without limitation in
-the term "modification".)  Each licensee is addressed as "you".
-
-Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running the Program is not restricted, and the output from the Program
-is covered only if its contents constitute a work based on the
-Program (independent of having been made by running the Program).
-Whether that is true depends on what the Program does.
-
-  1. You may copy and distribute verbatim copies of the Program's
-source code as you receive it, in any medium, provided that you
-conspicuously and appropriately publish on each copy an appropriate
-copyright notice and disclaimer of warranty; keep intact all the
-notices that refer to this License and to the absence of any warranty;
-and give any other recipients of the Program a copy of this License
-along with the Program.
-
-You may charge a fee for the physical act of transferring a copy, and
-you may at your option offer warranty protection in exchange for a fee.
-
-  2. You may modify your copy or copies of the Program or any portion
-of it, thus forming a work based on the Program, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) You must cause the modified files to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    b) You must cause any work that you distribute or publish, that in
-    whole or in part contains or is derived from the Program or any
-    part thereof, to be licensed as a whole at no charge to all third
-    parties under the terms of this License.
-
-    c) If the modified program normally reads commands interactively
-    when run, you must cause it, when started running for such
-    interactive use in the most ordinary way, to print or display an
-    announcement including an appropriate copyright notice and a
-    notice that there is no warranty (or else, saying that you provide
-    a warranty) and that users may redistribute the program under
-    these conditions, and telling the user how to view a copy of this
-    License.  (Exception: if the Program itself is interactive but
-    does not normally print such an announcement, your work based on
-    the Program is not required to print an announcement.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Program,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Program, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Program.
-
-In addition, mere aggregation of another work not based on the Program
-with the Program (or with a work based on the Program) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may copy and distribute the Program (or a work based on it,
-under Section 2) in object code or executable form under the terms of
-Sections 1 and 2 above provided that you also do one of the following:
-
-    a) Accompany it with the complete corresponding machine-readable
-    source code, which must be distributed under the terms of Sections
-    1 and 2 above on a medium customarily used for software interchange; or,
-
-    b) Accompany it with a written offer, valid for at least three
-    years, to give any third party, for a charge no more than your
-    cost of physically performing source distribution, a complete
-    machine-readable copy of the corresponding source code, to be
-    distributed under the terms of Sections 1 and 2 above on a medium
-    customarily used for software interchange; or,
-
-    c) Accompany it with the information you received as to the offer
-    to distribute corresponding source code.  (This alternative is
-    allowed only for noncommercial distribution and only if you
-    received the program in object code or executable form with such
-    an offer, in accord with Subsection b above.)
-
-The source code for a work means the preferred form of the work for
-making modifications to it.  For an executable work, complete source
-code means all the source code for all modules it contains, plus any
-associated interface definition files, plus the scripts used to
-control compilation and installation of the executable.  However, as a
-special exception, the source code distributed need not include
-anything that is normally distributed (in either source or binary
-form) with the major components (compiler, kernel, and so on) of the
-operating system on which the executable runs, unless that component
-itself accompanies the executable.
-
-If distribution of executable or object code is made by offering
-access to copy from a designated place, then offering equivalent
-access to copy the source code from the same place counts as
-distribution of the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  4. You may not copy, modify, sublicense, or distribute the Program
-except as expressly provided under this License.  Any attempt
-otherwise to copy, modify, sublicense or distribute the Program is
-void, and will automatically terminate your rights under this License.
-However, parties who have received copies, or rights, from you under
-this License will not have their licenses terminated so long as such
-parties remain in full compliance.
-
-  5. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Program or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Program (or any work based on the
-Program), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Program or works based on it.
-
-  6. Each time you redistribute the Program (or any work based on the
-Program), the recipient automatically receives a license from the
-original licensor to copy, distribute or modify the Program subject to
-these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
+
+                       TERMS AND CONDITIONS
+
+  0. Definitions.
+
+  "This License" refers to version 3 of the GNU General Public License.
+
+  "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+  "The Program" refers to any copyrightable work licensed under this
+License.  Each licensee is addressed as "you".  "Licensees" and
+"recipients" may be individuals or organizations.
+
+  To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy.  The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+  A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+  To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy.  Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+  To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies.  Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+  An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License.  If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+  1. Source Code.
+
+  The "source code" for a work means the preferred form of the work
+for making modifications to it.  "Object code" means any non-source
+form of a work.
+
+  A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+  The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form.  A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+  The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities.  However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work.  For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+  The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+  The Corresponding Source for a work in source code form is that
+same work.
+
+  2. Basic Permissions.
+
+  All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met.  This License explicitly affirms your unlimited
+permission to run the unmodified Program.  The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work.  This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+  You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force.  You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright.  Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+  Conveying under any other circumstances is permitted solely under
+the conditions stated below.  Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+  3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+  No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+  When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+  4. Conveying Verbatim Copies.
+
+  You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+  You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+  5. Conveying Modified Source Versions.
+
+  You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+    a) The work must carry prominent notices stating that you modified
+    it, and giving a relevant date.
+
+    b) The work must carry prominent notices stating that it is
+    released under this License and any conditions added under section
+    7.  This requirement modifies the requirement in section 4 to
+    "keep intact all notices".
+
+    c) You must license the entire work, as a whole, under this
+    License to anyone who comes into possession of a copy.  This
+    License will therefore apply, along with any applicable section 7
+    additional terms, to the whole of the work, and all its parts,
+    regardless of how they are packaged.  This License gives no
+    permission to license the work in any other way, but it does not
+    invalidate such permission if you have separately received it.
+
+    d) If the work has interactive user interfaces, each must display
+    Appropriate Legal Notices; however, if the Program has interactive
+    interfaces that do not display Appropriate Legal Notices, your
+    work need not make them do so.
+
+  A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit.  Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+  6. Conveying Non-Source Forms.
+
+  You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+    a) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by the
+    Corresponding Source fixed on a durable physical medium
+    customarily used for software interchange.
+
+    b) Convey the object code in, or embodied in, a physical product
+    (including a physical distribution medium), accompanied by a
+    written offer, valid for at least three years and valid for as
+    long as you offer spare parts or customer support for that product
+    model, to give anyone who possesses the object code either (1) a
+    copy of the Corresponding Source for all the software in the
+    product that is covered by this License, on a durable physical
+    medium customarily used for software interchange, for a price no
+    more than your reasonable cost of physically performing this
+    conveying of source, or (2) access to copy the
+    Corresponding Source from a network server at no charge.
+
+    c) Convey individual copies of the object code with a copy of the
+    written offer to provide the Corresponding Source.  This
+    alternative is allowed only occasionally and noncommercially, and
+    only if you received the object code with such an offer, in accord
+    with subsection 6b.
+
+    d) Convey the object code by offering access from a designated
+    place (gratis or for a charge), and offer equivalent access to the
+    Corresponding Source in the same way through the same place at no
+    further charge.  You need not require recipients to copy the
+    Corresponding Source along with the object code.  If the place to
+    copy the object code is a network server, the Corresponding Source
+    may be on a different server (operated by you or a third party)
+    that supports equivalent copying facilities, provided you maintain
+    clear directions next to the object code saying where to find the
+    Corresponding Source.  Regardless of what server hosts the
+    Corresponding Source, you remain obligated to ensure that it is
+    available for as long as needed to satisfy these requirements.
+
+    e) Convey the object code using peer-to-peer transmission, provided
+    you inform other peers where the object code and Corresponding
+    Source of the work are being offered to the general public at no
+    charge under subsection 6d.
+
+  A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+  A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling.  In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage.  For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product.  A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+  "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source.  The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+  If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information.  But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+  The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed.  Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+  Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+  7. Additional Terms.
+
+  "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law.  If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+  When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it.  (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.)  You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+  Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+    a) Disclaiming warranty or limiting liability differently from the
+    terms of sections 15 and 16 of this License; or
+
+    b) Requiring preservation of specified reasonable legal notices or
+    author attributions in that material or in the Appropriate Legal
+    Notices displayed by works containing it; or
+
+    c) Prohibiting misrepresentation of the origin of that material, or
+    requiring that modified versions of such material be marked in
+    reasonable ways as different from the original version; or
+
+    d) Limiting the use for publicity purposes of names of licensors or
+    authors of the material; or
+
+    e) Declining to grant rights under trademark law for use of some
+    trade names, trademarks, or service marks; or
+
+    f) Requiring indemnification of licensors and authors of that
+    material by anyone who conveys the material (or modified versions of
+    it) with contractual assumptions of liability to the recipient, for
+    any liability that these contractual assumptions directly impose on
+    those licensors and authors.
+
+  All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10.  If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term.  If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+  If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+  Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+  8. Termination.
+
+  You may not propagate or modify a covered work except as expressly
+provided under this License.  Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+  However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+  Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+  Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License.  If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+  9. Acceptance Not Required for Having Copies.
+
+  You are not required to accept this License in order to receive or
+run a copy of the Program.  Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance.  However,
+nothing other than this License grants you permission to propagate or
+modify any covered work.  These actions infringe copyright if you do
+not accept this License.  Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+  10. Automatic Licensing of Downstream Recipients.
+
+  Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License.  You are not responsible
+for enforcing compliance by third parties with this License.
+
+  An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations.  If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+  You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License.  For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+  11. Patents.
+
+  A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based.  The
+work thus licensed is called the contributor's "contributor version".
+
+  A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version.  For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
 this License.
 
-  7. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
+  Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+  In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement).  To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+  If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients.  "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+  If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+  A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License.  You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+  Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+  12. No Surrender of Others' Freedom.
+
+  If conditions are imposed on you (whether by court order, agreement or
 otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Program at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Program by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Program.
-
-If any portion of this section is held invalid or unenforceable under
-any particular circumstance, the balance of the section is intended to
-apply and the section as a whole is intended to apply in other
-circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system, which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  8. If the distribution and/or use of the Program is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Program under this License
-may add an explicit geographical distribution limitation excluding
-those countries, so that distribution is permitted only in or among
-countries not thus excluded.  In such case, this License incorporates
-the limitation as if written in the body of this License.
-
-  9. The Free Software Foundation may publish revised and/or new versions
-of the General Public License from time to time.  Such new versions will
+excuse you from the conditions of this License.  If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all.  For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+  13. Use with the GNU Affero General Public License.
+
+  Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work.  The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+  14. Revised Versions of this License.
+
+  The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time.  Such new versions will
 be similar in spirit to the present version, but may differ in detail to
 address new problems or concerns.
 
-Each version is given a distinguishing version number.  If the Program
-specifies a version number of this License which applies to it and "any
-later version", you have the option of following the terms and conditions
-either of that version or of any later version published by the Free
-Software Foundation.  If the Program does not specify a version number of
-this License, you may choose any version ever published by the Free Software
-Foundation.
-
-  10. If you wish to incorporate parts of the Program into other free
-programs whose distribution conditions are different, write to the author
-to ask for permission.  For software which is copyrighted by the Free
-Software Foundation, write to the Free Software Foundation; we sometimes
-make exceptions for this.  Our decision will be guided by the two goals
-of preserving the free status of all derivatives of our free software and
-of promoting the sharing and reuse of software generally.
-
-			    NO WARRANTY
-
-  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
-FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
-OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
-PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
-OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
-MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
-TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
-PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
-REPAIR OR CORRECTION.
-
-  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
-WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
-REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
-INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
-OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
-TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
-YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
-PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
-POSSIBILITY OF SUCH DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-
-	    How to Apply These Terms to Your New Programs
+  Each version is given a distinguishing version number.  If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation.  If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+  If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+  Later license versions may give you additional or different
+permissions.  However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+  15. Disclaimer of Warranty.
+
+  THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. Limitation of Liability.
+
+  IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+  17. Interpretation of Sections 15 and 16.
+
+  If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
 
   If you develop a new program, and you want it to be of the greatest
 possible use to the public, the best way to achieve this is to make it
@@ -287,15 +628,15 @@ free software which everyone can redistribute and change under these terms.
 
   To do so, attach the following notices to the program.  It is safest
 to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least
+state the exclusion of warranty; and each file should have at least
 the "copyright" line and a pointer to where the full notice is found.
 
     <one line to give the program's name and a brief idea of what it does.>
     Copyright (C) <year>  <name of author>
 
-    This program is free software; you can redistribute it and/or modify
+    This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
-    the Free Software Foundation; either version 2 of the License, or
+    the Free Software Foundation, either version 3 of the License, or
     (at your option) any later version.
 
     This program is distributed in the hope that it will be useful,
@@ -304,37 +645,30 @@ the "copyright" line and a pointer to where the full notice is found.
     GNU General Public License for more details.
 
     You should have received a copy of the GNU General Public License
-    along with this program; if not, write to the Free Software
-    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 Also add information on how to contact you by electronic and paper mail.
 
-If the program is interactive, make it output a short notice like this
-when it starts in an interactive mode:
+  If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
 
-    Gnomovision version 69, Copyright (C) year  name of author
-    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    <program>  Copyright (C) <year>  <name of author>
+    This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
     This is free software, and you are welcome to redistribute it
     under certain conditions; type `show c' for details.
 
 The hypothetical commands `show w' and `show c' should show the appropriate
-parts of the General Public License.  Of course, the commands you use may
-be called something other than `show w' and `show c'; they could even be
-mouse-clicks or menu items--whatever suits your program.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the program, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
-  `Gnomovision' (which makes passes at compilers) written by James Hacker.
-
-  <signature of Ty Coon>, 1 April 1989
-  Ty Coon, President of Vice
-
-This General Public License does not permit incorporating your program into
-proprietary programs.  If your program is a subroutine library, you may
-consider it more useful to permit linking proprietary applications with the
-library.  If this is what you want to do, use the GNU Library General
-Public License instead of this License.
+parts of the General Public License.  Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+  You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+  The GNU General Public License does not permit incorporating your program
+into proprietary programs.  If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.  But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.

+ 1 - 1
LICENSE

@@ -1,3 +1,3 @@
-cpuminer is available under the terms of the GNU Public License version 2.
+Cgminer is available under the terms of the GNU General Public License version 3.
 
 See COPYING for details.

+ 42 - 32
Makefile.am

@@ -9,7 +9,7 @@ endif
 
 EXTRA_DIST	= example.conf m4/gnulib-cache.m4 linux-usb-cgminer \
 		  ADL_SDK/readme.txt api-example.php miner.php	\
-		  API.class API.java api-example.c
+		  API.class API.java api-example.c windows-build.txt
 
 SUBDIRS		= lib compat ccan
 
@@ -17,50 +17,60 @@ INCLUDES	= $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES)
 
 bin_PROGRAMS	= cgminer
 
-bin_SCRIPTS	= phatk110817.cl poclbm110817.cl
-
-if HAS_CPUMINE
-cgminer_SOURCES	= elist.h miner.h compat.h bench_block.h	\
-		  main.c util.c uthash.h			\
-		  ocl.c ocl.h findnonce.c findnonce.h 		\
-		  sha256_generic.c sha256_4way.c sha256_via.c	\
-		  sha256_cryptopp.c sha256_sse2_amd64.c		\
-		  sha256_sse4_amd64.c sha256_sse2_i386.c	\
-		  sha256_altivec_4way.c				\
-		  adl.c	adl.h adl_functions.h			\
-		  phatk110817.cl poclbm110817.cl \
-		  sha2.c sha2.h api.c
-else
-cgminer_SOURCES	= elist.h miner.h compat.h bench_block.h	\
-		  main.c util.c uthash.h			\
-		  ocl.c ocl.h findnonce.c findnonce.h 		\
-		  adl.c	adl.h adl_functions.h			\
-		  phatk110817.cl poclbm110817.cl \
-		  sha2.c sha2.h api.c
-endif
+bin_SCRIPTS	= *.cl
 
 cgminer_LDFLAGS	= $(PTHREAD_FLAGS)
 cgminer_LDADD	= $(DLOPEN_FLAGS) @LIBCURL_LIBS@ @JANSSON_LIBS@ @PTHREAD_LIBS@ \
 		  @OPENCL_LIBS@ @NCURSES_LIBS@ @PDCURSES_LIBS@ @WS2_LIBS@ \
+		  @UDEV_LIBS@ \
 		  @MATH_LIBS@ lib/libgnu.a ccan/libccan.a
 cgminer_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib @OPENCL_FLAGS@
 
+# common sources
+cgminer_SOURCES := cgminer.c
+
+cgminer_SOURCES	+= elist.h miner.h compat.h bench_block.h	\
+		   util.c uthash.h logging.h			\
+		   sha2.c sha2.h api.c
+
+cgminer_SOURCES	+= logging.c
+
+# GPU sources, TODO: make them selectable
+# the GPU portion extracted from original main.c
+cgminer_SOURCES += driver-opencl.h driver-opencl.c
+
+# the original GPU related sources, unchanged
+cgminer_SOURCES += ocl.c ocl.h findnonce.c findnonce.h
+cgminer_SOURCES += adl.c adl.h adl_functions.h
+cgminer_SOURCES += *.cl
+
 if HAS_CPUMINE
-if HAVE_x86_64
+# original CPU related sources, unchanged
+cgminer_SOURCES	+= \
+		  sha256_generic.c sha256_4way.c sha256_via.c	\
+		  sha256_cryptopp.c sha256_sse2_amd64.c		\
+		  sha256_sse4_amd64.c sha256_sse2_i386.c	\
+		  sha256_altivec_4way.c
+
+# the CPU portion extracted from original main.c
+cgminer_SOURCES += driver-cpu.h driver-cpu.c
+
 if HAS_YASM
+AM_CFLAGS	= -DHAS_YASM
+if HAVE_x86_64
 SUBDIRS		+= x86_64
 cgminer_LDADD	+= x86_64/libx8664.a
-AM_CFLAGS	= -DHAS_YASM
-endif
-else
-if HAS_YASM
+else # HAVE_x86_64
 SUBDIRS		+= x86_32
 cgminer_LDADD	+= x86_32/libx8632.a
-AM_CFLAGS	= -DHAS_YASM
-endif
-endif
+endif # HAVE_x86_64
+endif # HAS_YASM
+endif # HAS_CPUMINE
+
+if HAS_BITFORCE
+cgminer_SOURCES += driver-bitforce.c
 endif
 
-if USE_BITFORCE
-cgminer_SOURCES += bitforce.c
+if HAS_ICARUS
+cgminer_SOURCES += driver-icarus.c
 endif

+ 523 - 2
NEWS

@@ -1,5 +1,526 @@
-Version 2.2.0
-
+Version 2.3.3 - April 15, 2012
+
+- Don't even display that cpumining is disabled on ./configure to discourage
+people from enabling it.
+- Do a complete cgminer restart if the ATI Display Library fails, as it does on
+windows after running for some time, when fanspeed reporting fails.
+- Cache the initial arguments passed to cgminer and implement an attempted
+restart option from the settings menu.
+- Disable per-device status lines when there are more than 8 devices since
+screen output will be corrupted, enumerating them to the log output instead at
+startup.
+- Reuse Vals[] array more than W[] till they're re-initialised on the second
+sha256 cycle in poclbm kernel.
+- Minor variable alignment in poclbm kernel.
+- Make sure to disable devices with any status not being DEV_ENABLED to ensure
+that thermal cutoff code works as it was setting the status to DEV_RECOVER.
+- Re-initialising ADL simply made the driver fail since it is corruption over
+time within the windows driver that's responsible. Revert "Attempt to
+re-initialise ADL should a device that previously reported fanspeed stops
+reporting it."
+- Microoptimise poclbm kernel by ordering Val variables according to usage
+frequency.
+
+
+Version 2.3.2 - March 31, 2012
+
+- Damping small changes in hashrate so dramatically has the tendency to always
+make the hashrate underread so go back to gentle damping instead.
+- Revert the crossover of variables from Vals to W in poclbm kernel now that
+Vals are the first declared variables so they're used more frequently.
+- Vals variables appearing first in the array in poclbm is faster.
+- Change the preferred vector width to 1 for Tahiti only, not all poclbm
+kernels.
+- Use a time constant 0.63 for when large changes in hashrate are detected to
+damp change in case the large change is an aliasing artefact instead of a real
+change.
+- Only increment stale counter if the detected stales are discarded.
+- Attempt to re-initialise ADL should a device that previously reported fanspeed
+stops reporting it.
+- Move the ADL setup and clearing to separate functions and provide a reinit_adl
+function to be used when adl fails while running.
+- Use slightly more damping on the decay time function in the never-ending quest
+to smooth off the hashmeter.
+- Set the starting fanspeed to a safe and fairly neutral 50% when autofan is
+enabled.
+- Provide locking around updates of cgpu hashrates as well to prevent multiple
+threads accessing data fields on the same device.
+- Display the beginning of the new block in verbose mode in the logs.
+- Reinstate old diablo kernel variable ordering from 120222, adding only goffset
+and vector size hint. The massive variable ordering change only helped one SDK
+on
+- Change the version number on the correct kernels.
+- api.c devicecode/osinfo incorrectly swapped for json
+- Add extensive instructions on how to make a native windows build.
+- Update version numbers of poclbm and diablo kernels as their APIs have also
+changed.
+- Use global offset parameter to diablo and poclbm kernel ONLY for 1 vector
+kernels.
+- Use poclbm preferentially on Tahiti now regardless of SDK.
+- Remove unused constant passed to poclbm.
+- Clean up use of macros in poclbm and use bitselect everywhere possible.
+- Add vector type hint to diablo kernel.
+- Add worksize and vector attribute hints to the poclbm kernel.
+- Spaces for non-aligned variables in poclbm.
+- More tidying of poclbm.
+- Swap Vals and W variables where they can overlap in poclbm.
+- More tidying of poclbm.
+- Tidy up first half of poclbm.
+- Clean up use of any() by diablo and poclbm kernels.
+- Minor variable symmetry changes in poclbm.
+- Put additions on separate lines for consistency in poclbm.
+- Consolidate last use of W11 into Vals4 in poclbm.
+- Change email due to SPAM
+- api.c miner.php add a '*' to the front of all notify counters - simplifies
+future support of new counters
+- miner.php add display 'notify' command
+- Small change to help arch's without processor affinity
+- Fix bitforce compile error
+- api.c notify should report disabled devices also - of course
+- API returns the simple device history with the 'notify' command
+- code changes for supporting a simple device history
+- api.c Report an OS string in config to help with device issues
+- api.c fix Log Interval - integer in JSON
+- api.c config 'Device Code' to show list of compiled devices + README
+- api.c increase buffer size close to current code allowable limit
+- removed 8-component vector support from kernel, as this is not supported in
+CGMINER anyway
+- forgot to update kernel modification date, fixed ;)
+- reordered an addition in the kernel, which results in less instructions used
+in the GPU ISA code for GCN
+- miner.php: option for readonly or check privileged access
+- Ignore redundant-with-build options --disable-gpu, --no-adl, and --no-restart
+- miner.php: ereg_replace is DEPRECATED so use preg_replace instead
+- Make curses TUI support optional at compile-time.
+- Bugfix: AC_ARG_WITH provides withval instead of enableval
+- miner.php split devs output for different devices
+- api.c: correct error messages
+- icarus.c modify (regular) timeout warning to only be debug
+- icarus.c set the windows TODO timeout
+- Allow specifying a specific driver for --scan-serial
+- optimized nonce-check and output code for -v 2 and -v 4
+- Bugfix: Check for libudev header (not just library) in configure, and document
+optional dependency
+- Add API support for Icarus and Bitforce
+- Next API version is 1.4 (1.3 is current)
+- README/api.c add "When" the request was processed to STATUS
+- Bugfix: ZLX to read BitFORCE temp, not ZKX -.-
+- Use libudev to autodetect BitFORCE GPUs, if available
+- Use the return value of fan_autotune to set fan_optimal instead of passing it
+as a pointer.
+- Pass the lasttemp from the device we're using to adjust fanspeed in twin
+devices.
+- fix the name to 3 chars, fix the multi-icarus support
+- Bugfix: "-S auto" is the default if no -S is specified, and there is no such
+delay in using it
+- README add information missing from --scan-serial
+- Update README RPC API Version comment
+- Bugfix: Allow enabling CPU even without OpenCL support
+- Change failed-to-mine number of requested shares message to avoid segfault on
+recursive calling of quit().
+- Get rid of extra char which is just truncated in poclbm kernel.
+- only small code formatting changes
+- removed vec_step() as this could lead to errors on older SDKs
+- unified code for generating nonce in kernel and moved addition of base to the
+end -> faster
+
+Version 2.3.1 - February 24, 2012
+
+- Revert input and output code on diakgcn and phatk kernels to old style which
+worked better for older hardware and SDKs.
+- Add a vector*worksize parameter passed to those kernels to avoid one op.
+- Increase the speed of hashrate adaptation.
+- Only send out extra longpoll requests if we want longpolls.
+- API implement addpool command
+- API return the untouched Total MH also (API now version 1.3)
+- Add enable/disablepool to miner.php example and reduce font size 1pt
+
+
+Version 2.3.0 - February 23, 2012
+
+- Consider extra longpoll work items as staged_extra so as to make sure we queue
+more work if queueing regular work items as longpolls.
+- Use diablo kernel on all future SDKs for Tahiti and set preferred vector width
+to 1 on poclbm kernel only.
+- Explicitly type the constants in diakgcn kernel as uint, to be in line with
+poclbm kernel.
+- Reset all hash counters at the same time as resetting start times to get
+accurate hashrates on exiting which is mandatory for benchmarking.
+- Report thread out before it starts to avoid being flagged as sick when waiting
+for the first work item.
+- Don't disable and re-enable devices as they may recover and in the meantime
+have their status set to OFF.
+- API new commands enablepool and disablepool (version already incremented)
+- Tolerate new-format temperature readings for bitforce
+- Modify cgminer.c pool control to allow API to call it
+- Bugfix: Fix BitFORCE driver memory leak in debug logging
+- Extra byte was being unused in poclbm leading to failure on some platforms.
+- Explicitly type the constants in poclbm kernel as uint.
+- Don't save 'include' when saving the configuration
+- Allow configuration file to include another recursively
+- Use the SDK and hardware information to choose good performing default
+kernels.
+- Move phatk kernel to offset vector based nonce bases as well.
+- Add a --benchmark feature which works on a fake item indefinitely to compare
+device performance without any server or networking influence.
+- Allow writing of multiple worksizes to the configuration file.
+- Allow writing of multiple vector sizes to the configuration file.
+- Allow writing of multiple kernels to the configuration file.
+- Allow multiple different kernels to be chosen per device.
+- Allow the worksize to be set per-device.
+- Allow different vectors to be set per device.
+- If we're well below the target temperature, increase gpu engine speed back to
+maximum in case we have gotten lost between profiles during an idle period.
+- We should be setting the value of fan_optimal, not its address.
+- As all kernels will be new versions it's an opportunity to change the .bin
+format and make it simpler. Specifying bitalign is redundant and long can be l.
+- Use any() in kernel output code.
+- Put the nonce for each vector offset in advance, avoiding one extra addition
+in the kernel.
+- Reset times after all mining threads are started to make estimating hashrates
+easier at startup.
+- Bugfix: allow no-exec (NX) stack
+- Fix minor warning.
+- fix the bitforce.c code style follow 1TBS
+- fix icarus.c compile warning
+- small changes to speedup no vec for AMD 898.1 OCL runtime
+- Update licensing to GPL V3.
+- Reset the longpoll flag after it's been used once to prevent it restarting
+work again.
+- Begin import of DiabloMiner kernel.
+- Modify API debug messages to say API instead of DBG
+- When API shuts down cgminer don't kill itself
+- Don't make rolled work from the longpoll be seen as other longpoll work items.
+- API add 'privileged' command so can verify access level
+- Set the lp_sent variable under lock since there will almost always be a race
+on setting this variable, potentially leading to multiple LPs being sent out.
+- API restrict access to all non display commands by default
+- Update API version to 1.2 for new 'Log Interval'
+- API add --log Interval to 'config' reply
+- --api-allow special case 0/0 means all
+
+
+Version 2.2.7 - February 20, 2012
+
+- Send out extra longpolls when we have switched pools and the longpoll thread
+is still bound to the old one. This is particularly useful with p2pool where
+longpolls do not correlate with main bitcoin block change and would have led to
+high reject rates on failover.
+- Store whether a work item is the result of a longpoll or not in struct work
+and use it to help determine block changes directly from the work longpoll bool.
+- Keep track of when a longpoll has been sent for a pool and if the current pool
+is requesting work but has not sent a longpoll request, convert one of the work
+items to a longpoll.
+- Store the longpoll url in the pool struct and update it from the pool_active
+test in case it changes. This is to allow further changes to longpoll management
+on switching pools.
+- Re-check for a longpoll supporting pool every 30 seconds if none is found
+initially.
+- Report threads as busy waiting on getwork on startup to avoid them being
+flagged sick on startup during slow networking.
+- Allow devices that are disabled due to overheating to be flagged as recovering
+instead of disabling them and re-enable them if they're below ideal temperatures
+- Tahiti prefers worksize 64 with poclbm.
+- No need to expressly retain the opencl program now that the zero binary issue
+is fixed. This actually fixes cgminer to work with the latest SDK included with
+the ATI catalyst driver 12.2.
+- Show error code on any opencl failure status.
+- Add detection for version 898.1 SDK as well but only give SDK 2.6 warning once
+on startup instead of with each device initialisation.
+- Always use a fresh connection for longpoll as prolonged persistent connections
+can fail for many reasons.
+- Keep track of intended engine clock speed and only adjust up if it's higher
+than the last intended speed. This avoids setting the clock speed to one
+relative to a lower profile one by mistake.
+- Use gpu-memdiff on startup if an engine clockspeed is set and a memdiff value
+is set.
+- Revert "Adjust engine speed up according to performance level engine setting,
+not the current engine speed." - ineffectual.
+- Freeze the queues on all threads that are sent the pause message to prevent
+them trying to start up again with saved pings in their queues.
+- Updates to diakgcn kernel.
+- Consolidate all screen updates to the watchdog thread and touch both windows
+before refresh.
+- Curses will be disabled in clean_up so don't do it early in kill_work, and
+disable_adl so that GPU settings may be restored to normal in case shutting down
+curses leads to instability on windows.
+- Stop the mining threads before trying to kill them.
+- Plain refresh() does not give reliable screen updates so get rid of all uses
+of it.
+- First release with working diakgcn kernel.
+
+Version 2.2.6 - February 16, 2012
+
+- Provide warning on each startup about sdk 2.6
+- Fix unused warnings on win32.
+- bitforce: Simplify BFopen WIN32 ifdef/else
+- Fix initialization warning with jansson 1.3
+- bitforce: Cleanup extraneous TODO that isn't needed
+- Move tcsetattr (and new tcflush) into *nix BFopen to simplify things a bit
+- Add message explaining 2nd thread disabling for dynamic mode and how to tune
+it.
+- Move logwindow down once number of devices is known.
+- Automatically choose phatk kernel for bitalign non-gcn ATI cards, and then
+only select poclbm if SDK2.6 is detected.
+- Allow the refresh interval to be adjusted in dynamic intensity with a
+--gpu-dyninterval parameter.
+- Make curses display visible right from the beginning and fix the window sizes
+so the initial messages don't get lost once the status window is drawn.
+- The amount of work scanned can fluctuate when intensity changes and since we
+do this one cycle behind, we increment the work more than enough to prevent
+repeati
+- bitforce: Set a 30 second timeout for serial port on Windows, since the
+default is undefined
+- Use PreVal4addT1 instead of PreVal4 in poclbm kernel.
+- Import PreVal4 and PreVal0 into poclbm kernel.
+- Import more prepared constants into poclbm kernel.
+- Keep variables in one array but use Vals[] name for consistency with other
+kernel designs.
+- Replace constants that are mandatorily added in poclbm kernel with one value.
+- Remove addition of final constant before testing for result in poclbm kernel.
+- Hand optimise variable addition order.
+- Hand optimise first variable declaration order in poclbm kernel.
+- Radical reordering machine based first pass to change variables as late as
+possible, bringing their usage close together.
+- fix strcpy NULL pointer if env HOME unset.
+- bitforce: Disable automatic scanning when at least one device is specified
+manually
+- Unroll all poclbm additions to enable further optimisations.
+
+
+Version 2.2.5 - February 13, 2012
+
+- Make output buffer write only as per Diapolo's suggestion.
+- Constify nonce in poclbm.
+- Use local and group id on poclbm kernel as well.
+- Microoptimise phatk kernel on return code.
+- Adjust engine speed up according to performance level engine setting, not the
+current engine speed.
+- Try to load a binary if we've defaulted to the poclbm kernel on SDK2.6
+- Use the poclbm kernel on SDK2.6 with bitalign devices only if there is no
+binary available.
+- Further generic microoptimisations to poclbm kernel.
+- The longstanding generation of a zero sized binary appears to be due to the
+OpenCL library putting the binary in a RANDOM SLOT amongst 4 possible binary
+locations. Iterate over each of them after building from source till the real
+binary is found and use that.
+- Fix harmless warnings with -Wsign-compare to allow cgminer to build with -W.
+- Fix missing field initialisers warnings.
+- Put win32 equivalents of nanosleep and sleep into compat.h fixing sleep() for
+adl.c.
+- Restore compatibility with Jansson 1.3 and 2.0 (api.c required 2.1)
+- Modularized logging, support for priority based logging
+- Move CPU chipset specific optimization into device-cpu
+
+
+Version 2.2.4 - February 11, 2012
+
+- Fix double definition of A0 B0 to zeroA zeroB.
+- Retain cl program after successfully loading a binary image. May decrease
+failures to build kernels at startup.
+- Variable unused after this so remove setting it.
+- BFI INT patching is not necessarily true on binary loading of files and not
+true on ATI SDK2.6+. Report bitalign instead.
+- Various string fixes for reject reason.
+- Generalize --temp-cutoff and implement support for reading temperature from
+BitFORCE FPGAs
+- Change message from recovered to alive since it is used on startup as well as
+when a pool has recovered.
+- Start mining as soon as any pool is found active and rely on the watchpool
+thread to bring up other pools.
+- Delayed responses from testing pools that are down can hold up the watchdog
+thread from getting to its device testing code, leading to false detection of
+the GPU not checking in, and can substantially delay auto gpu/auto fan
+management leading to overheating. Move pool watching to its own thread.
+- Bugfix: BitFORCE index needs to be static to count correctly
+- Space out retrieval of extra work according to the number of mining threads.
+- Make shutdown more robust. Enable the input thread only after the other
+threads exist. Don't kill off the workio thread and use it to exit main() only
+if there is an unexpected problem. Use kill_work() for all anticipated shutdowns
+where possible. Remove unused thread entry.
+- Change poclbm version number.
+- One array is faster than 2 separate arrays so change to that in poclbm kernel.
+- Microoptimisations to poclbm kernel which increase throughput slightly.
+- Import diablominer kernel. Currently disabled as not working.
+- Import diapolo kernel. Currently disabled as not working.
+- Conflicting entries of cl_kernel may have been causing problems, and
+automatically chosen kernel type was not being passed on. Rename the enum to
+cl_kernels and store the chosen kernel in each clState.
+- Set cl_amd_media_ops with the BITALIGN flag and allow non-bitselect devices to
+build.
+- Allow much longer filenames for kernels to load properly.
+- Allow different kernels to be used by different devices and fix the logic fail
+of overcorrecting on last commit with !strstr.
+- Fix kernel selection process and build error.
+- queue_phatk_kernel now uses CL_SET_VARG() for base-nonce(s), too
+- added OpenCL >= 1.1 detection code, in preparation of OpenCL 1.1 global offset
+parameter support
+- Use K array explicitly to make it clear what is being added.
+- Work items have a tendency to expire at exactly the same time and we don't
+queue extra items when there are plenty in the queue, regardless of age. Allow
+extra work items to be queued if adequate time has passed since we last
+requested work even if over the limit.
+- Discard work when failover-only is enabled and the work has come from a
+different pool.
+- Missing include to build on newer mingw32.
+- Move from the thread safe localtime_r to regular localtime which is the only
+one supported on newer pthread libraries on mingw32 to make it compile with the
+newer ming. Thread safety is of no importance where localtime is used in this
+code.
+- Define in_addr_t in windows if required
+- sys/wait.h not required in windows
+- Allow API to restrict access by IP address
+- Add pool switching to example miner.php
+- Display X-Reject-Reason, when provided
+- Remove the test for whether the device is on the highest profile level before
+raising the GPU speed as it is ineffectual and may prevent raising the GPU
+speed.
+- Remove unnecessary check for opt_debug on every invocation of applog at
+LOG_DEBUG level and place the check in applog().
+
+
+Version 2.2.3 - February 6, 2012
+
+- Revert "Rewrite the convoluted get_work() function to be much simpler and roll
+work as much as possible with each new work item." This seems to cause a race on
+work in free_work(). Presumably other threads are still accessing the structure.
+
+
+Version 2.2.2 - February 6, 2012
+
+- Provide support for the submitold extension on a per-pool basis based on the
+value being detected in a longpoll.
+- Don't send a ping to a dynamic device if it's not enabled as that will just
+enable it for one pass and then disable it again.
+- Rewrite the convoluted get_work() function to be much simpler and roll work as
+much as possible with each new work item.
+- Roll as much work as possible from the work returned from a longpoll.
+- Rolling work on each loop through the mining thread serves no purpose.
+- Allow to stage more than necessary work items if we're just rolling work.
+- Replace divide_work with reuse_work function used twice.
+- Give rolled work a new ID to make sure there is no confusion in the hashtable
+lookups.
+- Remove now-defunct hash_div variables.
+- Remove unused get_dondata function.
+- Silence ADL warnings.
+- Silence unused parameter warnings.
+- Stagger the restart of every next thread per device to keep devices busy ahead
+of accessory threads per device.
+- Deprecate the --donation feature. Needlessly complex, questionable usefulness,
+depends on author's server and a central pool of some kind, and was not heavily
+adopted.
+- It's devices that report back now, not threads, update message.
+- Continue auto-management of fan and engine speeds even if a device is disabled
+for safety reasons.
+- No need to check we're highest performance level when throttling GPU engine
+speed.
+- Abstract out tests for whether work has come from a block that has been seen
+before and whether a string is from a previously seen block.
+- Probe but don't set the timeout to 15 seconds as some networks take a long
+time to timeout.
+- Remove most compiler warnings from api.c
+- Add last share's pool info in cgpu_info
+- Allow the OpenCL platform ID to be chosen with --gpu-platform.
+- Iterate over all platforms displaying their information and number of devices
+when --ndevs is called.
+- Deprecate main.c
+- Some networks can take a long time to resolve so go back to 60 second timeouts
+instead of 15.
+- Only enable curses on failure if curses is desired.
+- Fix warnings in bitforce.c
+- Bugfix: Need to open BitForce tty for read-write
+- Fix various build issues.
+- Modularize code: main.c -> device-cpu + device-gpu
+- Fix phatk kernel not working on non-bitalign capable devices (Nvidia, older
+ATI).
+- Update poclbm kernel for better performance on GCN and new SDKs with bitalign
+support when not BFI INT patching. Update phatk kernel to work properly for non
+BFI INT patched kernels, providing support for phatk to run on GCN and non-ATI
+cards.
+- Return last accepted share pool/time for devices
+- Display accepted share pool/time for CPUs
+- Bug intensity always shows GPU 0
+- Update example web miner.php to use new API commands
+
+
+Version 2.2.1 - January 30, 2012
+
+NOTE - The GPU Device reordering in 2.2.0 by default was considered a bad idea
+so the original GPU ordering is used by default again unless reordering is
+explicitly requested.
+
+- Fix bitforce failing to build into cgminer.
+- Add missing options to write config function.
+- Add a --gpu-reorder option to only reorder devices according to PCI Bus ID
+when requested.
+- Fix for midstate support being broken on pools that supported no-midstate
+work by ensuring numbers are 32 bits in sha2.c
+- Set virtual GPUs to work when ADL is disabled or all mining will occur on GPU
+0.
+- Add information about paused threads in the menu status.
+- Disable all but the first thread on GPUs in dynamic mode for better
+interactivity.
+- Set the latest network access time on share submission for --net-delay even if
+we're not delaying that submission for further network access.
+- Clear adl on exiting after probing values since it may attempt to overclock.
+- As share submission is usually staggered, and delays can be costly, submit
+shares without delay even when --net-delay is enabled.
+- Display GPU number and device name when ADL is successfully enabled on it.
+- Display GPU ordering remapping in verbose mode.
+- Don't fail in the case the number of ADL and OpenCL devices do not match, and
+do not attempt to reorder devices unless they match. Instead give a warning
+about
+- Display error codes should ADL not return ADL_OK in the more critical function
+calls.
+- Fix unused warning.
+- Fix compile warnings in api.c
+- Add extensive ADL based device info in debug mode.
+- Make --ndevs display verbose opencl information as well to make debugging
+version information easier.
+- Display information about the opencl platform with verbose enabled.
+- Explicitly check for nvidia in opencl platform strings as well.
+
+
+Version 2.2.0 - January 29, 2012
+
+NOTE: GPU Device order will change with this release with ATI GPUs as cgminer
+now can enumerate them according to their Bus ID which means the values should
+now correlate with their physical position on the motherboard.
+
+- Default to poclbm kernel on Tahiti (7970) since phatk does not work, even
+though performance is sub-standard so that at least it will mine successfully by
+default.
+- Retain cl program after every possible place we might build the program.
+- Update ADL SDK URL.
+- Fix potential overflow.
+- Map GPU devices to virtual devices in their true physical order based on
+BusNumber.
+- Change the warning that comes with failure to init cl on a device to be more
+generic and accurate.
+- Advertise longpoll support in X-Mining-Extensions
+- Detect dual GPU cards by iterating through all GPUs, finding ones without
+fanspeed and matching twins with fanspeed one bus ID apart.
+- Do not attempt to build the program that becomes the kernel twice. This could
+have been leading to failures on initialising cl.
+- Some opencl compilers have issues with no spaces after -D in the compiler
+options.
+- Allow intensity up to 14.
+- Use calloced stack memory for CompilerOptions to ensure sprintf writes to the
+beginning of the char.
+- Whitelist 79x0 cards to prefer no vectors as they perform better without.
+- Adjust fan speed gently while in the optimal range when temperature is
+drifting to minimise overshoot in either direction.
+- Detect dual GPU cards via the indirect information of - 1st card has a fan
+controller. 2nd card does not have a fan controller, cards share the same device
+name
+- Instead of using the BFI_INT patching hack on any device reporting
+cl_amd_media_ops, create a whitelist of devices that need it. This should enable
+GCN architec
+- Fixed API compiling issue on OS X
+- Add more explanation of JSON format and the 'save' command
+- Return an error if using ADL API commands when it's not available
+- Read off lpThermalControllerInfo from each ADL device.
+- Add ADL_Overdrive5_ThermalDevices_Enum interface.
 - Add API commands: config, switchpool, gpu settings, save
 - Implement socks4 proxy support.
 - Fix send() for JSON strings

+ 228 - 77
README

@@ -1,11 +1,11 @@
 
-This is a multi-threaded multi-pool CPU, GPU, and FPGA miner with ATI GPU
+This is a multi-threaded multi-pool GPU, FPGA and CPU miner with ATI GPU
 monitoring, (over)clocking and fanspeed support for bitcoin and derivative
 coins. Do not use on multiple block chains at the same time!
 
 This code is provided entirely free of charge by the programmer in his spare
-time so donations would be greatly appreciated. Please consider using the
---donation feature or donate directly to the address below.
+time so donations would be greatly appreciated. Please consider donating to the
+address below.
 
 Con Kolivas <kernel@kolivas.org>
 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ
@@ -41,18 +41,22 @@ Dependencies:
 	yasm 1.0.1+ http://yasm.tortall.net/
 	(yasm is optional, gives assembly routines for CPU mining)
 	AMD APP SDK		http://developer.amd.com/sdks/AMDAPPSDK
-	(This sdk is optional and gives support for GPU mining)
+	(This sdk is mandatory for GPU mining)
 	AMD ADL SDK		http://developer.amd.com/sdks/ADLSDK
-	(This sdk is optional and gives support for ATI GPU monitoring & clocking)
+	(This sdk is mandatory for ATI GPU monitoring & clocking)
+	libudev headers
+	(This is only required for FPGA auto-detection)
 
 CGMiner specific configuration options:
 	--enable-cpumining      Build with cpu mining support(default disabled)
 	--disable-opencl        Override detection and disable building with opencl
 	--disable-adl           Override detection and disable building with adl
+	--enable-bitforce       Compile support for BitForce FPGAs(default disabled)
+	--enable-icarus         Compile support for Icarus Board(default disabled)
 
 Basic *nix build instructions:
 	To build with GPU mining support:
-	Install AMD APP sdk, latest version - there is no official place to
+	Install AMD APP sdk, ideal version (see FAQ!) - no official place to
 	install it so just keep track of where it is if you're not installing
 	the include files and library files into the system directory.
 	(Do NOT install the ati amd sdk if you are on nvidia.)
@@ -99,10 +103,10 @@ Basic WIN32 build instructions (LIKELY OUTDATED INFO. requires mingw32):
 	make
 	./mknsis.sh
 	
-Native WIN32 build instructions (on mingw32, on windows):
+Native WIN32 build instructions (outdated, see windows-build.txt)
 	Install the Microsoft platform SDK
-	Install AMD APP sdk, latest version (only if you want GPU mining)
-	Install AMD ADL sdk, latest version (only if you want GPU monitoring)
+	Install AMD APP sdk, (if you want GPU mining)
+	Install AMD ADL sdk, (if you want GPU monitoring)
 	(Do NOT install the ati amd sdk if you are on nvidia)
 	Install mingw32
 	Install libcurl, copy libcurl.m4 into /mingw/share/aclocal
@@ -118,14 +122,19 @@ Usage instructions:  Run "cgminer --help" to see options:
 
 Usage: . [-atDdGCgIKklmpPQqrRsTouvwOchnV] 
 Options for both config file and command line:
+--api-allow         Allow API access (if enabled) only to the given list of [W:]IP[/Prefix] address[/subnets]
+                    This overrides --api-network and you must specify 127.0.0.1 if it is required
+                    W: in front of the IP address gives that address privileged access to all api commands
 --api-description   Description placed in the API status header (default: cgminer version)
 --api-listen        Listen for API requests (default: disabled)
+                    By default any command that does not just display data returns access denied
+                    See --api-allow to overcome this
 --api-network       Allow API (if enabled) to listen on/for any address (default: only 127.0.0.1)
 --api-port          Port number of miner API (default: 4028)
 --auto-fan          Automatically adjust all GPU fan speeds to maintain a target temperature
 --auto-gpu          Automatically adjust all GPU engine clock speeds to maintain a target temperature
+--benchmark         Run cgminer in benchmark mode - produces no shares
 --debug|-D          Enable debug output
---donation <arg>    Set donation percentage to cgminer author (0.0 - 99.9) (default: 0.0)
 --expiry|-E <arg>   Upper bound on how many seconds after getting work we consider a share from it stale (default: 120)
 --failover-only     Don't leak work to backup pools when primary pool is lagging
 --load-balance      Change multipool strategy from failover to even load balance
@@ -151,6 +160,7 @@ Options for both config file and command line:
 --socks-proxy <arg> Set socks4 proxy (host:port)
 --submit-stale      Submit shares even if they would normally be considered stale
 --syslog            Use system log for output messages (default: standard error)
+--temp-cutoff <arg> Temperature where a device will be automatically disabled, one value or comma separated list (default: 95)
 --text-only|-T      Disable ncurses formatted screen output
 --url|-o <arg>      URL for bitcoin JSON-RPC server
 --user|-u <arg>     Username for bitcoin JSON-RPC server
@@ -170,28 +180,39 @@ GPU only options:
 --device|-d <arg>   Select device to use, (Use repeat -d for multiple devices, default: all)
 --disable-gpu|-G    Disable GPU mining even if suitable devices exist
 --gpu-threads|-g <arg> Number of threads per GPU (1 - 10) (default: 2)
+--gpu-dyninterval <arg> Set the refresh interval in ms for GPUs using dynamic intensity (default: 7)
 --gpu-engine <arg>  GPU engine (over)clock range in Mhz - one value, range and/or comma separated list (e.g. 850-900,900,750-850)
 --gpu-fan <arg>     GPU fan percentage range - one value, range and/or comma separated list (e.g. 25-85,85,65)
 --gpu-memclock <arg> Set the GPU memory (over)clock in Mhz - one value for all or separate by commas for per card.
 --gpu-memdiff <arg> Set a fixed difference in clock speed between the GPU and memory in auto-gpu mode
 --gpu-powertune <arg> Set the GPU powertune percentage - one value for all or separate by commas for per card.
+--gpu-reorder       Attempt to reorder GPU devices according to PCI Bus ID
 --gpu-vddc <arg>    Set the GPU voltage in Volts - one value for all or separate by commas for per card.
 --intensity|-I <arg> Intensity of GPU scanning (d or -10 -> 10, default: d to maintain desktop interactivity)
+--kernel|-k <arg>   Override kernel to use (diablo, poclbm, phatk or diakgcn) - one value or comma separated
 --kernel-path|-K <arg> Specify a path to where the kernel .cl files are (default: "/usr/local/bin")
---kernel|-k <arg>   Select kernel to use (poclbm or phatk - default: auto)
+--ndevs|-n          Enumerate number of detected GPUs and exit
 --no-restart        Do not attempt to restart GPUs that hang
---temp-cutoff <arg> Temperature where a GPU device will be automatically disabled, one value or comma separated list (default: 95)
 --temp-hysteresis <arg> Set how much the temperature can fluctuate outside limits when automanaging speeds (default: 3)
 --temp-overheat <arg> Overheat temperature when automatically managing fan and GPU speeds (default: 85)
 --temp-target <arg> Target temperature when automatically managing fan and GPU speeds (default: 75)
---vectors|-v <arg>  Override detected optimal vector width (1, 2 or 4)
---worksize|-w <arg> Override detected optimal worksize (default: 0)
---ndevs|-n          Enumerate number of detected GPUs and exit
+--vectors|-v <arg>  Override detected optimal vector (1, 2 or 4) - one value or comma separated list
+--worksize|-w <arg> Override detected optimal worksize - one value or comma separated list
+
 
+FPGA mining boards (BitForce, Icarus) only options:
 
-BitForce only options:
+--scan-serial|-S <arg> Serial port to probe for FPGA mining device
 
---scan-serial|-S <arg> Serial port to probe for BitForce device
+     By default, cgminer will scan for autodetected FPGAs unless at least one
+     -S is specified. If you specify -S and still want cgminer to scan, you
+     must also use "-S auto". Note that presently, autodetection only works
+     on Linux, and might only detect one device depending on the version of
+     udev being used.
+
+     On linux <arg> is usually of the format /dev/ttyUSBn
+     On windows <arg> is usually of the format COMn
+       (where n = the correct device number for the FPGA device)
 
 
 CPU only options:
@@ -216,6 +237,11 @@ EXECUTIVE SUMMARY ON USAGE:
 After saving configuration from the menu, you do not need to give cgminer any
 arguments and it will load your configuration.
 
+Any configuration file may also contain a single
+	"include" : "filename"
+to recursively include another configuration file.
+Writing the configuration will save all settings from all files in the output.
+
 
 Single pool, regular desktop:
 
@@ -310,14 +336,14 @@ The output line shows the following:
 (5s):1713.6 (avg):1707.8 Mh/s | Q:301  A:729  R:8  HW:0  E:242%  U:22.53/m
 
 Each column is as follows:
-A 5 second exponentially decaying average hash rate
-An all time average hash rate
-The number of requested (Queued) work items from the pools
-The number of Accepted shares
-The number of Rejected shares
-The number of HardWare errors
-The Efficiency defined as number of shares returned / work item
-The Utility defined as the number of shares / minute
+5s:  A 5 second exponentially decaying average hash rate
+avg: An all time average hash rate
+Q:   The number of requested (Queued) work items from the pools
+A:   The number of Accepted shares
+R:   The number of Rejected shares
+HW:  The number of HardWare errors
+E:   The Efficiency defined as number of shares returned / work item
+U:   The Utility defined as the number of shares / minute
 
  GPU 1: 73.5C 2551RPM | 427.3/443.0Mh/s | A:8 R:0 HW:0 U:4.39/m
 
@@ -525,15 +551,35 @@ cgminer shuts down because of this.
 
 ---
 
-API
+RPC API
 
 If you start cgminer with the "--api-listen" option, it will listen on a
 simple TCP/IP socket for single string API requests from the same machine
 running cgminer and reply with a string and then close the socket each time
-Also, if you add the "--api-network" option, it will accept API requests
-from any network attached computer.
-
-The request can be either simple text or JSON.
+If you add the "--api-network" option, it will accept API requests from any
+network attached computer.
+
+You can only access the commands that reply with data in this mode.
+By default, you cannot access any privileged command that affects the miner -
+you will receive an access denied status message; see --api-allow below.
+
+You can specify IP addresses/prefixes that are only allowed to access the API
+with the "--api-allow" option e.g. --api-allow W:192.168.0.1,10.0.0/24
+will allow 192.168.0.1 or any address matching 10.0.0.*, but nothing else
+IP addresses are automatically padded with extra '.0's as needed
+An address without a /prefix is the same as specifying /32
+0/0 means all IP addresses.
+The 'W:' on the front gives that address/subnet privileged access to commands
+that modify cgminer.
+Without it those commands return an access denied status.
+Privileged access is checked in the order the IP addresses were supplied to
+"--api-allow"
+The first match determines the privilege level.
+Using the "--api-allow" option overrides the "--api-network" option if they
+are both specified
+With "--api-allow", 127.0.0.1 is not by default given access unless specified
+
+The RPC API request can be either simple text or JSON.
 
 If the request is JSON (starts with '{'), it will reply with a JSON formatted
 response, otherwise it replies with text formatted as described further below.
@@ -543,16 +589,27 @@ The JSON request format required is '{"command":"CMD","parameter":"PARAM"}'
 where "CMD" is from the "Request" column below and "PARAM" would be e.g.
 the CPU/GPU number if required.
 
-An example request in both formats:
+An example request in both formats to set GPU 0 fan to 80%:
   gpufan|0,80
   {"command":"gpufan","parameter":"0,80"}
 
 The format of each reply (unless stated otherwise) is a STATUS section
 followed by an optional detail section
 
+From API version 1.7 onwards, reply strings in JSON and Text have the
+necessary escaping as required to avoid ambiguity - they didn't before 1.7
+For JSON the 2 characters '"' and '\' are escaped with a '\' before them
+For Text the 4 characters '|' ',' '=' and '\' are escaped the same way
+
+Only user entered information will contain characters that require being
+escaped, such as Pool URL, User and Password or the Config save filename,
+when they are returned in messages or as their values by the API
+
+For API version 1.4 and later:
+
 The STATUS section is:
 
- STATUS=X,Code=N,Msg=string,Description=string|
+ STATUS=X,When=NNN,Code=N,Msg=string,Description=string|
 
   STATUS=X Where X is one of:
    W - Warning
@@ -561,6 +618,9 @@ The STATUS section is:
    E - Error
    F - Fatal (code bug)
 
+  When=NNN
+   Standard long time of request in seconds
+
   Code=N
    Each unique reply has a unique Code (See api.c - #define MSG_NNNNNN)
 
@@ -571,7 +631,9 @@ The STATUS section is:
    This defaults to the cgminer version but is the value of --api-description
    if it was specified at runtime.
 
-The list of requests and replies are:
+For API version 1.7:
+
+The list of requests - a (*) means it requires privileged access - and replies are:
 
  Request       Reply Section  Details
  -------       -------------  -------
@@ -580,11 +642,14 @@ The list of requests and replies are:
 
  config        CONFIG         Some miner configuration information:
                               GPU Count=N, <- the number of GPUs
+                              PGA Count=N, <- the number of PGAs
                               CPU Count=N, <- the number of CPUs
                               Pool Count=N, <- the number of Pools
                               ADL=X, <- Y or N if ADL is compiled in the code
                               ADL in use=X, <- Y or N if any GPU has ADL
-                              Strategy=Name| <- the current pool strategy
+                              Strategy=Name, <- the current pool strategy
+                              Log Interval=N, <- log interval (--log N)
+                              Device Code=GPU ICA | <- spaced list of compiled devices
 
  summary       SUMMARY        The status summary of the miner
                               e.g. Elapsed=NNN,Found Blocks=N,Getworks=N,...|
@@ -592,13 +657,22 @@ The list of requests and replies are:
  pools         POOLS          The status of each pool
                               e.g. Pool=0,URL=http://pool.com:6311,Status=Alive,...|
 
- devs          DEVS           Each available CPU and GPU with their details
+ devs          DEVS           Each available GPU, PGA and CPU with their details
                               e.g. GPU=0,Accepted=NN,MHS av=NNN,...,Intensity=D|
+                              Last Share Time=NNN, <- standard long time in seconds
+                               (or 0 if none) of last accepted share
+                              Last Share Pool=N, <- pool number (or -1 if none)
+                              Will not report PGAs if PGA mining is disabled
                               Will not report CPUs if CPU mining is disabled
 
  gpu|N         GPU            The details of a single GPU number N in the same
                               format and details as for DEVS
 
+ pga|N         PGA            The details of a single PGA number N in the same
+                              format and details as for DEVS
+                              This is only available if PGA mining is enabled
+                              Use 'pgacount' or 'config' first to see if there are any
+
  cpu|N         CPU            The details of a single CPU number N in the same
                               format and details as for DEVS
                               This is only available if CPU mining is enabled
@@ -606,47 +680,106 @@ The list of requests and replies are:
 
  gpucount      GPUS           Count=N| <- the number of GPUs
 
+ pgacount      PGAS           Count=N| <- the number of PGAs
+                              Always returns 0 if PGA mining is disabled
+
  cpucount      CPUS           Count=N| <- the number of CPUs
                               Always returns 0 if CPU mining is disabled
 
- switchpool|N  none           There is no reply section just the STATUS section
+ switchpool|N (*)
+               none           There is no reply section just the STATUS section
                               stating the results of switching pool N to the
                               highest priority (the pool is also enabled)
                               The Msg includes the pool URL
 
- gpuenable|N   none           There is no reply section just the STATUS section
+ enablepool|N (*)
+               none           There is no reply section just the STATUS section
+                              stating the results of enabling pool N
+                              The Msg includes the pool URL
+
+ addpool|URL,USR,PASS (*)
+               none           There is no reply section just the STATUS section
+                              stating the results of attempting to add pool N
+                              The Msg includes the pool URL
+                              Use '\\' to get a '\' and '\,' to include a comma
+                              inside URL, USR or PASS
+
+ disablepool|N (*)
+               none           There is no reply section just the STATUS section
+                              stating the results of disabling pool N
+                              The Msg includes the pool URL
+
+ removepool|N (*)
+               none           There is no reply section just the STATUS section
+                              stating the results of removing pool N
+                              The Msg includes the pool URL
+                              N.B. all details for the pool will be lost
+
+ gpuenable|N (*)
+               none           There is no reply section just the STATUS section
                               stating the results of the enable request
 
- gpudisable|N  none           There is no reply section just the STATUS section
+ gpudisable|N (*)
+               none           There is no reply section just the STATUS section
                               stating the results of the disable request
 
- gpurestart|N  none           There is no reply section just the STATUS section
+ gpurestart|N (*)
+               none           There is no reply section just the STATUS section
                               stating the results of the restart request
 
- gpuintensity|N,I  none       There is no reply section just the STATUS section
+ gpuintensity|N,I (*)
+               none           There is no reply section just the STATUS section
                               stating the results of setting GPU N intensity to I
 
- gpumem|N,V    none           There is no reply section just the STATUS section
+ gpumem|N,V (*)
+               none           There is no reply section just the STATUS section
                               stating the results of setting GPU N memoryclock to V MHz
 
- gpuengine|N,V none           There is no reply section just the STATUS section
+ gpuengine|N,V (*)
+               none           There is no reply section just the STATUS section
                               stating the results of setting GPU N clock to V MHz
 
- gpufan|N,V    none           There is no reply section just the STATUS section
+ gpufan|N,V (*)
+               none           There is no reply section just the STATUS section
                               stating the results of setting GPU N fan speed to V%
 
- gpuvddc|N,V   none           There is no reply section just the STATUS section
+ gpuvddc|N,V (*)
+               none           There is no reply section just the STATUS section
                               stating the results of setting GPU N vddc to V
 
- save|filename none           There is no reply section just the STATUS section
+ save|filename (*)
+               none           There is no reply section just the STATUS section
                               stating success or failure saving the cgminer config
                               to filename
 
- quit          none           There is no status section but just a single "BYE|"
+ quit (*)      none           There is no status section but just a single "BYE|"
                               reply before cgminer quits
 
-When you enable, disable or restart a GPU, you will also get Thread messages in
-the cgminer status window
+ notify        NOTIFY         The last status and history count of each device's problems
+                              e.g. NOTIFY=0,Name=GPU,ID=0,Last Well=1332432290,...|
+
+ privileged (*)
+               none           There is no reply section just the STATUS section
+                              stating an error if you do not have privileged access
+                              to the API and success if you do have privilege
+                              The command doesn't change anything in cgminer
+
+ pgaenable|N (*)
+               none           There is no reply section just the STATUS section
+                              stating the results of the enable request
+                              You cannot enable a PGA if its status is not WELL
+                              This is only available if PGA mining is enabled
+
+ pgadisable|N (*)
+               none           There is no reply section just the STATUS section
+                              stating the results of the disable request
+                              This is only available if PGA mining is enabled
+
+When you enable, disable or restart a GPU or PGA, you will also get Thread messages
+in the cgminer status window
+
+When you switch to a different pool to the current one, you will get a
+'Switching to URL' message in the cgminer status window
 
 Obviously, the JSON format is simply just the names as given before the '='
 with the values after the '='
@@ -675,11 +808,9 @@ api-example.c - a 'C' program to access the API (with source code)
   api-example summary 127.0.0.1 4028
 
 miner.php - an example web page to access the API
- This includes buttons to enable, disable and restart the GPUs and also to
- quit cgminer
- You must modify the 2 lines near the top to change where it looks for cgminer
-  $miner = '127.0.0.1'; # hostname or IP address
-  $port = 4028;
+ This includes buttons and inputs to attempt access to the privileged commands
+ Read the top of the file (miner.php) for details of how to tune the display
+ and also to use the option to display a multi-rig summary
 
 ---
 
@@ -711,7 +842,9 @@ Q: The CPU usage is high.
 A: The ATI drivers after 11.6 have a bug that makes them consume 100% of one
 CPU core unnecessarily so downgrade to 11.6. Binding cgminer to one CPU core on
 windows can minimise it to 100% (instead of more than one core). Driver version
-11.11 on linux and 11.12 on windows appear to have fixed this issue.
+11.11 on linux and 11.12 on windows appear to have fixed this issue. Note that
+later drivers may have an apparent return of high CPU usage. Try
+'export GPU_USE_SYNC_OBJECTS=1' on Linux before starting cgminer.
 
 Q: Can you implement feature X?
 A: I can, but time is limited, and people who donate are more likely to get
@@ -745,14 +878,6 @@ than whatever it is being packaged with. If you installed cgminer yourself,
 then you do not have a virus on your computer. Complain to your antivirus
 software company.
 
-Q: How does the donation feature work and how does it affect my shares?
-A: The donation feature is disabled by default and only does anything once
-enabled. It queries the author's website for login credentials and contributes
-up to a proportion of work to the author's account. While the overall
-accepted/rejected rates will include this work, none of these will appear in
-your own accounts. On exiting, the summary will tell you how many shares were
-contributed to the author.
-
 Q: Can you modify the display to include more of one thing in the output and
 less of another, or can you change the quiet mode or can you add yet another
 output mode?
@@ -762,20 +887,14 @@ any further.
 
 Q: Can you change the autofan/autogpu to change speeds in a different manner?
 A: The defaults are sane and safe. I'm not interested in changing them
-further. The starting fan speed is set to 85% in auto-fan mode as a safety
-precaution, but if a specific fan speed has been set, it will use that first
-before adjusting automatically.
-
-Q: The fanspeed starts at 85% with --auto-fan. Can I set it lower?
-A: The initial fanspeed will always start at 85% unless you choose your own
-value with --gpu-fan. In this case it will use the value you give it with
---gpu-fan as the first fanspeed.
+further. The starting fan speed is set to 50% in auto-fan mode as a safety
+precaution.
 
 Q: Why is my efficiency above/below 100%?
 A: Efficiency simply means how many shares you return for the amount of work
 you request. It does not correlate with efficient use of your hardware, and is
 a measure of a combination of hardware speed, block luck, pool design and other
-factors.
+factors
 
 Q: What are the best parameters to pass for X pool/hardware/device.
 A: Virtually always, the DEFAULT parameters give the best results. Most user
@@ -800,21 +919,53 @@ this time.
 
 Q: Which ATI SDK is the best for cgminer?
 A: At the moment, versions 2.4 and 2.5 work the best. If you are forced to use
-the 2.6 SDK, -v 1 -w 64 might help, along with not decreasing your memroy clock
-speed.
+the 2.6 SDK.
 
-Q: ATI 79XX support?
-A: Pending.
+Q: I have multiple SDKs installed, can I choose which one it uses?
+A: Run cgminer with the -n option and it will list all the platforms currently
+installed. Then you can tell cgminer which platform to use with --gpu-platform.
 
 Q: GUI version?
 A: No. The RPC interface makes it possible for someone else to write one
 though.
 
+Q: I'm having an issue. What debugging information should I provide?
+A: Start cgminer with your regular commands and add -D -T --verbose and provide
+the full startup output and a summary of your hardware, operating system, ATI
+driver version and ATI stream version.
+
+Q: cgminer reports no devices or only one device on startup on Linux although
+I have multiple devices and drivers+SDK installed properly?
+A: Try 'export DISPLAY=:0' before running cgminer.
+
+Q: My network gets slower and slower and then dies for a minute?
+A: Try the --net-delay option.
+
+Q: How do I tune for p2pool?
+A: p2pool has very rapid expiration of work and new blocks, it is suggested you
+decrease intensity by 1 from your optimal value, and decrease GPU threads to 1
+with -g 1.
+
+Q: Are kernels from other mining software useable in cgminer?
+A: No, the APIs are slightly different between the different software and they
+will not work.
+
+Q: I run PHP on windows to access the API with the example miner.php. Why does
+it fail when php is installed properly but I only get errors about Sockets not
+working in the logs?
+A: http://us.php.net/manual/en/sockets.installation.php
+
+Q: What is a PGA?
+A: At the moment, cgminer supports 2 FPGAs: Icarus and BitForce.
+They are Field-Programmable Gate Arrays that have been programmed to do Bitcoin
+mining. Since the acronym needs to be only 3 characters, the "Field-" part has
+been skipped.
+
 ---
 
 This code is provided entirely free of charge by the programmer in his spare
-time so donations would be greatly appreciated. Please consider using the
---donation feature or donate directly to the address below.
+time so donations would be greatly appreciated. Please consider donating to the
+address below.
 
 Con Kolivas <kernel@kolivas.org>
 15qSxP1SQcUX3o4nhkfdbgyoWEFMomJ4rZ

+ 318 - 102
adl.c

@@ -1,12 +1,26 @@
+/*
+ * Copyright 2011-2012 Con Kolivas
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
+ */
+
 #include "config.h"
 
 #if defined(HAVE_ADL) && (defined(__linux) || defined (WIN32))
 
 #include <stdio.h>
+#include <string.h>
+
+#ifdef HAVE_CURSES
 #include <curses.h>
+#endif
 
 #include "miner.h"
 #include "ADL_SDK/adl_sdk.h"
+#include "compat.h"
 
 #if defined (__linux)
 #include <dlfcn.h>
@@ -15,18 +29,24 @@
 #else /* WIN32 */
 #include <windows.h>
 #include <tchar.h>
-#define sleep(x) Sleep(x)
 #endif
 #include "adl_functions.h"
 
 bool adl_active;
+bool opt_reorder = false;
 
 int opt_hysteresis = 3;
 const int opt_targettemp = 75;
 const int opt_overheattemp = 85;
-const int opt_cutofftemp = 95;
 static pthread_mutex_t adl_lock;
 
+struct gpu_adapters {
+	int iAdapterIndex;
+	int iBusNumber;
+	int virtual_gpu;
+	int id;
+};
+
 // Memory allocation function
 static void * __stdcall ADL_Main_Memory_Alloc(int iSize)
 {
@@ -58,7 +78,6 @@ static	ADL_ADAPTER_NUMBEROFADAPTERS_GET	ADL_Adapter_NumberOfAdapters_Get;
 static	ADL_ADAPTER_ADAPTERINFO_GET	ADL_Adapter_AdapterInfo_Get;
 static	ADL_ADAPTER_ID_GET		ADL_Adapter_ID_Get;
 static	ADL_OVERDRIVE5_TEMPERATURE_GET	ADL_Overdrive5_Temperature_Get;
-static	ADL_OVERDRIVE5_THERMALDEVICES_ENUM	ADL_Overdrive5_ThermalDevices_Enum;
 static	ADL_OVERDRIVE5_CURRENTACTIVITY_GET	ADL_Overdrive5_CurrentActivity_Get;
 static	ADL_OVERDRIVE5_ODPARAMETERS_GET	ADL_Overdrive5_ODParameters_Get;
 static	ADL_OVERDRIVE5_FANSPEEDINFO_GET	ADL_Overdrive5_FanSpeedInfo_Get;
@@ -92,9 +111,22 @@ static inline void unlock_adl(void)
 	mutex_unlock(&adl_lock);
 }
 
-void init_adl(int nDevs)
+/* This looks for the twin GPU that has the fanspeed control of a non fanspeed
+ * control GPU on dual GPU cards */
+static bool fanspeed_twin(struct gpu_adl *ga, struct gpu_adl *other_ga)
 {
-	int i, j, devices = 0, last_adapter = -1, gpu = 0, dummy = 0, prev_id = -1, prev_gpu = -1;
+	if (!other_ga->has_fanspeed)
+		return false;
+	if (abs(ga->iBusNumber - other_ga->iBusNumber) != 1)
+		return false;
+	if (strcmp(ga->strAdapterName, other_ga->strAdapterName))
+		return false;
+	return true;
+}
+
+static bool prepare_adl(void)
+{
+	int result;
 
 #if defined (LINUX)
 	hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL);
@@ -107,21 +139,14 @@ void init_adl(int nDevs)
 #endif
 	if (hDLL == NULL) {
 		applog(LOG_INFO, "Unable to load ati adl library");
-		return;
-	}
-
-	if (unlikely(pthread_mutex_init(&adl_lock, NULL))) {
-		applog(LOG_ERR, "Failed to init adl_lock in init_adl");
-		return;
+		return false;
 	}
-
 	ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) GetProcAddress(hDLL,"ADL_Main_Control_Create");
 	ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) GetProcAddress(hDLL,"ADL_Main_Control_Destroy");
 	ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) GetProcAddress(hDLL,"ADL_Adapter_NumberOfAdapters_Get");
 	ADL_Adapter_AdapterInfo_Get = (ADL_ADAPTER_ADAPTERINFO_GET) GetProcAddress(hDLL,"ADL_Adapter_AdapterInfo_Get");
 	ADL_Adapter_ID_Get = (ADL_ADAPTER_ID_GET) GetProcAddress(hDLL,"ADL_Adapter_ID_Get");
 	ADL_Overdrive5_Temperature_Get = (ADL_OVERDRIVE5_TEMPERATURE_GET) GetProcAddress(hDLL,"ADL_Overdrive5_Temperature_Get");
-	ADL_Overdrive5_ThermalDevices_Enum = (ADL_OVERDRIVE5_THERMALDEVICES_ENUM) GetProcAddress(hDLL,"ADL_Overdrive5_ThermalDevices_Enum");
 	ADL_Overdrive5_CurrentActivity_Get = (ADL_OVERDRIVE5_CURRENTACTIVITY_GET) GetProcAddress(hDLL, "ADL_Overdrive5_CurrentActivity_Get");
 	ADL_Overdrive5_ODParameters_Get = (ADL_OVERDRIVE5_ODPARAMETERS_GET) GetProcAddress(hDLL, "ADL_Overdrive5_ODParameters_Get");
 	ADL_Overdrive5_FanSpeedInfo_Get = (ADL_OVERDRIVE5_FANSPEEDINFO_GET) GetProcAddress(hDLL, "ADL_Overdrive5_FanSpeedInfo_Get");
@@ -137,31 +162,51 @@ void init_adl(int nDevs)
 	if (!ADL_Main_Control_Create || !ADL_Main_Control_Destroy ||
 		!ADL_Adapter_NumberOfAdapters_Get || !ADL_Adapter_AdapterInfo_Get ||
 		!ADL_Adapter_ID_Get || !ADL_Overdrive5_Temperature_Get ||
-		!ADL_Overdrive5_ThermalDevices_Enum || !ADL_Overdrive5_CurrentActivity_Get ||
+		!ADL_Overdrive5_CurrentActivity_Get ||
 		!ADL_Overdrive5_ODParameters_Get || !ADL_Overdrive5_FanSpeedInfo_Get ||
 		!ADL_Overdrive5_FanSpeed_Get || !ADL_Overdrive5_FanSpeed_Set ||
 		!ADL_Overdrive5_ODPerformanceLevels_Get || !ADL_Overdrive5_ODPerformanceLevels_Set ||
 		!ADL_Main_Control_Refresh || !ADL_Overdrive5_PowerControl_Get ||
 		!ADL_Overdrive5_PowerControl_Set || !ADL_Overdrive5_FanSpeedToDefault_Set) {
 			applog(LOG_WARNING, "ATI ADL's API is missing");
-		return;
+		return false;
 	}
 
 	// Initialise ADL. The second parameter is 1, which means:
 	// retrieve adapter information only for adapters that are physically present and enabled in the system
-	if (ADL_Main_Control_Create (ADL_Main_Memory_Alloc, 1) != ADL_OK) {
-		applog(LOG_INFO, "ADL Initialisation Error!");
-		return ;
+	result = ADL_Main_Control_Create (ADL_Main_Memory_Alloc, 1);
+	if (result != ADL_OK) {
+		applog(LOG_INFO, "ADL Initialisation Error! Error %d!", result);
+		return false;
 	}
 
-	if (ADL_Main_Control_Refresh() != ADL_OK) {
-		applog(LOG_INFO, "ADL Refresh Error!");
-		return ;
+	result = ADL_Main_Control_Refresh();
+	if (result != ADL_OK) {
+		applog(LOG_INFO, "ADL Refresh Error! Error %d!", result);
+		return false;
+	}
+
+	return true;
+}
+
+void init_adl(int nDevs)
+{
+	int result, i, j, devices = 0, last_adapter = -1, gpu = 0, dummy = 0;
+	struct gpu_adapters adapters[MAX_GPUDEVICES], vadapters[MAX_GPUDEVICES];
+	bool devs_match = true;
+
+	if (unlikely(pthread_mutex_init(&adl_lock, NULL))) {
+		applog(LOG_ERR, "Failed to init adl_lock in init_adl");
+		return;
 	}
 
+	if (!prepare_adl())
+		return;
+
 	// Obtain the number of adapters for the system
-	if (ADL_Adapter_NumberOfAdapters_Get ( &iNumberAdapters ) != ADL_OK) {
-		applog(LOG_INFO, "Cannot get the number of adapters!\n");
+	result = ADL_Adapter_NumberOfAdapters_Get (&iNumberAdapters);
+	if (result != ADL_OK) {
+		applog(LOG_INFO, "Cannot get the number of adapters! Error %d!", result);
 		return ;
 	}
 
@@ -171,8 +216,9 @@ void init_adl(int nDevs)
 
 		lpInfo->iSize = sizeof(lpInfo);
 		// Get the AdapterInfo structure for all adapters in the system
-		if (ADL_Adapter_AdapterInfo_Get (lpInfo, sizeof (AdapterInfo) * iNumberAdapters) != ADL_OK) {
-			applog(LOG_INFO, "ADL_Adapter_AdapterInfo_Get Error!");
+		result = ADL_Adapter_AdapterInfo_Get (lpInfo, sizeof (AdapterInfo) * iNumberAdapters);
+		if (result != ADL_OK) {
+			applog(LOG_INFO, "ADL_Adapter_AdapterInfo_Get Error! Error %d", result);
 			return ;
 		}
 	} else {
@@ -180,17 +226,16 @@ void init_adl(int nDevs)
 		return;
 	}
 
+	/* Iterate over iNumberAdapters and find the lpAdapterID of real devices */
 	for (i = 0; i < iNumberAdapters; i++) {
-		struct gpu_adl *ga;
 		int iAdapterIndex;
 		int lpAdapterID;
-		ADLODPerformanceLevels *lpOdPerformanceLevels;
-		int lev;
 
 		iAdapterIndex = lpInfo[i].iAdapterIndex;
 		/* Get unique identifier of the adapter, 0 means not AMD */
-		if (ADL_Adapter_ID_Get(iAdapterIndex, &lpAdapterID) != ADL_OK) {
-			applog(LOG_INFO, "Failed to ADL_Adapter_ID_Get");
+		result = ADL_Adapter_ID_Get(iAdapterIndex, &lpAdapterID);
+		if (result != ADL_OK) {
+			applog(LOG_INFO, "Failed to ADL_Adapter_ID_Get. Error %d", result);
 			continue;
 		}
 
@@ -198,23 +243,106 @@ void init_adl(int nDevs)
 		if (lpAdapterID == last_adapter)
 			continue;
 
+		applog(LOG_DEBUG, "GPU %d "
+		       "iAdapterIndex %d "
+		       "strUDID %s "
+		       "iBusNumber %d "
+		       "iDeviceNumber %d "
+		       "iFunctionNumber %d "
+		       "iVendorID %d "
+		       "strAdapterName  %s ",
+		       devices,
+		       iAdapterIndex,
+		       lpInfo[i].strUDID,
+		       lpInfo[i].iBusNumber,
+		       lpInfo[i].iDeviceNumber,
+		       lpInfo[i].iFunctionNumber,
+		       lpInfo[i].iVendorID,
+		       lpInfo[i].strAdapterName);
+
+		adapters[devices].iAdapterIndex = iAdapterIndex;
+		adapters[devices].iBusNumber = lpInfo[i].iBusNumber;
+		adapters[devices].id = i;
+
 		/* We found a truly new adapter instead of a logical
-		* one. Now since there's no way of correlating the
-		* opencl enumerated devices and the ADL enumerated
-		* ones, we have to assume they're in the same order.*/
+		 * one. Now since there's no way of correlating the
+		 * opencl enumerated devices and the ADL enumerated
+		 * ones, we have to assume they're in the same order.*/
 		if (++devices > nDevs) {
-			applog(LOG_ERR, "ADL found more devices than opencl");
-			return;
+			applog(LOG_ERR, "ADL found more devices than opencl!");
+			applog(LOG_ERR, "There is possibly at least one GPU that doesn't support OpenCL");
+			devs_match = false;
+			devices = nDevs;
+			break;
 		}
-		gpu = devices - 1;
 		last_adapter = lpAdapterID;
 
 		if (!lpAdapterID) {
 			applog(LOG_INFO, "Adapter returns ID 0 meaning not AMD. Card order might be confused");
 			continue;
 		}
+	}
+
+	if (devices < nDevs) {
+		applog(LOG_ERR, "ADL found less devices than opencl!");
+		applog(LOG_ERR, "There is possibly more than one display attached to a GPU");
+		devs_match = false;
+	}
+
+	for (i = 0; i < nDevs; i++) {
+		vadapters[i].virtual_gpu = i;
+		vadapters[i].id = adapters[i].id;
+	}
+
+	if (!devs_match) {
+		applog(LOG_ERR, "WARNING: Number of OpenCL and ADL devices does not match!");
+		applog(LOG_ERR, "Hardware monitoring may NOT match up with devices!");
+	} else if (opt_reorder) {
+		/* Windows has some kind of random ordering for bus number IDs and
+		 * ordering the GPUs according to ascending order fixes it. Linux
+		 * has usually sequential but decreasing order instead! */
+		for (i = 0; i < devices; i++) {
+			int j, virtual_gpu;
+
+			virtual_gpu = 0;
+			for (j = 0; j < devices; j++) {
+				if (i == j)
+					continue;
+#ifdef WIN32
+				if (adapters[j].iBusNumber < adapters[i].iBusNumber)
+#else
+				if (adapters[j].iBusNumber > adapters[i].iBusNumber)
+#endif
+					virtual_gpu++;
+			}
+			if (virtual_gpu != i) {
+				applog(LOG_INFO, "Mapping device %d to GPU %d according to Bus Number order",
+				       i, virtual_gpu);
+				vadapters[virtual_gpu].virtual_gpu = i;
+				vadapters[virtual_gpu].id = adapters[i].id;
+			}
+		}
+	}
 
-		if (!gpus[gpu].enabled) {
+	for (gpu = 0; gpu < devices; gpu++) {
+		struct gpu_adl *ga;
+		int iAdapterIndex;
+		int lpAdapterID;
+		ADLODPerformanceLevels *lpOdPerformanceLevels;
+		int lev;
+
+		i = vadapters[gpu].id;
+		iAdapterIndex = lpInfo[i].iAdapterIndex;
+		gpus[gpu].virtual_gpu = vadapters[gpu].virtual_gpu;
+
+		/* Get unique identifier of the adapter, 0 means not AMD */
+		result = ADL_Adapter_ID_Get(iAdapterIndex, &lpAdapterID);
+		if (result != ADL_OK) {
+			applog(LOG_INFO, "Failed to ADL_Adapter_ID_Get. Error %d", result);
+			continue;
+		}
+
+		if (gpus[gpu].deven == DEV_DISABLED) {
 			gpus[i].gpu_engine =
 			gpus[i].gpu_memclock =
 			gpus[i].gpu_vddc =
@@ -223,6 +351,10 @@ void init_adl(int nDevs)
 			continue;
 		}
 
+		applog(LOG_INFO, "GPU %d %s hardware monitoring enabled", gpu, lpInfo[i].strAdapterName);
+		if (gpus[gpu].name)
+			free(gpus[gpu].name);
+		gpus[gpu].name = lpInfo[i].strAdapterName;
 		gpus[gpu].has_adl = true;
 		/* Flag adl as active if any card is successfully activated */
 		adl_active = true;
@@ -233,13 +365,10 @@ void init_adl(int nDevs)
 		ga->gpu = gpu;
 		ga->iAdapterIndex = iAdapterIndex;
 		ga->lpAdapterID = lpAdapterID;
+		strcpy(ga->strAdapterName, lpInfo[i].strAdapterName);
 		ga->DefPerfLev = NULL;
 		ga->twin = NULL;
 
-		ga->lpThermalControllerInfo.iSize=sizeof(ADLThermalControllerInfo);
-		if (ADL_Overdrive5_ThermalDevices_Enum(iAdapterIndex, 0, &ga->lpThermalControllerInfo) != ADL_OK)
-			applog(LOG_INFO, "Failed to ADL_Overdrive5_ThermalDevices_Enum");
-
 		ga->lpOdParameters.iSize = sizeof(ADLODParameters);
 		if (ADL_Overdrive5_ODParameters_Get(iAdapterIndex, &ga->lpOdParameters) != ADL_OK)
 			applog(LOG_INFO, "Failed to ADL_Overdrive5_ODParameters_Get");
@@ -278,7 +407,10 @@ void init_adl(int nDevs)
 			if (gpus[gpu].min_engine)
 				ga->minspeed = gpus[gpu].min_engine * 100;
 			ga->managed = true;
+			if (gpus[gpu].gpu_memdiff)
+				set_memoryclock(gpu, gpus[gpu].gpu_engine + gpus[gpu].gpu_memdiff);
 		}
+
 		if (gpus[gpu].gpu_memclock) {
 			int setmem = gpus[gpu].gpu_memclock * 100;
 
@@ -291,6 +423,7 @@ void init_adl(int nDevs)
 			ADL_Overdrive5_ODPerformanceLevels_Set(iAdapterIndex, lpOdPerformanceLevels);
 			ga->managed = true;
 		}
+
 		if (gpus[gpu].gpu_vddc) {
 			int setv = gpus[gpu].gpu_vddc * 1000;
 
@@ -303,25 +436,16 @@ void init_adl(int nDevs)
 			ADL_Overdrive5_ODPerformanceLevels_Set(iAdapterIndex, lpOdPerformanceLevels);
 			ga->managed = true;
 		}
+
 		ADL_Overdrive5_ODPerformanceLevels_Get(iAdapterIndex, 0, lpOdPerformanceLevels);
 		ga->iEngineClock = lpOdPerformanceLevels->aLevels[lev].iEngineClock;
 		ga->iMemoryClock = lpOdPerformanceLevels->aLevels[lev].iMemoryClock;
 		ga->iVddc = lpOdPerformanceLevels->aLevels[lev].iVddc;
+		ga->iBusNumber = lpInfo[i].iBusNumber;
 
-		if (ADL_Overdrive5_FanSpeedInfo_Get(iAdapterIndex, 0, &ga->lpFanSpeedInfo) != ADL_OK) {
+		if (ADL_Overdrive5_FanSpeedInfo_Get(iAdapterIndex, 0, &ga->lpFanSpeedInfo) != ADL_OK)
 			applog(LOG_INFO, "Failed to ADL_Overdrive5_FanSpeedInfo_Get");
-			/* This is our opportunity to detect the 2nd GPU in a
-			 * dual GPU device with a fan controller only on the
-			 * first */
-			if (prev_id >= 0 &&
-			    !strcmp(lpInfo[i].strAdapterName, lpInfo[prev_id].strAdapterName) &&
-			    lpInfo[i].iBusNumber == lpInfo[prev_id].iBusNumber + 1 &&
-			    gpus[prev_gpu].adl.has_fanspeed) {
-				applog(LOG_INFO, "2nd GPU of dual card device detected");
-				ga->twin = &gpus[prev_gpu].adl;
-				gpus[prev_gpu].adl.twin = ga;
-			}
-		} else
+		else
 			ga->has_fanspeed = true;
 
 		/* Save the fanspeed values as defaults in case we reset later */
@@ -346,21 +470,44 @@ void init_adl(int nDevs)
 			ga->targettemp = opt_targettemp;
 		if (!ga->overtemp)
 			ga->overtemp = opt_overheattemp;
-		if (!ga->cutofftemp)
-			ga->cutofftemp = opt_cutofftemp;
+		if (!gpus[gpu].cutofftemp)
+			gpus[gpu].cutofftemp = opt_cutofftemp;
 		if (opt_autofan) {
 			ga->autofan = true;
 			/* Set a safe starting default if we're automanaging fan speeds */
-			set_fanspeed(gpu, gpus[gpu].gpu_fan);
+			set_fanspeed(gpu, 50);
 		}
 		if (opt_autoengine) {
 			ga->autoengine = true;
 			ga->managed = true;
 		}
 		ga->lasttemp = __gpu_temp(ga);
+	}
+
+	for (gpu = 0; gpu < devices; gpu++) {
+		struct gpu_adl *ga = &gpus[gpu].adl;
+		int j;
+
+		for (j = 0; j < devices; j++) {
+			struct gpu_adl *other_ga;
+
+			if (j == gpu)
+				continue;
 
-		prev_id = i;
-		prev_gpu = gpu;
+			other_ga = &gpus[j].adl;
+
+			/* Search for twin GPUs on a single card. They will be
+			 * separated by one bus id and one will have fanspeed
+			 * while the other won't. */
+			if (!ga->has_fanspeed) {
+				if (fanspeed_twin(ga, other_ga)) {
+					applog(LOG_INFO, "Dual GPUs detected: %d and %d",
+						ga->gpu, other_ga->gpu);
+					ga->twin = other_ga;
+					other_ga->twin = ga;
+				}
+			}
+		}
 	}
 }
 
@@ -383,6 +530,7 @@ float gpu_temp(int gpu)
 	lock_adl();
 	ret = __gpu_temp(ga);
 	unlock_adl();
+	gpus[gpu].temp = ret;
 	return ret;
 }
 
@@ -534,6 +682,16 @@ int gpu_fanpercent(int gpu)
 	lock_adl();
 	ret = __gpu_fanpercent(ga);
 	unlock_adl();
+	if (unlikely(ga->has_fanspeed && ret == -1)) {
+		applog(LOG_WARNING, "GPU %d stopped reporting fanspeed due to driver corruption", gpu);
+		if (opt_restart) {
+			applog(LOG_WARNING, "Restart enabled, will restart cgminer");
+			applog(LOG_WARNING, "You can disable this with the --no-restart option");
+			app_restart();
+		}
+		applog(LOG_WARNING, "Disabling fanspeed monitoring on this device");
+		ga->has_fanspeed = false;
+	}
 	return ret;
 }
 
@@ -619,6 +777,10 @@ int set_engineclock(int gpu, int iEngineClock)
 	iEngineClock *= 100;
 	ga = &gpus[gpu].adl;
 
+	/* Keep track of intended engine clock in case the device changes
+	 * profile and drops while idle, not taking the new engine clock */
+	ga->lastengine = iEngineClock;
+
 	lev = ga->lpOdParameters.iNumberOfPerformanceLevels - 1;
 	lpOdPerformanceLevels = alloca(sizeof(ADLODPerformanceLevels) + (lev * sizeof(ADLODPerformanceLevel)));
 	lpOdPerformanceLevels->iSize = sizeof(ADLODPerformanceLevels) + sizeof(ADLODPerformanceLevel) * lev;
@@ -713,6 +875,7 @@ static void get_vddcrange(int gpu, float *imin, float *imax)
 	*imax = (float)ga->lpOdParameters.sVddc.iMax / 1000;
 }
 
+#ifdef HAVE_CURSES
 static float curses_float(const char *query)
 {
 	float ret;
@@ -723,6 +886,7 @@ static float curses_float(const char *query)
 	free(cvar);
 	return ret;
 }
+#endif
 
 int set_vddc(int gpu, float fVddc)
 {
@@ -788,8 +952,7 @@ int set_fanspeed(int gpu, int iFanSpeed)
 
 	ga = &gpus[gpu].adl;
 	if (!(ga->lpFanSpeedInfo.iFlags & (ADL_DL_FANCTRL_SUPPORTS_RPM_WRITE | ADL_DL_FANCTRL_SUPPORTS_PERCENT_WRITE ))) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "GPU %d doesn't support rpm or percent write", gpu);
+		applog(LOG_DEBUG, "GPU %d doesn't support rpm or percent write", gpu);
 		return ret;
 	}
 
@@ -799,8 +962,7 @@ int set_fanspeed(int gpu, int iFanSpeed)
 
 	lock_adl();
 	if (ADL_Overdrive5_FanSpeed_Get(ga->iAdapterIndex, 0, &ga->lpFanSpeedValue) != ADL_OK) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "GPU %d call to fanspeed get failed", gpu);
+		applog(LOG_DEBUG, "GPU %d call to fanspeed get failed", gpu);
 	}
 	if (!(ga->lpFanSpeedInfo.iFlags & ADL_DL_FANCTRL_SUPPORTS_PERCENT_WRITE)) {
 		/* Must convert speed to an RPM */
@@ -843,7 +1005,8 @@ static int set_powertune(int gpu, int iPercentage)
 	return ret;
 }
 
-static void fan_autotune(int gpu, int temp, int fanpercent, bool *fan_optimal)
+/* Returns whether the fanspeed is optimal already or not */
+static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp)
 {
 	struct cgpu_info *cgpu = &gpus[gpu];
 	struct gpu_adl *ga = &cgpu->adl;
@@ -856,19 +1019,31 @@ static void fan_autotune(int gpu, int temp, int fanpercent, bool *fan_optimal)
 	if (temp > ga->overtemp && fanpercent < iMax) {
 		/* "%%" prints a literal percent sign; a lone trailing '%' is an
 		 * invalid conversion specification in a printf-style format */
 		applog(LOG_WARNING, "Overheat detected on GPU %d, increasing fan to 100%%", gpu);
 		newpercent = iMax;
-	} else if (temp > ga->targettemp && fanpercent < top && temp >= ga->lasttemp) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
+
+		cgpu->device_last_not_well = time(NULL);
+		cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
+		cgpu->dev_over_heat_count++;
+	} else if (temp > ga->targettemp && fanpercent < top && temp >= lasttemp) {
+		applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
 		if (temp > ga->targettemp + opt_hysteresis)
 			newpercent = ga->targetfan + 10;
 		else
 			newpercent = ga->targetfan + 5;
 		if (newpercent > top)
 			newpercent = top;
-	} else if (fanpercent > bot && temp < ga->targettemp - opt_hysteresis && temp <= ga->lasttemp) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "Temperature %d degrees below target, decreasing fanspeed", opt_hysteresis);
+	} else if (fanpercent > bot && temp < ga->targettemp - opt_hysteresis && temp <= lasttemp) {
+		applog(LOG_DEBUG, "Temperature %d degrees below target, decreasing fanspeed", opt_hysteresis);
 		newpercent = ga->targetfan - 1;
+	} else {
+		/* We're in the optimal range, make minor adjustments if the
+		 * temp is still drifting */
+		if (fanpercent > bot && temp < lasttemp && lasttemp < ga->targettemp) {
+			applog(LOG_DEBUG, "Temperature dropping while in target range, decreasing fanspeed");
+			newpercent = ga->targetfan - 1;
+		} else if (fanpercent < top && temp > lasttemp && temp > ga->targettemp - opt_hysteresis) {
+			applog(LOG_DEBUG, "Temperature rising while in target range, increasing fanspeed");
+			newpercent = ga->targetfan + 1;
+		}
 	}
 
 	if (newpercent > iMax)
@@ -876,15 +1051,16 @@ static void fan_autotune(int gpu, int temp, int fanpercent, bool *fan_optimal)
 	else if (newpercent < iMin)
 		newpercent = iMin;
 	if (newpercent != fanpercent) {
-		fan_optimal = false;
 		applog(LOG_INFO, "Setting GPU %d fan percentage to %d", gpu, newpercent);
 		set_fanspeed(gpu, newpercent);
+		return false;
 	}
+	return true;
 }
 
-void gpu_autotune(int gpu, bool *enable)
+void gpu_autotune(int gpu, enum dev_enable *denable)
 {
-	int temp, fanpercent, engine, newengine, twintemp;
+	int temp, fanpercent, engine, newengine, twintemp = 0;
 	bool fan_optimal = true;
 	struct cgpu_info *cgpu;
 	struct gpu_adl *ga;
@@ -904,48 +1080,69 @@ void gpu_autotune(int gpu, bool *enable)
 
 	if (temp && fanpercent >= 0 && ga->autofan) {
 		if (!ga->twin)
-			fan_autotune(gpu, temp, fanpercent, &fan_optimal);
-		else {
+			fan_optimal = fan_autotune(gpu, temp, fanpercent, ga->lasttemp);
+		else if (ga->autofan && (ga->has_fanspeed || !ga->twin->autofan)) {
+			/* On linked GPUs, we autotune the fan only once, based
+			 * on the highest temperature from either GPUs */
 			int hightemp, fan_gpu;
+			int lasttemp;
 
-			if (twintemp > temp)
+			if (twintemp > temp) {
+				lasttemp = ga->twin->lasttemp;
 				hightemp = twintemp;
-			else
+			} else {
+				lasttemp = ga->lasttemp;
 				hightemp = temp;
+			}
 			if (ga->has_fanspeed)
 				fan_gpu = gpu;
 			else
 				fan_gpu = ga->twin->gpu;
-			fan_autotune(fan_gpu, hightemp, fanpercent, &fan_optimal);
+			fan_optimal = fan_autotune(fan_gpu, hightemp, fanpercent, lasttemp);
 		}
 	}
 
 	if (engine && ga->autoengine) {
-		if (temp > ga->cutofftemp) {
+		if (temp > cgpu->cutofftemp) {
 			applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
-			*enable = false;
+			*denable = DEV_RECOVER;
 			newengine = ga->minspeed;
+
+			cgpu->device_last_not_well = time(NULL);
+			cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
+			cgpu->dev_thermal_cutoff_count++;
 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
 			newengine = ga->minspeed;
-		/* Only try to tune engine speed if the current performance level is at max */
-		} else if ((ga->lpActivity.iCurrentPerformanceLevel == ga->lpOdParameters.iNumberOfPerformanceLevels - 1) &&
-			   (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal)) {
-			if (opt_debug)
-				applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
+
+			cgpu->device_last_not_well = time(NULL);
+			cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
+			cgpu->dev_over_heat_count++;
+		} else if (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
+			applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
 			newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
-		} else if ((ga->lpActivity.iCurrentPerformanceLevel == ga->lpOdParameters.iNumberOfPerformanceLevels - 1) &&
-			   (temp < ga->targettemp && engine < ga->maxspeed)) {
-			if (opt_debug)
-				applog(LOG_DEBUG, "Temperature below target, increasing clock speed");
-			newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
+			/* Only try to tune engine speed up if this GPU is not disabled */
+		} else if (temp < ga->targettemp && engine < ga->maxspeed && *denable == DEV_ENABLED) {
+			applog(LOG_DEBUG, "Temperature below target, increasing clock speed");
+			if (temp < ga->targettemp - opt_hysteresis)
+				newengine = ga->maxspeed;
+			else
+				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
+		} else if (temp < ga->targettemp && *denable == DEV_RECOVER && opt_restart) {
+			applog(LOG_NOTICE, "Device recovered to temperature below target, re-enabling");
+			*denable = DEV_ENABLED;
 		}
 
 		if (newengine > ga->maxspeed)
 			newengine = ga->maxspeed;
 		else if (newengine < ga->minspeed)
 			newengine = ga->minspeed;
-		if (newengine != engine) {
+
+		/* Adjust engine clock speed if it's lower, or if it's higher
+		 * but higher than the last intended value as well as the
+		 * current speed, to avoid setting the engine clock speed to
+		 * a speed relative to a lower profile during idle periods. */
+		if (newengine < engine || (newengine > engine && newengine > ga->lastengine)) {
 			newengine /= 100;
 			applog(LOG_INFO, "Setting GPU %d engine clock to %d", gpu, newengine);
 			set_engineclock(gpu, newengine);
@@ -980,6 +1177,7 @@ void set_defaultengine(int gpu)
 	unlock_adl();
 }
 
+#ifdef HAVE_CURSES
 void change_autosettings(int gpu)
 {
 	struct gpu_adl *ga = &gpus[gpu].adl;
@@ -988,7 +1186,7 @@ void change_autosettings(int gpu)
 
 	wlogprint("Target temperature: %d\n", ga->targettemp);
 	wlogprint("Overheat temperature: %d\n", ga->overtemp);
-	wlogprint("Cutoff temperature: %d\n", ga->cutofftemp);
+	wlogprint("Cutoff temperature: %d\n", gpus[gpu].cutofftemp);
 	wlogprint("Toggle [F]an auto [G]PU auto\nChange [T]arget [O]verheat [C]utoff\n");
 	wlogprint("Or press any other key to continue\n");
 	input = getch();
@@ -1025,7 +1223,7 @@ void change_autosettings(int gpu)
 		if (val <= ga->overtemp || val > 200)
 			wlogprint("Invalid temperature");
 		else
-			ga->cutofftemp = val;
+			gpus[gpu].cutofftemp = val;
 	}
 }
 
@@ -1136,8 +1334,20 @@ updated:
 	sleep(1);
 	goto updated;
 }
+#endif
 
-void clear_adl(nDevs)
+/* Hand lpInfo back through ADL_Main_Memory_Free(), destroy the ADL main
+ * control and close the dynamic library handle hDLL. */
+static void free_adl(void)
+{
+	ADL_Main_Memory_Free((void **)&lpInfo);
+	ADL_Main_Control_Destroy();
+
+#ifdef LINUX
+	dlclose(hDLL);
+#else
+	FreeLibrary(hDLL);
+#endif
+}
+
+void clear_adl(int nDevs)
 {
 	struct gpu_adl *ga;
 	int i;
@@ -1157,15 +1367,21 @@ void clear_adl(nDevs)
 		ADL_Overdrive5_FanSpeed_Set(ga->iAdapterIndex, 0, &ga->DefFanSpeedValue);
 		ADL_Overdrive5_FanSpeedToDefault_Set(ga->iAdapterIndex, 0);
 	}
-
-	ADL_Main_Memory_Free ( (void **)&lpInfo );
-	ADL_Main_Control_Destroy ();
+	adl_active = false;
 	unlock_adl();
+	free_adl();
+}
 
-#if defined (LINUX)
-	dlclose(hDLL);
-#else
-	FreeLibrary(hDLL);
-#endif
+/* Tear ADL down and prepare it again while holding the ADL lock. If
+ * prepare_adl() reports failure, ADL is flagged inactive and a warning
+ * is logged. */
+void reinit_adl(void)
+{
+	lock_adl();
+
+	free_adl();
+	if (!prepare_adl()) {
+		adl_active = false;
+		applog(LOG_WARNING, "Attempt to re-initialise ADL has failed, disabling");
+	}
+
+	unlock_adl();
 }
 #endif /* HAVE_ADL */

+ 4 - 2
adl.h

@@ -2,10 +2,10 @@
 #define __ADL_H__
 #ifdef HAVE_ADL
 bool adl_active;
+bool opt_reorder;
 int opt_hysteresis;
 const int opt_targettemp;
 const int opt_overheattemp;
-const int opt_cutofftemp;
 void init_adl(int nDevs);
 float gpu_temp(int gpu);
 int gpu_engineclock(int gpu);
@@ -17,12 +17,14 @@ int gpu_fanpercent(int gpu);
 bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vddc,
 	       int *activity, int *fanspeed, int *fanpercent, int *powertune);
 void change_gpusettings(int gpu);
-void gpu_autotune(int gpu, bool *enable);
+void gpu_autotune(int gpu, enum dev_enable *denable);
 void clear_adl(int nDevs);
+void reinit_adl(void);
 #else /* HAVE_ADL */
 #define adl_active (0)
 static inline void init_adl(int nDevs) {}
 static inline void change_gpusettings(int gpu) { }
 static inline void clear_adl(int nDevs) {}
+static inline void reinit_adl(void) {}
 #endif
 #endif

+ 1 - 1
api-example.c

@@ -3,7 +3,7 @@
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
+ * Software Foundation; either version 3 of the License, or (at your option)
  * any later version.  See COPYING for more details.
  */
 

File diff suppressed because it is too large
+ 573 - 69
api.c


File diff suppressed because it is too large
+ 93 - 978
cgminer.c


+ 10 - 1
compat.h

@@ -2,10 +2,16 @@
 #define __COMPAT_H__
 
 #ifdef WIN32
+#include <time.h>
+#include <pthread.h>
 
 #include <windows.h>
 
-static inline void sleep(int secs)
+/* Minimal nanosleep() shim for Windows built on Sleep(), which is given a
+ * millisecond count. Both the seconds and the nanoseconds part of the
+ * request are honoured (previously tv_sec was dropped, so any request of
+ * one second or more returned far too early). Sub-millisecond remainders
+ * are truncated and the second argument is ignored. */
+static inline void nanosleep(struct timespec *rgtp, void *__unused)
+{
+	Sleep((DWORD)(rgtp->tv_sec * 1000 + rgtp->tv_nsec / 1000000));
+}
+/* sleep() shim for Windows: scales whole seconds by 1000 to get the
+ * value handed to Sleep(). */
+static inline void sleep(unsigned int secs)
 {
+	const unsigned int msecs = secs * 1000;
+
+	Sleep(msecs);
 }
@@ -28,6 +34,9 @@ typedef unsigned int uint;
 typedef long suseconds_t;
 #endif
 
+#define PTH(thr) ((thr)->pth.p)
+#else
+#define PTH(thr) ((thr)->pth)
 #endif /* WIN32 */
 
 #endif /* __COMPAT_H__ */

+ 98 - 19
configure.ac

@@ -1,8 +1,8 @@
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 m4_define([v_maj], [2])
-m4_define([v_min], [1])
-m4_define([v_mic], [2])
+m4_define([v_min], [3])
+m4_define([v_mic], [3])
 ##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##--##
 m4_define([v_ver], [v_maj.v_min.v_mic])
 m4_define([lt_rev], m4_eval(v_maj + v_min))
@@ -16,7 +16,7 @@ AC_INIT([cgminer], [v_ver], [kernel@kolivas.org])
 AC_PREREQ(2.59)
 AC_CANONICAL_SYSTEM
 AC_CONFIG_MACRO_DIR([m4])
-AC_CONFIG_SRCDIR([main.c])
+AC_CONFIG_SRCDIR([cgminer.c])
 AC_CONFIG_HEADERS([config.h])
 
 AM_INIT_AUTOMAKE([foreign])
@@ -112,6 +112,8 @@ if test "x$ATISTREAMSDKROOT" != x; then
 	OPENCL_LIBS="-L$ATISTREAMSDKROOT/lib/$ATI_STREAM_ARCH_DIR $OPENCL_LIBS"
 fi
 
+cpumining="no"
+
 AC_ARG_ENABLE([cpumining],
 	[AC_HELP_STRING([--enable-cpumining],[Build with cpu mining support(default disabled)])],
 	[cpumining=$enableval]
@@ -121,6 +123,8 @@ if test "x$cpumining" = xyes; then
 fi
 AM_CONDITIONAL([HAS_CPUMINE], [test x$cpumining = xyes])
 
+opencl="yes"
+
 AC_ARG_ENABLE([opencl],
 	[AC_HELP_STRING([--disable-opencl],[Override detection and disable building with opencl])],
 	[opencl=$enableval]
@@ -181,25 +185,58 @@ else
 	DLOPEN_FLAGS=""
 fi
 
-bitforce=yes
+bitforce="no"
+
 AC_ARG_ENABLE([bitforce],
-	[AC_HELP_STRING([--disable-bitforce],[Don't compile support for BitForce FPGAs])],
+	[AC_HELP_STRING([--enable-bitforce],[Compile support for BitForce FPGAs(default disabled)])],
 	[bitforce=$enableval]
-)
-if test "x$bitforce" != xno; then
-	AC_DEFINE([USE_BITFORCE], [1], [Defined to 1 if BitForce support is wanted.])
+	)
+if test "x$bitforce" = xyes; then
+	AC_DEFINE([USE_BITFORCE], [1], [Defined to 1 if BitForce support is wanted])
+fi
+AM_CONDITIONAL([HAS_BITFORCE], [test x$bitforce = xyes])
+
+icarus="no"
+
+AC_ARG_ENABLE([icarus],
+	[AC_HELP_STRING([--enable-icarus],[Compile support for Icarus (default disabled)])],
+	[icarus=$enableval]
+	)
+if test "x$icarus" = xyes; then
+	AC_DEFINE([USE_ICARUS], [1], [Defined to 1 if Icarus support is wanted])
 fi
+AM_CONDITIONAL([HAS_ICARUS], [test x$icarus = xyes])
 
-AC_SEARCH_LIBS(addstr, ncurses pdcurses, ,
-        AC_MSG_ERROR([Could not find curses library - please install libncurses-dev or pdcurses-dev]))
 
-AC_CHECK_LIB(ncurses, addstr, NCURSES_LIBS=-lncurses)
-AC_CHECK_LIB(pdcurses, addstr, PDCURSES_LIBS=-lpdcurses)
+dnl curses TUI detection. $curses is auto by default, or yes/no when the
+dnl user passes --with-curses / --without-curses. The library name shown in
+dnl the summary is derived with sed rather than a bash-only substring
+dnl expansion, so the generated configure script also runs under a plain
+dnl POSIX /bin/sh.
+curses="auto"
+
+AC_ARG_WITH([curses],
+	[AC_HELP_STRING([--without-curses],[Compile support for curses TUI (default enabled)])],
+	[curses=$withval]
+	)
+if test "x$curses" = "xno"; then
+	cursesmsg='User specified --without-curses. TUI support DISABLED'
+else
+	AC_SEARCH_LIBS(addstr, ncurses pdcurses, [
+		curses=yes
+		curseslib=`echo "$ac_cv_search_addstr" | sed 's/^-l//'`
+		cursesmsg="FOUND: $curseslib"
+		AC_DEFINE([HAVE_CURSES], [1], [Defined to 1 if curses TUI support is wanted])
+	], [
+		if test "x$curses" = "xyes"; then
+			AC_MSG_ERROR([Could not find curses library - please install libncurses-dev or pdcurses-dev (or configure --without-curses)])
+		else
+			AC_MSG_WARN([Could not find curses library - if you want a TUI, install libncurses-dev or pdcurses-dev])
+			curses=no
+			cursesmsg='NOT FOUND. TUI support DISABLED'
+		fi
+	])
+fi
+
 
+AM_CONDITIONAL([HAVE_CURSES], [test x$curses = xyes])
 AM_CONDITIONAL([WANT_JANSSON], [test x$request_jansson = xtrue])
 AM_CONDITIONAL([HAVE_WINDOWS], [test x$have_win32 = xtrue])
 AM_CONDITIONAL([HAVE_x86_64], [test x$have_x86_64 = xtrue])
-AM_CONDITIONAL([USE_BITFORCE], [test x$bitforce != xno])
 
 if test x$request_jansson = xtrue
 then
@@ -244,6 +281,26 @@ fi
 
 AM_CONDITIONAL([HAS_YASM], [test x$has_yasm = xtrue])
 
+dnl libudev is only probed when BitForce support is enabled. Start from no
+dnl so that the HAVE_LIBUDEV conditional below is false when the probe is
+dnl skipped, instead of comparing an unset variable against no.
+libudev=no
+if test "x$bitforce" != xno; then
+	AC_ARG_WITH([libudev], [AC_HELP_STRING([--without-libudev], [Autodetect FPGAs using libudev (default enabled)])],
+		[libudev=$withval],
+		[libudev=auto]
+		)
+	if test "x$libudev" != "xno"; then
+		AC_CHECK_HEADER([libudev.h],[
+			libudev=yes
+			UDEV_LIBS=-ludev
+			AC_DEFINE([HAVE_LIBUDEV], [1], [Defined to 1 if libudev is wanted])
+		], [
+			if test "x$libudev" = "xyes"; then
+				AC_MSG_ERROR([libudev not found])
+			fi
+			libudev=no
+		])
+	fi
+fi
+AM_CONDITIONAL([HAVE_LIBUDEV], [test x$libudev != xno])
+
 PKG_PROG_PKG_CONFIG()
 
 PKG_CHECK_MODULES([LIBCURL], [libcurl >= 7.15.6], [AC_DEFINE([CURL_HAS_SOCKOPT], [1], [Defined if version of curl supports sockopts.])],
@@ -288,6 +345,12 @@ fi
 
 AC_DEFINE_UNQUOTED([CGMINER_PREFIX], ["$prefix/bin"], [Path to cgminer install])
 
+AC_DEFINE_UNQUOTED([PHATK_KERNNAME], ["phatk120223"], [Filename for phatk kernel])
+AC_DEFINE_UNQUOTED([POCLBM_KERNNAME], ["poclbm120327"], [Filename for poclbm kernel])
+AC_DEFINE_UNQUOTED([DIAKGCN_KERNNAME], ["diakgcn120223"], [Filename for diakgcn kernel])
+AC_DEFINE_UNQUOTED([DIABLO_KERNNAME], ["diablo120328"], [Filename for diablo kernel])
+
+
 AC_SUBST(OPENCL_LIBS)
 AC_SUBST(OPENCL_FLAGS)
 AC_SUBST(JANSSON_LIBS)
@@ -298,6 +361,7 @@ AC_SUBST(NCURSES_LIBS)
 AC_SUBST(PDCURSES_LIBS)
 AC_SUBST(WS2_LIBS)
 AC_SUBST(MATH_LIBS)
+AC_SUBST(UDEV_LIBS)
 
 AC_CONFIG_FILES([
 	Makefile
@@ -322,20 +386,20 @@ echo
 echo "Configuration Options Summary:"
 echo
 
-echo "  BitForce.FPGAs.......: $bitforce"
+echo "  curses.TUI...........: $cursesmsg"
 
 if test "x$opencl" != xno; then
 	if test $found_opencl = 1; then
 		echo "  OpenCL...............: FOUND. GPU mining support enabled"
 	else
 		echo "  OpenCL...............: NOT FOUND. GPU mining support DISABLED"
-		if test "x$cpumining$bitforce" = xnono; then
+		if test "x$cpumining$bitforce$icarus" = xnonono; then
 			AC_MSG_ERROR([No mining configured in])
 		fi
 	fi
 else
 	echo "  OpenCL...............: Detection overrided. GPU mining support DISABLED"
-	if test "x$cpumining$bitforce" = xnono; then
+	if test "x$cpumining$bitforce$icarus" = xnonono; then
 		AC_MSG_ERROR([No mining configured in])
 	fi
 fi
@@ -351,19 +415,34 @@ else
 fi
 
 echo
+if test "x$bitforce" = xyes; then
+	echo "  BitForce.FPGAs.......: Enabled"
+else
+	echo "  BitForce.FPGAs.......: Disabled"
+fi
+
+if test "x$icarus" = xyes; then
+	echo "  Icarus.FPGAs.........: Enabled"
+else
+	echo "  Icarus.FPGAs.........: Disabled"
+fi
+
+if test "x$bitforce" != xno; then
+	echo "  libudev.detection....: $libudev"
+fi
 
 if test "x$cpumining" = xyes; then
+	echo
 	echo "  CPU Mining...........: Enabled"
 	echo "  ASM.(for CPU mining).: $has_yasm"
-else
-	echo "  CPU Mining...........: Disabled"
 fi
+
 echo
 echo "Compilation............: make (or gmake)"
 echo "  CPPFLAGS.............: $CPPFLAGS"
 echo "  CFLAGS...............: $CFLAGS"
 echo "  LDFLAGS..............: $LDFLAGS $PTHREAD_FLAGS"
-echo "  LDADD................: $DLOPEN_FLAGS $LIBCURL_LIBS $JANSSON_LIBS $PTHREAD_LIBS $OPENCL_LIBS $NCURSES_LIBS $PDCURSES_LIBS $WS2_LIBS $MATH_LIBS"
+echo "  LDADD................: $DLOPEN_FLAGS $LIBCURL_LIBS $JANSSON_LIBS $PTHREAD_LIBS $OPENCL_LIBS $NCURSES_LIBS $PDCURSES_LIBS $WS2_LIBS $MATH_LIBS $UDEV_LIBS"
 echo
 echo "Installation...........: make install (as root if needed, with 'su' or 'sudo')"
 echo "  prefix...............: $prefix"

+ 1274 - 0
diablo120328.cl

@@ -0,0 +1,1274 @@
+/*
+ *  DiabloMiner - OpenCL miner for BitCoin
+ *  Copyright (C) 2010, 2011, 2012 Patrick McFarland <diablod3@gmail.com>
+ *
+ *  This program is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation, either version 3 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef VECTORS4 /* z is the word type used throughout the kernel; its width is picked by the VECTORS4 / VECTORS2 build defines */
+	typedef uint4 z; /* four 32-bit unsigned lanes */
+#elif defined(VECTORS2)
+	typedef uint2 z; /* two 32-bit unsigned lanes */
+#else
+	typedef uint z; /* scalar fallback: one 32-bit unsigned value */
+#endif
+
+#ifdef BITALIGN /* when BITALIGN is defined, rotate via the cl_amd_media_ops intrinsic instead of the generic built-in */
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+#define Zrotr(a, b) amd_bitalign((z)a, (z)a, (z)(32 - b)) /* NOTE(review): presumably equivalent to the rotate() fallback below (left-rotate by b) -- confirm against the cl_amd_media_ops spec */
+#else
+#define Zrotr(a, b) rotate((z)a, (z)b) /* OpenCL rotate() rotates LEFT, so despite the "rotr" name b is a left-rotation count; a and b are cast unparenthesised, so pass simple or parenthesised arguments */
+#endif
+
+#ifdef BFI_INT /* NOTE(review): amd_bytealign is presumably a placeholder that the host rewrites into BFI_INT after compilation -- confirm against the host-side kernel patching code */
+#define ZCh(a, b, c) amd_bytealign(a, b, c)
+#define ZMa(a, b, c) amd_bytealign((c ^ a), (b), (a))
+#else
+#define ZCh(a, b, c) bitselect((z)c, (z)b, (z)a) /* per bit: take b where a is set, otherwise c */
+#define ZMa(a, b, c) bitselect((z)a, (z)b, (z)c ^ (z)a) /* per bit: take b where a and c differ, otherwise a */
+#endif
+
+#define ZR25(n) ((Zrotr((n), 25) ^ Zrotr((n), 14) ^ ((n) >> 3U))) /* XOR of Zrotr by 25, Zrotr by 14 and a right shift by 3; presumably SHA-256 small sigma0 -- TODO confirm */
+#define ZR15(n) ((Zrotr((n), 15) ^ Zrotr((n), 13) ^ ((n) >> 10U))) /* XOR of Zrotr by 15, Zrotr by 13 and a right shift by 10; presumably SHA-256 small sigma1 -- TODO confirm */
+#define ZR26(n) ((Zrotr((n), 26) ^ Zrotr((n), 21) ^ Zrotr((n), 7))) /* XOR of Zrotr by 26, 21 and 7; presumably SHA-256 big Sigma1 -- TODO confirm */
+#define ZR30(n) ((Zrotr((n), 30) ^ Zrotr((n), 19) ^ Zrotr((n), 10))) /* XOR of Zrotr by 30, 19 and 10; presumably SHA-256 big Sigma0 -- TODO confirm */
+
+__kernel
+__attribute__((vec_type_hint(z)))
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+void search(
+#ifndef GOFFSET
+    const z base,
+#endif
+    const uint PreVal4_state0, const uint PreVal4_state0_k7,
+    const uint PreVal4_T1,
+    const uint W18, const uint W19,
+    const uint W16, const uint W17,
+    const uint W16_plus_K16, const uint W17_plus_K17,
+    const uint W31, const uint W32,
+    const uint d1, const uint b1, const uint c1,
+    const uint h1, const uint f1, const uint g1,
+    const uint c1_plus_k5, const uint b1_plus_k6,
+    const uint state0, const uint state1, const uint state2, const uint state3,
+    const uint state4, const uint state5, const uint state6, const uint state7,
+    __global uint * output)
+{
+
+  z ZA[930];
+
+#ifdef GOFFSET
+	const z Znonce = (uint)(get_global_id(0));
+#else
+	const z Znonce = base + (uint)(get_global_id(0));
+#endif
+
+    ZA[15] = Znonce + PreVal4_state0;
+    
+    ZA[16] = (ZCh(ZA[15], b1, c1) + d1) + ZR26(ZA[15]);
+    ZA[26] = Znonce + PreVal4_T1;
+    
+    ZA[27] = ZMa(f1, g1, ZA[26]) + ZR30(ZA[26]);
+    ZA[17] = ZA[16] + h1;
+    
+    ZA[19] = (ZCh(ZA[17], ZA[15], b1) + c1_plus_k5) + ZR26(ZA[17]);
+    ZA[28] = ZA[27] + ZA[16];
+    
+    ZA[548] = ZMa(ZA[26], f1, ZA[28]) + ZR30(ZA[28]);
+    ZA[20] = ZA[19] + g1;
+    
+    ZA[22] = (ZCh(ZA[20], ZA[17], ZA[15]) + b1_plus_k6) + ZR26(ZA[20]);
+    ZA[29] = ZA[548] + ZA[19];
+    
+    ZA[549] = ZMa(ZA[28], ZA[26], ZA[29]) + ZR30(ZA[29]);
+    ZA[23] = ZA[22] + f1;
+    
+    ZA[24] = ZCh(ZA[23], ZA[20], ZA[17]) + ZR26(ZA[23]);
+    ZA[180] = Znonce + PreVal4_state0_k7;
+    ZA[30] = ZA[549] + ZA[22];
+    
+    ZA[31] = ZMa(ZA[29], ZA[28], ZA[30]) + ZR30(ZA[30]);
+    ZA[181] = ZA[180] + ZA[24];
+    
+    ZA[182] = ZA[181] + ZA[26];
+    ZA[183] = ZA[181] + ZA[31];
+    ZA[18] = ZA[17] + 0xd807aa98U;
+    
+    ZA[186] = (ZCh(ZA[182], ZA[23], ZA[20]) + ZA[18]) + ZR26(ZA[182]);
+    ZA[184] = ZMa(ZA[30], ZA[29], ZA[183]) + ZR30(ZA[183]);
+    
+    ZA[187] = ZA[186] + ZA[28];
+    ZA[188] = ZA[186] + ZA[184];
+    ZA[21] = ZA[20] + 0x12835b01U;
+    
+    ZA[191] = (ZCh(ZA[187], ZA[182], ZA[23]) + ZA[21]) + ZR26(ZA[187]);
+    ZA[189] = ZMa(ZA[183], ZA[30], ZA[188]) + ZR30(ZA[188]);
+    
+    ZA[192] = ZA[191] + ZA[29];
+    ZA[193] = ZA[191] + ZA[189];
+    ZA[25] = ZA[23] + 0x243185beU;
+    
+    ZA[196] = (ZCh(ZA[192], ZA[187], ZA[182]) + ZA[25]) + ZR26(ZA[192]);
+    ZA[194] = ZMa(ZA[188], ZA[183], ZA[193]) + ZR30(ZA[193]);
+    
+    ZA[197] = ZA[196] + ZA[30];
+    ZA[198] = ZA[196] + ZA[194];
+    ZA[185] = ZA[182] + 0x550c7dc3U;
+    
+    ZA[201] = (ZCh(ZA[197], ZA[192], ZA[187]) + ZA[185]) + ZR26(ZA[197]);
+    ZA[199] = ZMa(ZA[193], ZA[188], ZA[198]) + ZR30(ZA[198]);
+    
+    ZA[202] = ZA[201] + ZA[183];
+    ZA[203] = ZA[201] + ZA[199];
+    ZA[190] = ZA[187] + 0x72be5d74U;
+    
+    ZA[206] = (ZCh(ZA[202], ZA[197], ZA[192]) + ZA[190]) + ZR26(ZA[202]);
+    ZA[204] = ZMa(ZA[198], ZA[193], ZA[203]) + ZR30(ZA[203]);
+    
+    ZA[207] = ZA[206] + ZA[188];
+    ZA[208] = ZA[206] + ZA[204];
+    ZA[195] = ZA[192] + 0x80deb1feU;
+    
+    ZA[211] = (ZCh(ZA[207], ZA[202], ZA[197]) + ZA[195]) + ZR26(ZA[207]);
+    ZA[209] = ZMa(ZA[203], ZA[198], ZA[208]) + ZR30(ZA[208]);
+    
+    ZA[212] = ZA[193] + ZA[211];
+    ZA[213] = ZA[211] + ZA[209];
+    ZA[200] = ZA[197] + 0x9bdc06a7U;
+    
+    ZA[216] = (ZCh(ZA[212], ZA[207], ZA[202]) + ZA[200]) + ZR26(ZA[212]);
+    ZA[214] = ZMa(ZA[208], ZA[203], ZA[213]) + ZR30(ZA[213]);
+    
+    ZA[217] = ZA[198] + ZA[216];
+    ZA[218] = ZA[216] + ZA[214];
+    ZA[205] = ZA[202] + 0xc19bf3f4U;
+    
+    ZA[220] = (ZCh(ZA[217], ZA[212], ZA[207]) + ZA[205]) + ZR26(ZA[217]);
+    ZA[219] = ZMa(ZA[213], ZA[208], ZA[218]) + ZR30(ZA[218]);
+    
+    ZA[222] = ZA[203] + ZA[220];
+    ZA[223] = ZA[220] + ZA[219];
+    ZA[210] = ZA[207] + W16_plus_K16;
+    
+    ZA[226] = (ZCh(ZA[222], ZA[217], ZA[212]) + ZA[210]) + ZR26(ZA[222]);
+    ZA[225] = ZMa(ZA[218], ZA[213], ZA[223]) + ZR30(ZA[223]);
+    
+    ZA[0] = ZR25(Znonce) + W18;
+    ZA[228] = ZA[226] + ZA[225];
+    ZA[227] = ZA[208] + ZA[226];
+    ZA[215] = ZA[212] + W17_plus_K17;
+    
+    ZA[231] = (ZCh(ZA[227], ZA[222], ZA[217]) + ZA[215]) + ZR26(ZA[227]);
+    ZA[229] = ZMa(ZA[223], ZA[218], ZA[228]) + ZR30(ZA[228]);
+    ZA[1] = ZA[0] + 0x0fc19dc6U;
+    
+    ZA[232] = ZA[213] + ZA[231];
+    ZA[233] = ZA[231] + ZA[229];
+    ZA[221] = ZA[217] + ZA[1];
+    ZA[32] = Znonce + W19;
+    
+    ZA[236] = (ZCh(ZA[232], ZA[227], ZA[222]) + ZA[221]) + ZR26(ZA[232]);
+    ZA[234] = ZMa(ZA[228], ZA[223], ZA[233]) + ZR30(ZA[233]);
+    ZA[33] = ZA[32] + 0x240ca1ccU;
+    
+    ZA[3] = ZR15(ZA[0]) + 0x80000000U;
+    ZA[238] = ZA[236] + ZA[234];
+    ZA[237] = ZA[218] + ZA[236];
+    ZA[224] = ZA[222] + ZA[33];
+    
+    ZA[241] = (ZCh(ZA[237], ZA[232], ZA[227]) + ZA[224]) + ZR26(ZA[237]);
+    ZA[239] = ZMa(ZA[233], ZA[228], ZA[238]) + ZR30(ZA[238]);
+    ZA[4] = ZA[3] + 0x2de92c6fU;
+    
+    ZA[35] = ZR15(ZA[32]);
+    ZA[243] = ZA[241] + ZA[239];
+    ZA[242] = ZA[223] + ZA[241];
+    ZA[230] = ZA[227] + ZA[4];
+    
+    ZA[246] = (ZCh(ZA[242], ZA[237], ZA[232]) + ZA[230]) + ZR26(ZA[242]);
+    ZA[244] = ZMa(ZA[238], ZA[233], ZA[243]) + ZR30(ZA[243]);
+    ZA[36] = ZA[35] + 0x4a7484aaU;
+    
+    ZA[7] = ZR15(ZA[3]) + 0x00000280U;
+    ZA[248] = ZA[246] + ZA[244];
+    ZA[247] = ZA[228] + ZA[246];
+    ZA[235] = ZA[232] + ZA[36];
+    
+    ZA[251] = (ZCh(ZA[247], ZA[242], ZA[237]) + ZA[235]) + ZR26(ZA[247]);
+    ZA[249] = ZMa(ZA[243], ZA[238], ZA[248]) + ZR30(ZA[248]);
+    ZA[8] = ZA[7] + 0x5cb0a9dcU;
+    
+    ZA[38] = ZR15(ZA[35]) + W16;
+    ZA[253] = ZA[251] + ZA[249];
+    ZA[252] = ZA[233] + ZA[251];
+    ZA[240] = ZA[237] + ZA[8];
+    
+    ZA[256] = (ZCh(ZA[252], ZA[247], ZA[242]) + ZA[240]) + ZR26(ZA[252]);
+    ZA[254] = ZMa(ZA[248], ZA[243], ZA[253]) + ZR30(ZA[253]);
+    ZA[40] = ZA[38] + 0x76f988daU;
+    
+    ZA[10] = ZR15(ZA[7]) + W17;
+    ZA[258] = ZA[256] + ZA[254];
+    ZA[257] = ZA[238] + ZA[256];
+    ZA[245] = ZA[242] + ZA[40];
+    
+    ZA[261] = (ZCh(ZA[257], ZA[252], ZA[247]) + ZA[245]) + ZR26(ZA[257]);
+    ZA[259] = ZMa(ZA[253], ZA[248], ZA[258]) + ZR30(ZA[258]);
+    ZA[13] = ZA[10] + 0x983e5152U;
+    
+    ZA[43] = ZR15(ZA[38]) + ZA[0];
+    ZA[263] = ZA[261] + ZA[259];
+    ZA[262] = ZA[243] + ZA[261];
+    ZA[250] = ZA[247] + ZA[13];
+    
+    ZA[266] = (ZCh(ZA[262], ZA[257], ZA[252]) + ZA[250]) + ZR26(ZA[262]);
+    ZA[264] = ZMa(ZA[258], ZA[253], ZA[263]) + ZR30(ZA[263]);
+    ZA[11] = ZR15(ZA[10]);
+    ZA[45] = ZA[43] + 0xa831c66dU;
+    
+    ZA[52] = ZA[11] + ZA[32];
+    ZA[267] = ZA[248] + ZA[266];
+    ZA[255] = ZA[252] + ZA[45];
+    ZA[268] = ZA[266] + ZA[264];
+    
+    ZA[271] = (ZCh(ZA[267], ZA[262], ZA[257]) + ZA[255]) + ZR26(ZA[267]);
+    ZA[269] = ZMa(ZA[263], ZA[258], ZA[268]) + ZR30(ZA[268]);
+    ZA[54] = ZA[52] + 0xb00327c8U;
+    
+    ZA[48] = ZR15(ZA[43]) + ZA[3];
+    ZA[273] = ZA[271] + ZA[269];
+    ZA[272] = ZA[253] + ZA[271];
+    ZA[260] = ZA[257] + ZA[54];
+    
+    ZA[276] = (ZCh(ZA[272], ZA[267], ZA[262]) + ZA[260]) + ZR26(ZA[272]);
+    ZA[274] = ZMa(ZA[268], ZA[263], ZA[273]) + ZR30(ZA[273]);
+    ZA[49] = ZA[48] + 0xbf597fc7U;
+    
+    ZA[61] = ZR15(ZA[52]) + ZA[35];
+    ZA[278] = ZA[276] + ZA[274];
+    ZA[277] = ZA[258] + ZA[276];
+    ZA[265] = ZA[262] + ZA[49];
+    
+    ZA[281] = (ZCh(ZA[277], ZA[272], ZA[267]) + ZA[265]) + ZR26(ZA[277]);
+    ZA[279] = ZMa(ZA[273], ZA[268], ZA[278]) + ZR30(ZA[278]);
+    ZA[62] = ZA[61] + 0xc6e00bf3U;
+    
+    ZA[53] = ZR15(ZA[48]) + ZA[7];
+    ZA[283] = ZA[281] + ZA[279];
+    ZA[282] = ZA[263] + ZA[281];
+    ZA[270] = ZA[267] + ZA[62];
+    
+    ZA[286] = (ZCh(ZA[282], ZA[277], ZA[272]) + ZA[270]) + ZR26(ZA[282]);
+    ZA[284] = ZMa(ZA[278], ZA[273], ZA[283]) + ZR30(ZA[283]);
+    ZA[39] = ZA[38] + 0x00A00055U;
+    ZA[55] = ZA[53] + 0xd5a79147U;
+    
+    ZA[66] = ZR15(ZA[61]) + ZA[39];
+    ZA[288] = ZA[286] + ZA[284];
+    ZA[287] = ZA[268] + ZA[286];
+    ZA[275] = ZA[272] + ZA[55];
+    
+    ZA[291] = (ZCh(ZA[287], ZA[282], ZA[277]) + ZA[275]) + ZR26(ZA[287]);
+    ZA[289] = ZMa(ZA[283], ZA[278], ZA[288]) + ZR30(ZA[288]);
+    ZA[12] = ZA[10] + W31;
+    ZA[68] = ZA[66] + 0x06ca6351U;
+    
+    ZA[67] = ZR15(ZA[53]) + ZA[12];
+    ZA[293] = ZA[291] + ZA[289];
+    ZA[292] = ZA[273] + ZA[291];
+    ZA[280] = ZA[277] + ZA[68];
+    
+    ZA[296] = (ZCh(ZA[292], ZA[287], ZA[282]) + ZA[280]) + ZR26(ZA[292]);
+    ZA[294] = ZMa(ZA[288], ZA[283], ZA[293]) + ZR30(ZA[293]);
+    ZA[2] = ZR25(ZA[0]);
+    ZA[69] = ZA[67] + 0x14292967U;
+    ZA[44] = ZA[43] + W32;
+    
+    ZA[75] = ZR15(ZA[66]) + ZA[44];
+    ZA[298] = ZA[296] + ZA[294];
+    ZA[297] = ZA[278] + ZA[296];
+    ZA[285] = ZA[282] + ZA[69];
+    ZA[5] = ZA[2] + W17;
+    
+    ZA[301] = (ZCh(ZA[297], ZA[292], ZA[287]) + ZA[285]) + ZR26(ZA[297]);
+    ZA[299] = ZMa(ZA[293], ZA[288], ZA[298]) + ZR30(ZA[298]);
+    ZA[56] = ZA[52] + ZA[5];
+    ZA[76] = ZA[75] + 0x27b70a85U;
+    
+    ZA[34] = ZR25(ZA[32]) + ZA[0];
+    ZA[70] = ZR15(ZA[67]) + ZA[56];
+    ZA[302] = ZA[283] + ZA[301];
+    ZA[303] = ZA[301] + ZA[299];
+    ZA[290] = ZA[287] + ZA[76];
+    
+    ZA[306] = (ZCh(ZA[302], ZA[297], ZA[292]) + ZA[290]) + ZR26(ZA[302]);
+    ZA[304] = ZMa(ZA[298], ZA[293], ZA[303]) + ZR30(ZA[303]);
+    ZA[6] = ZR25(ZA[3]);
+    ZA[77] = ZA[70] + 0x2e1b2138U;
+    ZA[50] = ZA[34] + ZA[48];
+    
+    ZA[78] = ZR15(ZA[75]) + ZA[50];
+    ZA[308] = ZA[306] + ZA[304];
+    ZA[307] = ZA[288] + ZA[306];
+    ZA[295] = ZA[292] + ZA[77];
+    ZA[41] = ZA[32] + ZA[6];
+    
+    ZA[311] = (ZCh(ZA[307], ZA[302], ZA[297]) + ZA[295]) + ZR26(ZA[307]);
+    ZA[309] = ZMa(ZA[303], ZA[298], ZA[308]) + ZR30(ZA[308]);
+    ZA[63] = ZA[41] + ZA[61];
+    ZA[85] = ZA[78] + 0x4d2c6dfcU;
+    
+    ZA[37] = ZR25(ZA[35]) + ZA[3];
+    ZA[79] = ZR15(ZA[70]) + ZA[63];
+    ZA[312] = ZA[293] + ZA[311];
+    ZA[313] = ZA[311] + ZA[309];
+    ZA[300] = ZA[297] + ZA[85];
+    
+    ZA[316] = (ZCh(ZA[312], ZA[307], ZA[302]) + ZA[300]) + ZR26(ZA[312]);
+    ZA[314] = ZMa(ZA[308], ZA[303], ZA[313]) + ZR30(ZA[313]);
+    ZA[9] = ZR25(ZA[7]);
+    ZA[86] = ZA[79] + 0x53380d13U;
+    ZA[57] = ZA[37] + ZA[53];
+    
+    ZA[87] = ZR15(ZA[78]) + ZA[57];
+    ZA[318] = ZA[316] + ZA[314];
+    ZA[317] = ZA[298] + ZA[316];
+    ZA[305] = ZA[302] + ZA[86];
+    ZA[46] = ZA[35] + ZA[9];
+    
+    ZA[321] = (ZCh(ZA[317], ZA[312], ZA[307]) + ZA[305]) + ZR26(ZA[317]);
+    ZA[319] = ZMa(ZA[313], ZA[308], ZA[318]) + ZR30(ZA[318]);
+    ZA[71] = ZA[46] + ZA[66];
+    ZA[92] = ZA[87] + 0x650a7354U;
+    
+    ZA[42] = ZR25(ZA[38]) + ZA[7];
+    ZA[88] = ZR15(ZA[79]) + ZA[71];
+    ZA[322] = ZA[303] + ZA[321];
+    ZA[323] = ZA[321] + ZA[319];
+    ZA[310] = ZA[307] + ZA[92];
+    
+    ZA[326] = (ZCh(ZA[322], ZA[317], ZA[312]) + ZA[310]) + ZR26(ZA[322]);
+    ZA[324] = ZMa(ZA[318], ZA[313], ZA[323]) + ZR30(ZA[323]);
+    ZA[14] = ZR25(ZA[10]);
+    ZA[93] = ZA[88] + 0x766a0abbU;
+    ZA[72] = ZA[42] + ZA[67];
+    
+    ZA[94] = ZR15(ZA[87]) + ZA[72];
+    ZA[328] = ZA[326] + ZA[324];
+    ZA[327] = ZA[308] + ZA[326];
+    ZA[315] = ZA[312] + ZA[93];
+    ZA[51] = ZA[38] + ZA[14];
+    
+    ZA[331] = (ZCh(ZA[327], ZA[322], ZA[317]) + ZA[315]) + ZR26(ZA[327]);
+    ZA[329] = ZMa(ZA[323], ZA[318], ZA[328]) + ZR30(ZA[328]);
+    ZA[80] = ZA[51] + ZA[75];
+    ZA[100] = ZA[94] + 0x81c2c92eU;
+    
+    ZA[47] = ZR25(ZA[43]) + ZA[10];
+    ZA[95] = ZR15(ZA[88]) + ZA[80];
+    ZA[332] = ZA[313] + ZA[331];
+    ZA[333] = ZA[331] + ZA[329];
+    ZA[320] = ZA[317] + ZA[100];
+    
+    ZA[336] = (ZCh(ZA[332], ZA[327], ZA[322]) + ZA[320]) + ZR26(ZA[332]);
+    ZA[334] = ZMa(ZA[328], ZA[323], ZA[333]) + ZR30(ZA[333]);
+    ZA[81] = ZA[47] + ZA[70];
+    ZA[101] = ZA[95] + 0x92722c85U;
+    
+    ZA[58] = ZR25(ZA[52]) + ZA[43];
+    ZA[102] = ZR15(ZA[94]) + ZA[81];
+    ZA[337] = ZA[318] + ZA[336];
+    ZA[338] = ZA[336] + ZA[334];
+    ZA[325] = ZA[322] + ZA[101];
+    
+    ZA[341] = (ZCh(ZA[337], ZA[332], ZA[327]) + ZA[325]) + ZR26(ZA[337]);
+    ZA[339] = ZMa(ZA[333], ZA[328], ZA[338]) + ZR30(ZA[338]);
+    ZA[89] = ZA[58] + ZA[78];
+    ZA[108] = ZA[102] + 0xa2bfe8a1U;
+    
+    ZA[59] = ZR25(ZA[48]) + ZA[52];
+    ZA[103] = ZR15(ZA[95]) + ZA[89];
+    ZA[342] = ZA[323] + ZA[341];
+    ZA[343] = ZA[341] + ZA[339];
+    ZA[330] = ZA[327] + ZA[108];
+    
+    ZA[346] = (ZCh(ZA[342], ZA[337], ZA[332]) + ZA[330]) + ZR26(ZA[342]);
+    ZA[344] = ZMa(ZA[338], ZA[333], ZA[343]) + ZR30(ZA[343]);
+    ZA[90] = ZA[59] + ZA[79];
+    ZA[109] = ZA[103] + 0xa81a664bU;
+    
+    ZA[64] = ZR25(ZA[61]) + ZA[48];
+    ZA[110] = ZR15(ZA[102]) + ZA[90];
+    ZA[347] = ZA[328] + ZA[346];
+    ZA[348] = ZA[346] + ZA[344];
+    ZA[335] = ZA[332] + ZA[109];
+    
+    ZA[351] = (ZCh(ZA[347], ZA[342], ZA[337]) + ZA[335]) + ZR26(ZA[347]);
+    ZA[349] = ZMa(ZA[343], ZA[338], ZA[348]) + ZR30(ZA[348]);
+    ZA[60] = ZR25(ZA[53]);
+    ZA[116] = ZA[110] + 0xc24b8b70U;
+    ZA[96] = ZA[87] + ZA[64];
+    
+    ZA[111] = ZR15(ZA[103]) + ZA[96];
+    ZA[353] = ZA[351] + ZA[349];
+    ZA[352] = ZA[333] + ZA[351];
+    ZA[340] = ZA[337] + ZA[116];
+    ZA[65] = ZA[60] + ZA[61];
+    
+    ZA[356] = (ZCh(ZA[352], ZA[347], ZA[342]) + ZA[340]) + ZR26(ZA[352]);
+    ZA[354] = ZMa(ZA[348], ZA[343], ZA[353]) + ZR30(ZA[353]);
+    ZA[97] = ZA[88] + ZA[65];
+    ZA[117] = ZA[111] + 0xc76c51a3U;
+    
+    ZA[73] = ZR25(ZA[66]) + ZA[53];
+    ZA[118] = ZR15(ZA[110]) + ZA[97];
+    ZA[357] = ZA[338] + ZA[356];
+    ZA[358] = ZA[356] + ZA[354];
+    ZA[345] = ZA[342] + ZA[117];
+    
+    ZA[361] = (ZCh(ZA[357], ZA[352], ZA[347]) + ZA[345]) + ZR26(ZA[357]);
+    ZA[359] = ZMa(ZA[353], ZA[348], ZA[358]) + ZR30(ZA[358]);
+    ZA[104] = ZA[73] + ZA[94];
+    ZA[124] = ZA[118] + 0xd192e819U;
+    
+    ZA[74] = ZR25(ZA[67]) + ZA[66];
+    ZA[119] = ZR15(ZA[111]) + ZA[104];
+    ZA[362] = ZA[343] + ZA[361];
+    ZA[363] = ZA[361] + ZA[359];
+    ZA[350] = ZA[347] + ZA[124];
+    
+    ZA[366] = (ZCh(ZA[362], ZA[357], ZA[352]) + ZA[350]) + ZR26(ZA[362]);
+    ZA[364] = ZMa(ZA[358], ZA[353], ZA[363]) + ZR30(ZA[363]);
+    ZA[105] = ZA[74] + ZA[95];
+    ZA[125] = ZA[119] + 0xd6990624U;
+    
+    ZA[82] = ZR25(ZA[75]) + ZA[67];
+    ZA[126] = ZR15(ZA[118]) + ZA[105];
+    ZA[367] = ZA[348] + ZA[366];
+    ZA[368] = ZA[366] + ZA[364];
+    ZA[355] = ZA[352] + ZA[125];
+    
+    ZA[371] = (ZCh(ZA[367], ZA[362], ZA[357]) + ZA[355]) + ZR26(ZA[367]);
+    ZA[369] = ZMa(ZA[363], ZA[358], ZA[368]) + ZR30(ZA[368]);
+    ZA[112] = ZA[102] + ZA[82];
+    ZA[132] = ZA[126] + 0xf40e3585U;
+    
+    ZA[83] = ZR25(ZA[70]) + ZA[75];
+    ZA[127] = ZR15(ZA[119]) + ZA[112];
+    ZA[372] = ZA[353] + ZA[371];
+    ZA[373] = ZA[371] + ZA[369];
+    ZA[360] = ZA[357] + ZA[132];
+    
+    ZA[376] = (ZCh(ZA[372], ZA[367], ZA[362]) + ZA[360]) + ZR26(ZA[372]);
+    ZA[374] = ZMa(ZA[368], ZA[363], ZA[373]) + ZR30(ZA[373]);
+    ZA[113] = ZA[103] + ZA[83];
+    ZA[133] = ZA[127] + 0x106aa070U;
+    
+    ZA[84] = ZR25(ZA[78]) + ZA[70];
+    ZA[134] = ZR15(ZA[126]) + ZA[113];
+    ZA[377] = ZA[358] + ZA[376];
+    ZA[378] = ZA[376] + ZA[374];
+    ZA[365] = ZA[362] + ZA[133];
+    
+    ZA[381] = (ZCh(ZA[377], ZA[372], ZA[367]) + ZA[365]) + ZR26(ZA[377]);
+    ZA[379] = ZMa(ZA[373], ZA[368], ZA[378]) + ZR30(ZA[378]);
+    ZA[120] = ZA[110] + ZA[84];
+    ZA[140] = ZA[134] + 0x19a4c116U;
+    
+    ZA[91] = ZR25(ZA[79]) + ZA[78];
+    ZA[135] = ZR15(ZA[127]) + ZA[120];
+    ZA[382] = ZA[363] + ZA[381];
+    ZA[383] = ZA[381] + ZA[379];
+    ZA[370] = ZA[367] + ZA[140];
+    
+    ZA[386] = (ZCh(ZA[382], ZA[377], ZA[372]) + ZA[370]) + ZR26(ZA[382]);
+    ZA[384] = ZMa(ZA[378], ZA[373], ZA[383]) + ZR30(ZA[383]);
+    ZA[121] = ZA[111] + ZA[91];
+    ZA[141] = ZA[135] + 0x1e376c08U;
+    
+    ZA[98] = ZR25(ZA[87]) + ZA[79];
+    ZA[142] = ZR15(ZA[134]) + ZA[121];
+    ZA[387] = ZA[368] + ZA[386];
+    ZA[388] = ZA[386] + ZA[384];
+    ZA[375] = ZA[372] + ZA[141];
+    
+    ZA[391] = (ZCh(ZA[387], ZA[382], ZA[377]) + ZA[375]) + ZR26(ZA[387]);
+    ZA[389] = ZMa(ZA[383], ZA[378], ZA[388]) + ZR30(ZA[388]);
+    ZA[128] = ZA[118] + ZA[98];
+    ZA[147] = ZA[142] + 0x2748774cU;
+    
+    ZA[99] = ZR25(ZA[88]) + ZA[87];
+    ZA[143] = ZR15(ZA[135]) + ZA[128];
+    ZA[392] = ZA[373] + ZA[391];
+    ZA[393] = ZA[391] + ZA[389];
+    ZA[380] = ZA[377] + ZA[147];
+    
+    ZA[396] = (ZCh(ZA[392], ZA[387], ZA[382]) + ZA[380]) + ZR26(ZA[392]);
+    ZA[394] = ZMa(ZA[388], ZA[383], ZA[393]) + ZR30(ZA[393]);
+    ZA[129] = ZA[119] + ZA[99];
+    ZA[148] = ZA[143] + 0x34b0bcb5U;
+    
+    ZA[106] = ZR25(ZA[94]) + ZA[88];
+    ZA[149] = ZR15(ZA[142]) + ZA[129];
+    ZA[397] = ZA[378] + ZA[396];
+    ZA[398] = ZA[396] + ZA[394];
+    ZA[385] = ZA[382] + ZA[148];
+    
+    ZA[401] = (ZCh(ZA[397], ZA[392], ZA[387]) + ZA[385]) + ZR26(ZA[397]);
+    ZA[399] = ZMa(ZA[393], ZA[388], ZA[398]) + ZR30(ZA[398]);
+    ZA[136] = ZA[126] + ZA[106];
+    ZA[153] = ZA[149] + 0x391c0cb3U;
+    
+    ZA[107] = ZR25(ZA[95]) + ZA[94];
+    ZA[150] = ZR15(ZA[143]) + ZA[136];
+    ZA[402] = ZA[383] + ZA[401];
+    ZA[403] = ZA[401] + ZA[399];
+    ZA[390] = ZA[387] + ZA[153];
+    
+    ZA[406] = (ZCh(ZA[402], ZA[397], ZA[392]) + ZA[390]) + ZR26(ZA[402]);
+    ZA[404] = ZMa(ZA[398], ZA[393], ZA[403]) + ZR30(ZA[403]);
+    ZA[137] = ZA[127] + ZA[107];
+    ZA[154] = ZA[150] + 0x4ed8aa4aU;
+    
+    ZA[114] = ZR25(ZA[102]) + ZA[95];
+    ZA[155] = ZR15(ZA[149]) + ZA[137];
+    ZA[407] = ZA[388] + ZA[406];
+    ZA[408] = ZA[406] + ZA[404];
+    ZA[395] = ZA[392] + ZA[154];
+    
+    ZA[411] = (ZCh(ZA[407], ZA[402], ZA[397]) + ZA[395]) + ZR26(ZA[407]);
+    ZA[409] = ZMa(ZA[403], ZA[398], ZA[408]) + ZR30(ZA[408]);
+    ZA[144] = ZA[134] + ZA[114];
+    ZA[159] = ZA[155] + 0x5b9cca4fU;
+    
+    ZA[115] = ZR25(ZA[103]) + ZA[102];
+    ZA[156] = ZR15(ZA[150]) + ZA[144];
+    ZA[412] = ZA[393] + ZA[411];
+    ZA[413] = ZA[411] + ZA[409];
+    ZA[400] = ZA[397] + ZA[159];
+    
+    ZA[416] = (ZCh(ZA[412], ZA[407], ZA[402]) + ZA[400]) + ZR26(ZA[412]);
+    ZA[414] = ZMa(ZA[408], ZA[403], ZA[413]) + ZR30(ZA[413]);
+    ZA[145] = ZA[135] + ZA[115];
+    ZA[160] = ZA[156] + 0x682e6ff3U;
+    
+    ZA[122] = ZR25(ZA[110]) + ZA[103];
+    ZA[161] = ZR15(ZA[155]) + ZA[145];
+    ZA[417] = ZA[398] + ZA[416];
+    ZA[418] = ZA[416] + ZA[414];
+    ZA[405] = ZA[402] + ZA[160];
+    
+    ZA[421] = (ZCh(ZA[417], ZA[412], ZA[407]) + ZA[405]) + ZR26(ZA[417]);
+    ZA[419] = ZMa(ZA[413], ZA[408], ZA[418]) + ZR30(ZA[418]);
+    ZA[151] = ZA[142] + ZA[122];
+    ZA[165] = ZA[161] + 0x748f82eeU;
+    
+    ZA[123] = ZR25(ZA[111]) + ZA[110];
+    ZA[162] = ZR15(ZA[156]) + ZA[151];
+    ZA[422] = ZA[403] + ZA[421];
+    ZA[423] = ZA[421] + ZA[419];
+    ZA[410] = ZA[407] + ZA[165];
+    
+    ZA[426] = (ZCh(ZA[422], ZA[417], ZA[412]) + ZA[410]) + ZR26(ZA[422]);
+    ZA[424] = ZMa(ZA[418], ZA[413], ZA[423]) + ZR30(ZA[423]);
+    ZA[152] = ZA[143] + ZA[123];
+    ZA[166] = ZA[162] + 0x78a5636fU;
+    
+    ZA[130] = ZR25(ZA[118]) + ZA[111];
+    ZA[167] = ZR15(ZA[161]) + ZA[152];
+    ZA[427] = ZA[408] + ZA[426];
+    ZA[428] = ZA[426] + ZA[424];
+    ZA[415] = ZA[412] + ZA[166];
+    
+    ZA[431] = (ZCh(ZA[427], ZA[422], ZA[417]) + ZA[415]) + ZR26(ZA[427]);
+    ZA[429] = ZMa(ZA[423], ZA[418], ZA[428]) + ZR30(ZA[428]);
+    ZA[157] = ZA[149] + ZA[130];
+    ZA[170] = ZA[167] + 0x84c87814U;
+    
+    ZA[131] = ZR25(ZA[119]) + ZA[118];
+    ZA[168] = ZR15(ZA[162]) + ZA[157];
+    ZA[432] = ZA[413] + ZA[431];
+    ZA[433] = ZA[431] + ZA[429];
+    ZA[420] = ZA[417] + ZA[170];
+    
+    ZA[436] = (ZCh(ZA[432], ZA[427], ZA[422]) + ZA[420]) + ZR26(ZA[432]);
+    ZA[434] = ZMa(ZA[428], ZA[423], ZA[433]) + ZR30(ZA[433]);
+    ZA[158] = ZA[150] + ZA[131];
+    ZA[171] = ZA[168] + 0x8cc70208U;
+    
+    ZA[138] = ZR25(ZA[126]) + ZA[119];
+    ZA[172] = ZR15(ZA[167]) + ZA[158];
+    ZA[437] = ZA[418] + ZA[436];
+    ZA[438] = ZA[436] + ZA[434];
+    ZA[425] = ZA[422] + ZA[171];
+    
+    ZA[441] = (ZCh(ZA[437], ZA[432], ZA[427]) + ZA[425]) + ZR26(ZA[437]);
+    ZA[439] = ZMa(ZA[433], ZA[428], ZA[438]) + ZR30(ZA[438]);
+    ZA[163] = ZA[155] + ZA[138];
+    ZA[174] = ZA[172] + 0x90befffaU;
+    
+    ZA[139] = ZR25(ZA[127]) + ZA[126];
+    ZA[173] = ZR15(ZA[168]) + ZA[163];
+    ZA[442] = ZA[423] + ZA[441];
+    ZA[443] = ZA[441] + ZA[439];
+    ZA[430] = ZA[427] + ZA[174];
+    
+    ZA[445] = (ZCh(ZA[442], ZA[437], ZA[432]) + ZA[430]) + ZR26(ZA[442]);
+    ZA[444] = ZMa(ZA[438], ZA[433], ZA[443]) + ZR30(ZA[443]);
+    ZA[164] = ZA[156] + ZA[139];
+    ZA[175] = ZA[173] + 0xa4506cebU;
+    
+    ZA[146] = ZR25(ZA[134]) + ZA[127];
+    ZA[176] = ZR15(ZA[172]) + ZA[164];
+    ZA[446] = ZA[428] + ZA[445];
+    ZA[447] = ZA[445] + ZA[444];
+    ZA[435] = ZA[432] + ZA[175];
+    
+    ZA[449] = (ZCh(ZA[446], ZA[442], ZA[437]) + ZA[435]) + ZR26(ZA[446]);
+    ZA[448] = ZMa(ZA[443], ZA[438], ZA[447]) + ZR30(ZA[447]);
+    ZA[169] = ZA[161] + ZA[146];
+    ZA[178] = ZA[176] + 0xbef9a3f7U;
+    
+    ZA[177] = ZR15(ZA[173]) + ZA[169];
+    ZA[451] = ZA[449] + ZA[448];
+    ZA[450] = ZA[433] + ZA[449];
+    ZA[440] = ZA[437] + ZA[178];
+    
+    ZA[453] = (ZCh(ZA[450], ZA[446], ZA[442]) + ZA[440]) + ZR26(ZA[450]);
+    ZA[452] = ZMa(ZA[447], ZA[443], ZA[451]) + ZR30(ZA[451]);
+    ZA[179] = ZA[177] + 0xc67178f2U;
+    
+    ZA[454] = ZA[438] + ZA[453];
+    ZA[494] = ZA[442] + ZA[179];
+    ZA[455] = ZA[453] + ZA[452];
+    
+    ZA[457] = (ZCh(ZA[454], ZA[450], ZA[446]) + ZA[494]) + ZR26(ZA[454]);
+    ZA[456] = ZMa(ZA[451], ZA[447], ZA[455]) + ZR30(ZA[455]);
+    
+    ZA[459] = ZA[457] + ZA[456];
+    
+    ZA[461] = ZA[455] + state1;
+    ZA[460] = ZA[459] + state0;
+    
+    ZA[495] = ZA[460] + 0x98c7e2a2U;
+    ZA[469] = ZA[461] + 0x90bb1e3cU;
+    
+    ZA[498] = (ZCh(ZA[495], 0x510e527fU, 0x9b05688cU) + ZA[469]) + ZR26(ZA[495]);
+    ZA[462] = ZA[451] + state2;
+    
+    ZA[496] = ZA[460] + 0xfc08884dU;
+    ZA[506] = ZA[498] + 0x3c6ef372U;
+    ZA[470] = ZA[462] + 0x50c6645bU;
+    
+    ZA[507] = (ZCh(ZA[506], ZA[495], 0x510e527fU) + ZA[470]) + ZR26(ZA[506]);
+    ZA[500] = ZMa(0x6a09e667U, 0xbb67ae85U, ZA[496]) + ZR30(ZA[496]);
+    ZA[463] = ZA[447] + state3;
+    
+    ZA[458] = ZA[443] + ZA[457];
+    ZA[499] = ZA[498] + ZA[500];
+    ZA[508] = ZA[507] + 0xbb67ae85U;
+    ZA[473] = ZA[463] + 0x3ac42e24U;
+    
+    ZA[510] = (ZCh(ZA[508], ZA[506], ZA[495]) + ZA[473]) + ZR26(ZA[508]);
+    ZA[928] = ZMa(ZA[496], 0x6a09e667U, ZA[499]) + ZR30(ZA[499]);
+    ZA[464] = ZA[458] + state4;
+    
+    ZA[476] = ZA[464] + ZA[460] + 0xd21ea4fdU;
+    ZA[511] = ZA[510] + 0x6a09e667U;
+    ZA[509] = ZA[928] + ZA[507];
+    ZA[465] = ZA[454] + state5;
+    
+    ZA[514] = (ZCh(ZA[511], ZA[508], ZA[506]) + ZA[476]) + ZR26(ZA[511]);
+    ZA[512] = ZMa(ZA[499], ZA[496], ZA[509]) + ZR30(ZA[509]);
+    ZA[478] = ZA[465] + 0x59f111f1U;
+    
+    ZA[519] = ZA[506] + ZA[478];
+    ZA[516] = ZA[496] + ZA[514];
+    ZA[513] = ZA[510] + ZA[512];
+    ZA[466] = ZA[450] + state6;
+    
+    ZA[520] = (ZCh(ZA[516], ZA[511], ZA[508]) + ZA[519]) + ZR26(ZA[516]);
+    ZA[515] = ZMa(ZA[509], ZA[499], ZA[513]) + ZR30(ZA[513]);
+    ZA[480] = ZA[466] + 0x923f82a4U;
+    
+    ZA[524] = ZA[508] + ZA[480];
+    ZA[521] = ZA[499] + ZA[520];
+    ZA[517] = ZA[514] + ZA[515];
+    ZA[467] = ZA[446] + state7;
+    
+    ZA[525] = (ZCh(ZA[521], ZA[516], ZA[511]) + ZA[524]) + ZR26(ZA[521]);
+    ZA[522] = ZMa(ZA[513], ZA[509], ZA[517]) + ZR30(ZA[517]);
+    ZA[484] = ZA[467] + 0xab1c5ed5U;
+    
+    ZA[529] = ZA[511] + ZA[484];
+    ZA[526] = ZA[509] + ZA[525];
+    ZA[523] = ZA[520] + ZA[522];
+    
+    ZA[530] = (ZCh(ZA[526], ZA[521], ZA[516]) + ZA[529]) + ZR26(ZA[526]);
+    ZA[550] = ZMa(ZA[517], ZA[513], ZA[523]) + ZR30(ZA[523]);
+    
+    ZA[531] = ZA[513] + ZA[530];
+    ZA[533] = ZA[516] + 0x5807aa98U;
+    ZA[527] = ZA[550] + ZA[525];
+    
+    ZA[534] = (ZCh(ZA[531], ZA[526], ZA[521]) + ZA[533]) + ZR26(ZA[531]);
+    ZA[551] = ZMa(ZA[523], ZA[517], ZA[527]) + ZR30(ZA[527]);
+    
+    ZA[535] = ZA[517] + ZA[534];
+    ZA[538] = ZA[521] + 0x12835b01U;
+    ZA[532] = ZA[551] + ZA[530];
+    
+    ZA[539] = (ZCh(ZA[535], ZA[531], ZA[526]) + ZA[538]) + ZR26(ZA[535]);
+    ZA[552] = ZMa(ZA[527], ZA[523], ZA[532]) + ZR30(ZA[532]);
+    
+    ZA[540] = ZA[523] + ZA[539];
+    ZA[542] = ZA[526] + 0x243185beU;
+    ZA[536] = ZA[552] + ZA[534];
+    
+    ZA[543] = (ZCh(ZA[540], ZA[535], ZA[531]) + ZA[542]) + ZR26(ZA[540]);
+    ZA[553] = ZMa(ZA[532], ZA[527], ZA[536]) + ZR30(ZA[536]);
+    
+    ZA[544] = ZA[527] + ZA[543];
+    ZA[555] = ZA[531] + 0x550c7dc3U;
+    ZA[541] = ZA[553] + ZA[539];
+    
+    ZA[558] = (ZCh(ZA[544], ZA[540], ZA[535]) + ZA[555]) + ZR26(ZA[544]);
+    ZA[547] = ZMa(ZA[536], ZA[532], ZA[541]) + ZR30(ZA[541]);
+    
+    ZA[559] = ZA[532] + ZA[558];
+    ZA[556] = ZA[535] + 0x72be5d74U;
+    ZA[545] = ZA[547] + ZA[543];
+    
+    ZA[562] = (ZCh(ZA[559], ZA[544], ZA[540]) + ZA[556]) + ZR26(ZA[559]);
+    ZA[561] = ZMa(ZA[541], ZA[536], ZA[545]) + ZR30(ZA[545]);
+    
+    ZA[563] = ZA[536] + ZA[562];
+    ZA[560] = ZA[561] + ZA[558];
+    ZA[557] = ZA[540] + 0x80deb1feU;
+    
+    ZA[568] = (ZCh(ZA[563], ZA[559], ZA[544]) + ZA[557]) + ZR26(ZA[563]);
+    ZA[564] = ZMa(ZA[545], ZA[541], ZA[560]) + ZR30(ZA[560]);
+    
+    ZA[569] = ZA[541] + ZA[568];
+    ZA[572] = ZA[544] + 0x9bdc06a7U;
+    ZA[565] = ZA[562] + ZA[564];
+    
+    ZA[574] = (ZCh(ZA[569], ZA[563], ZA[559]) + ZA[572]) + ZR26(ZA[569]);
+    ZA[570] = ZMa(ZA[560], ZA[545], ZA[565]) + ZR30(ZA[565]);
+    ZA[468] = ZR25(ZA[461]);
+    
+    ZA[497] = ZA[468] + ZA[460];
+    ZA[575] = ZA[545] + ZA[574];
+    ZA[571] = ZA[568] + ZA[570];
+    ZA[573] = ZA[559] + 0xc19bf274U;
+    
+    ZA[578] = (ZCh(ZA[575], ZA[569], ZA[563]) + ZA[573]) + ZR26(ZA[575]);
+    ZA[576] = ZMa(ZA[565], ZA[560], ZA[571]) + ZR30(ZA[571]);
+    ZA[929] = ZR25(ZA[462]);
+    ZA[503] = ZA[497] + 0xe49b69c1U;
+    
+    ZA[471] = ZA[929] + ZA[461] + 0x00a00000U;
+    ZA[582] = ZA[563] + ZA[503];
+    ZA[579] = ZA[560] + ZA[578];
+    ZA[577] = ZA[574] + ZA[576];
+    
+    ZA[583] = (ZCh(ZA[579], ZA[575], ZA[569]) + ZA[582]) + ZR26(ZA[579]);
+    ZA[580] = ZMa(ZA[571], ZA[565], ZA[577]) + ZR30(ZA[577]);
+    ZA[488] = ZA[471] + 0xefbe4786U;
+    
+    ZA[472] = ZR25(ZA[463]) + ZA[462];
+    ZA[587] = ZA[569] + ZA[488];
+    ZA[584] = ZA[565] + ZA[583];
+    ZA[581] = ZA[578] + ZA[580];
+    
+    ZA[588] = (ZCh(ZA[584], ZA[579], ZA[575]) + ZA[587]) + ZR26(ZA[584]);
+    ZA[586] = ZMa(ZA[577], ZA[571], ZA[581]) + ZR30(ZA[581]);
+    ZA[501] = ZR15(ZA[497]) + ZA[472];
+    ZA[475] = ZR15(ZA[471]);
+    ZA[926] = ZA[575] + 0x0fc19dc6U;
+    
+    ZA[474] = ZA[475] + ZA[463] + ZR25(ZA[464]);
+    ZA[927] = ZA[926] + ZA[501];
+    ZA[589] = ZA[571] + ZA[588];
+    ZA[585] = ZA[583] + ZA[586];
+    
+    ZA[592] = (ZCh(ZA[589], ZA[584], ZA[579]) + ZA[927]) + ZR26(ZA[589]);
+    ZA[590] = ZMa(ZA[581], ZA[577], ZA[585]) + ZR30(ZA[585]);
+    ZA[477] = ZR25(ZA[465]) + ZA[464];
+    ZA[489] = ZA[474] + 0x240ca1ccU;
+    
+    ZA[518] = ZR15(ZA[501]) + ZA[477];
+    ZA[479] = ZR25(ZA[466]);
+    ZA[596] = ZA[579] + ZA[489];
+    ZA[593] = ZA[577] + ZA[592];
+    ZA[591] = ZA[588] + ZA[590];
+    
+    ZA[597] = (ZCh(ZA[593], ZA[589], ZA[584]) + ZA[596]) + ZR26(ZA[593]);
+    ZA[594] = ZMa(ZA[585], ZA[581], ZA[591]) + ZR30(ZA[591]);
+    ZA[481] = ZA[479] + ZA[465];
+    ZA[601] = ZA[518] + 0x2de92c6fU;
+    
+    ZA[482] = ZR15(ZA[474]) + ZA[481];
+    ZA[602] = ZA[584] + ZA[601];
+    ZA[598] = ZA[581] + ZA[597];
+    ZA[595] = ZA[592] + ZA[594];
+    
+    ZA[632] = (ZCh(ZA[598], ZA[593], ZA[589]) + ZA[602]) + ZR26(ZA[598]);
+    ZA[599] = ZMa(ZA[591], ZA[585], ZA[595]) + ZR30(ZA[595]);
+    ZA[483] = ZA[466] + 0x00000100U + ZR25(ZA[467]);
+    ZA[490] = ZA[482] + 0x4a7484aaU;
+    
+    ZA[528] = ZR15(ZA[518]) + ZA[483];
+    ZA[736] = ZA[585] + ZA[632];
+    ZA[605] = ZA[589] + ZA[490];
+    ZA[600] = ZA[597] + ZA[599];
+    ZA[485] = ZA[467] + 0x11002000U;
+    
+    ZA[738] = (ZCh(ZA[736], ZA[598], ZA[593]) + ZA[605]) + ZR26(ZA[736]);
+    ZA[744] = ZMa(ZA[595], ZA[591], ZA[600]) + ZR30(ZA[600]);
+    ZA[487] = ZR15(ZA[482]) + ZA[485];
+    ZA[603] = ZA[528] + 0x5cb0a9dcU;
+    
+    ZA[502] = ZA[497] + ZA[487];
+    ZA[739] = ZA[591] + ZA[738];
+    ZA[604] = ZA[593] + ZA[603];
+    ZA[737] = ZA[744] + ZA[632];
+    
+    ZA[741] = (ZCh(ZA[739], ZA[736], ZA[598]) + ZA[604]) + ZR26(ZA[739]);
+    ZA[745] = ZMa(ZA[600], ZA[595], ZA[737]) + ZR30(ZA[737]);
+    ZA[486] = ZA[471] + 0x80000000U;
+    ZA[606] = ZA[502] + 0x76f988daU;
+    
+    ZA[537] = ZR15(ZA[528]) + ZA[486];
+    ZA[742] = ZA[595] + ZA[741];
+    ZA[613] = ZA[598] + ZA[606];
+    ZA[740] = ZA[745] + ZA[738];
+    
+    ZA[747] = (ZCh(ZA[742], ZA[739], ZA[736]) + ZA[613]) + ZR26(ZA[742]);
+    ZA[746] = ZMa(ZA[737], ZA[600], ZA[740]) + ZR30(ZA[740]);
+    ZA[607] = ZA[537] + 0x983e5152U;
+    
+    ZA[546] = ZR15(ZA[502]) + ZA[501];
+    ZA[751] = ZA[736] + ZA[607];
+    ZA[748] = ZA[600] + ZA[747];
+    ZA[743] = ZA[746] + ZA[741];
+    
+    ZA[752] = (ZCh(ZA[748], ZA[742], ZA[739]) + ZA[751]) + ZR26(ZA[748]);
+    ZA[749] = ZMa(ZA[740], ZA[737], ZA[743]) + ZR30(ZA[743]);
+    ZA[608] = ZA[546] + 0xa831c66dU;
+    
+    ZA[554] = ZR15(ZA[537]) + ZA[474];
+    ZA[756] = ZA[739] + ZA[608];
+    ZA[753] = ZA[737] + ZA[752];
+    ZA[750] = ZA[747] + ZA[749];
+    
+    ZA[757] = (ZCh(ZA[753], ZA[748], ZA[742]) + ZA[756]) + ZR26(ZA[753]);
+    ZA[754] = ZMa(ZA[743], ZA[740], ZA[750]) + ZR30(ZA[750]);
+    ZA[609] = ZA[554] + 0xb00327c8U;
+    
+    ZA[566] = ZR15(ZA[546]) + ZA[518];
+    ZA[761] = ZA[742] + ZA[609];
+    ZA[758] = ZA[740] + ZA[757];
+    ZA[755] = ZA[752] + ZA[754];
+    
+    ZA[762] = (ZCh(ZA[758], ZA[753], ZA[748]) + ZA[761]) + ZR26(ZA[758]);
+    ZA[759] = ZMa(ZA[750], ZA[743], ZA[755]) + ZR30(ZA[755]);
+    ZA[610] = ZA[566] + 0xbf597fc7U;
+    
+    ZA[567] = ZR15(ZA[554]) + ZA[482];
+    ZA[766] = ZA[748] + ZA[610];
+    ZA[763] = ZA[743] + ZA[762];
+    ZA[760] = ZA[757] + ZA[759];
+    
+    ZA[767] = (ZCh(ZA[763], ZA[758], ZA[753]) + ZA[766]) + ZR26(ZA[763]);
+    ZA[764] = ZMa(ZA[755], ZA[750], ZA[760]) + ZR30(ZA[760]);
+    ZA[611] = ZA[567] + 0xc6e00bf3U;
+    
+    ZA[614] = ZR15(ZA[566]) + ZA[528];
+    ZA[771] = ZA[753] + ZA[611];
+    ZA[768] = ZA[750] + ZA[767];
+    ZA[765] = ZA[762] + ZA[764];
+    
+    ZA[772] = (ZCh(ZA[768], ZA[763], ZA[758]) + ZA[771]) + ZR26(ZA[768]);
+    ZA[769] = ZMa(ZA[760], ZA[755], ZA[765]) + ZR30(ZA[765]);
+    ZA[612] = ZA[502] + 0x00400022U;
+    ZA[615] = ZA[614] + 0xd5a79147U;
+    
+    ZA[616] = ZR15(ZA[567]) + ZA[612];
+    ZA[504] = ZR25(ZA[497]) + 0x00000100U;
+    ZA[776] = ZA[758] + ZA[615];
+    ZA[773] = ZA[755] + ZA[772];
+    ZA[770] = ZA[767] + ZA[769];
+    
+    ZA[777] = (ZCh(ZA[773], ZA[768], ZA[763]) + ZA[776]) + ZR26(ZA[773]);
+    ZA[774] = ZMa(ZA[765], ZA[760], ZA[770]) + ZR30(ZA[770]);
+    ZA[492] = ZR25(ZA[471]);
+    ZA[618] = ZA[537] + ZA[504];
+    ZA[617] = ZA[616] + 0x06ca6351U;
+    
+    ZA[619] = ZR15(ZA[614]) + ZA[618];
+    ZA[781] = ZA[763] + ZA[617];
+    ZA[778] = ZA[760] + ZA[777];
+    ZA[775] = ZA[772] + ZA[774];
+    ZA[505] = ZA[492] + ZA[497];
+    
+    ZA[782] = (ZCh(ZA[778], ZA[773], ZA[768]) + ZA[781]) + ZR26(ZA[778]);
+    ZA[779] = ZMa(ZA[770], ZA[765], ZA[775]) + ZR30(ZA[775]);
+    ZA[621] = ZA[505] + ZA[546];
+    ZA[620] = ZA[619] + 0x14292967U;
+    
+    ZA[622] = ZR15(ZA[616]) + ZA[621];
+    ZA[625] = ZR25(ZA[501]);
+    ZA[786] = ZA[768] + ZA[620];
+    ZA[783] = ZA[765] + ZA[782];
+    ZA[624] = ZA[554] + ZA[471];
+    ZA[780] = ZA[777] + ZA[779];
+    
+    ZA[787] = (ZCh(ZA[783], ZA[778], ZA[773]) + ZA[786]) + ZR26(ZA[783]);
+    ZA[784] = ZMa(ZA[775], ZA[770], ZA[780]) + ZR30(ZA[780]);
+    ZA[493] = ZR25(ZA[474]);
+    ZA[626] = ZA[625] + ZA[624];
+    ZA[623] = ZA[622] + 0x27b70a85U;
+    
+    ZA[627] = ZR15(ZA[619]) + ZA[626];
+    ZA[791] = ZA[773] + ZA[623];
+    ZA[788] = ZA[770] + ZA[787];
+    ZA[785] = ZA[782] + ZA[784];
+    ZA[629] = ZA[493] + ZA[501];
+    
+    ZA[792] = (ZCh(ZA[788], ZA[783], ZA[778]) + ZA[791]) + ZR26(ZA[788]);
+    ZA[789] = ZMa(ZA[780], ZA[775], ZA[785]) + ZR30(ZA[785]);
+    ZA[630] = ZA[566] + ZA[629];
+    ZA[628] = ZA[627] + 0x2e1b2138U;
+    
+    ZA[634] = ZR25(ZA[518]) + ZA[474];
+    ZA[631] = ZR15(ZA[622]) + ZA[630];
+    ZA[796] = ZA[778] + ZA[628];
+    ZA[793] = ZA[775] + ZA[792];
+    ZA[790] = ZA[787] + ZA[789];
+    
+    ZA[797] = (ZCh(ZA[793], ZA[788], ZA[783]) + ZA[796]) + ZR26(ZA[793]);
+    ZA[794] = ZMa(ZA[785], ZA[780], ZA[790]) + ZR30(ZA[790]);
+    ZA[491] = ZR25(ZA[482]);
+    ZA[635] = ZA[567] + ZA[634];
+    ZA[633] = ZA[631] + 0x4d2c6dfcU;
+    
+    ZA[636] = ZR15(ZA[627]) + ZA[635];
+    ZA[801] = ZA[783] + ZA[633];
+    ZA[798] = ZA[780] + ZA[797];
+    ZA[795] = ZA[792] + ZA[794];
+    ZA[638] = ZA[491] + ZA[518];
+    
+    ZA[802] = (ZCh(ZA[798], ZA[793], ZA[788]) + ZA[801]) + ZR26(ZA[798]);
+    ZA[799] = ZMa(ZA[790], ZA[785], ZA[795]) + ZR30(ZA[795]);
+    ZA[639] = ZA[638] + ZA[614];
+    ZA[637] = ZA[636] + 0x53380d13U;
+    
+    ZA[642] = ZR25(ZA[528]) + ZA[482];
+    ZA[640] = ZR15(ZA[631]) + ZA[639];
+    ZA[806] = ZA[788] + ZA[637];
+    ZA[803] = ZA[785] + ZA[802];
+    ZA[800] = ZA[797] + ZA[799];
+    
+    ZA[807] = (ZCh(ZA[803], ZA[798], ZA[793]) + ZA[806]) + ZR26(ZA[803]);
+    ZA[804] = ZMa(ZA[795], ZA[790], ZA[800]) + ZR30(ZA[800]);
+    ZA[643] = ZA[616] + ZA[642];
+    ZA[641] = ZA[640] + 0x650a7354U;
+    
+    ZA[646] = ZR25(ZA[502]) + ZA[528];
+    ZA[644] = ZR15(ZA[636]) + ZA[643];
+    ZA[811] = ZA[793] + ZA[641];
+    ZA[808] = ZA[790] + ZA[807];
+    ZA[805] = ZA[802] + ZA[804];
+    
+    ZA[812] = (ZCh(ZA[808], ZA[803], ZA[798]) + ZA[811]) + ZR26(ZA[808]);
+    ZA[809] = ZMa(ZA[800], ZA[795], ZA[805]) + ZR30(ZA[805]);
+    ZA[647] = ZA[619] + ZA[646];
+    ZA[645] = ZA[644] + 0x766a0abbU;
+    
+    ZA[650] = ZR25(ZA[537]) + ZA[502];
+    ZA[648] = ZR15(ZA[640]) + ZA[647];
+    ZA[816] = ZA[798] + ZA[645];
+    ZA[813] = ZA[795] + ZA[812];
+    ZA[810] = ZA[807] + ZA[809];
+    
+    ZA[817] = (ZCh(ZA[813], ZA[808], ZA[803]) + ZA[816]) + ZR26(ZA[813]);
+    ZA[814] = ZMa(ZA[805], ZA[800], ZA[810]) + ZR30(ZA[810]);
+    ZA[925] = ZA[622] + ZA[650];
+    ZA[649] = ZA[648] + 0x81c2c92eU;
+    
+    ZA[653] = ZR25(ZA[546]) + ZA[537];
+    ZA[651] = ZR15(ZA[644]) + ZA[925];
+    ZA[821] = ZA[803] + ZA[649];
+    ZA[818] = ZA[800] + ZA[817];
+    ZA[815] = ZA[812] + ZA[814];
+    
+    ZA[822] = (ZCh(ZA[818], ZA[813], ZA[808]) + ZA[821]) + ZR26(ZA[818]);
+    ZA[819] = ZMa(ZA[810], ZA[805], ZA[815]) + ZR30(ZA[815]);
+    ZA[654] = ZA[627] + ZA[653];
+    ZA[652] = ZA[651] + 0x92722c85U;
+    
+    ZA[657] = ZR25(ZA[554]) + ZA[546];
+    ZA[655] = ZR15(ZA[648]) + ZA[654];
+    ZA[826] = ZA[808] + ZA[652];
+    ZA[823] = ZA[805] + ZA[822];
+    ZA[820] = ZA[817] + ZA[819];
+    
+    ZA[827] = (ZCh(ZA[823], ZA[818], ZA[813]) + ZA[826]) + ZR26(ZA[823]);
+    ZA[824] = ZMa(ZA[815], ZA[810], ZA[820]) + ZR30(ZA[820]);
+    ZA[658] = ZA[631] + ZA[657];
+    ZA[656] = ZA[655] + 0xa2bfe8a1U;
+    
+    ZA[661] = ZR25(ZA[566]) + ZA[554];
+    ZA[659] = ZR15(ZA[651]) + ZA[658];
+    ZA[831] = ZA[813] + ZA[656];
+    ZA[828] = ZA[810] + ZA[827];
+    ZA[825] = ZA[822] + ZA[824];
+    
+    ZA[832] = (ZCh(ZA[828], ZA[823], ZA[818]) + ZA[831]) + ZR26(ZA[828]);
+    ZA[829] = ZMa(ZA[820], ZA[815], ZA[825]) + ZR30(ZA[825]);
+    ZA[662] = ZA[636] + ZA[661];
+    ZA[660] = ZA[659] + 0xa81a664bU;
+    
+    ZA[665] = ZR25(ZA[567]) + ZA[566];
+    ZA[663] = ZR15(ZA[655]) + ZA[662];
+    ZA[836] = ZA[818] + ZA[660];
+    ZA[833] = ZA[815] + ZA[832];
+    ZA[830] = ZA[827] + ZA[829];
+    
+    ZA[837] = (ZCh(ZA[833], ZA[828], ZA[823]) + ZA[836]) + ZR26(ZA[833]);
+    ZA[834] = ZMa(ZA[825], ZA[820], ZA[830]) + ZR30(ZA[830]);
+    ZA[666] = ZA[640] + ZA[665];
+    ZA[664] = ZA[663] + 0xc24b8b70U;
+    
+    ZA[669] = ZR25(ZA[614]) + ZA[567];
+    ZA[667] = ZR15(ZA[659]) + ZA[666];
+    ZA[841] = ZA[823] + ZA[664];
+    ZA[838] = ZA[820] + ZA[837];
+    ZA[835] = ZA[832] + ZA[834];
+    
+    ZA[842] = (ZCh(ZA[838], ZA[833], ZA[828]) + ZA[841]) + ZR26(ZA[838]);
+    ZA[839] = ZMa(ZA[830], ZA[825], ZA[835]) + ZR30(ZA[835]);
+    ZA[670] = ZA[644] + ZA[669];
+    ZA[668] = ZA[667] + 0xc76c51a3U;
+    
+    ZA[677] = ZR25(ZA[616]) + ZA[614];
+    ZA[671] = ZR15(ZA[663]) + ZA[670];
+    ZA[846] = ZA[828] + ZA[668];
+    ZA[843] = ZA[825] + ZA[842];
+    ZA[840] = ZA[837] + ZA[839];
+    
+    ZA[847] = (ZCh(ZA[843], ZA[838], ZA[833]) + ZA[846]) + ZR26(ZA[843]);
+    ZA[844] = ZMa(ZA[835], ZA[830], ZA[840]) + ZR30(ZA[840]);
+    ZA[678] = ZA[648] + ZA[677];
+    ZA[676] = ZA[671] + 0xd192e819U;
+    
+    ZA[682] = ZR25(ZA[619]) + ZA[616];
+    ZA[679] = ZR15(ZA[667]) + ZA[678];
+    ZA[851] = ZA[833] + ZA[676];
+    ZA[848] = ZA[830] + ZA[847];
+    ZA[845] = ZA[842] + ZA[844];
+    
+    ZA[852] = (ZCh(ZA[848], ZA[843], ZA[838]) + ZA[851]) + ZR26(ZA[848]);
+    ZA[849] = ZMa(ZA[840], ZA[835], ZA[845]) + ZR30(ZA[845]);
+    ZA[683] = ZA[651] + ZA[682];
+    ZA[680] = ZA[679] + 0xd6990624U;
+    
+    ZA[686] = ZR25(ZA[622]) + ZA[619];
+    ZA[684] = ZR15(ZA[671]) + ZA[683];
+    ZA[856] = ZA[838] + ZA[680];
+    ZA[853] = ZA[835] + ZA[852];
+    ZA[850] = ZA[847] + ZA[849];
+    
+    ZA[857] = (ZCh(ZA[853], ZA[848], ZA[843]) + ZA[856]) + ZR26(ZA[853]);
+    ZA[854] = ZMa(ZA[845], ZA[840], ZA[850]) + ZR30(ZA[850]);
+    ZA[687] = ZA[655] + ZA[686];
+    ZA[685] = ZA[684] + 0xf40e3585U;
+    
+    ZA[690] = ZR25(ZA[627]) + ZA[622];
+    ZA[688] = ZR15(ZA[679]) + ZA[687];
+    ZA[861] = ZA[843] + ZA[685];
+    ZA[858] = ZA[840] + ZA[857];
+    ZA[855] = ZA[852] + ZA[854];
+    
+    ZA[862] = (ZCh(ZA[858], ZA[853], ZA[848]) + ZA[861]) + ZR26(ZA[858]);
+    ZA[859] = ZMa(ZA[850], ZA[845], ZA[855]) + ZR30(ZA[855]);
+    ZA[691] = ZA[659] + ZA[690];
+    ZA[689] = ZA[688] + 0x106aa070U;
+    
+    ZA[694] = ZR25(ZA[631]) + ZA[627];
+    ZA[692] = ZR15(ZA[684]) + ZA[691];
+    ZA[866] = ZA[848] + ZA[689];
+    ZA[863] = ZA[845] + ZA[862];
+    ZA[860] = ZA[857] + ZA[859];
+    
+    ZA[867] = (ZCh(ZA[863], ZA[858], ZA[853]) + ZA[866]) + ZR26(ZA[863]);
+    ZA[864] = ZMa(ZA[855], ZA[850], ZA[860]) + ZR30(ZA[860]);
+    ZA[695] = ZA[663] + ZA[694];
+    ZA[693] = ZA[692] + 0x19a4c116U;
+    
+    ZA[698] = ZR25(ZA[636]) + ZA[631];
+    ZA[696] = ZR15(ZA[688]) + ZA[695];
+    ZA[871] = ZA[853] + ZA[693];
+    ZA[868] = ZA[850] + ZA[867];
+    ZA[865] = ZA[862] + ZA[864];
+    
+    ZA[873] = (ZCh(ZA[868], ZA[863], ZA[858]) + ZA[871]) + ZR26(ZA[868]);
+    ZA[869] = ZMa(ZA[860], ZA[855], ZA[865]) + ZR30(ZA[865]);
+    ZA[699] = ZA[667] + ZA[698];
+    ZA[697] = ZA[696] + 0x1e376c08U;
+    
+    ZA[702] = ZR25(ZA[640]) + ZA[636];
+    ZA[700] = ZR15(ZA[692]) + ZA[699];
+    ZA[877] = ZA[858] + ZA[697];
+    ZA[874] = ZA[855] + ZA[873];
+    ZA[870] = ZA[867] + ZA[869];
+    
+    ZA[878] = (ZCh(ZA[874], ZA[868], ZA[863]) + ZA[877]) + ZR26(ZA[874]);
+    ZA[875] = ZMa(ZA[865], ZA[860], ZA[870]) + ZR30(ZA[870]);
+    ZA[703] = ZA[671] + ZA[702];
+    ZA[701] = ZA[700] + 0x2748774cU;
+    
+    ZA[706] = ZR25(ZA[644]) + ZA[640];
+    ZA[704] = ZR15(ZA[696]) + ZA[703];
+    ZA[882] = ZA[863] + ZA[701];
+    ZA[879] = ZA[860] + ZA[878];
+    ZA[876] = ZA[873] + ZA[875];
+    
+    ZA[883] = (ZCh(ZA[879], ZA[874], ZA[868]) + ZA[882]) + ZR26(ZA[879]);
+    ZA[880] = ZMa(ZA[870], ZA[865], ZA[876]) + ZR30(ZA[876]);
+    ZA[707] = ZA[679] + ZA[706];
+    ZA[705] = ZA[704] + 0x34b0bcb5U;
+    
+    ZA[710] = ZR25(ZA[648]) + ZA[644];
+    ZA[708] = ZR15(ZA[700]) + ZA[707];
+    ZA[887] = ZA[868] + ZA[705];
+    ZA[884] = ZA[865] + ZA[883];
+    ZA[881] = ZA[878] + ZA[880];
+    
+    ZA[888] = (ZCh(ZA[884], ZA[879], ZA[874]) + ZA[887]) + ZR26(ZA[884]);
+    ZA[885] = ZMa(ZA[876], ZA[870], ZA[881]) + ZR30(ZA[881]);
+    ZA[711] = ZA[684] + ZA[710];
+    ZA[709] = ZA[708] + 0x391c0cb3U;
+    
+    ZA[714] = ZR25(ZA[651]) + ZA[648];
+    ZA[712] = ZR15(ZA[704]) + ZA[711];
+    ZA[892] = ZA[874] + ZA[709];
+    ZA[889] = ZA[870] + ZA[888];
+    ZA[886] = ZA[883] + ZA[885];
+    
+    ZA[893] = (ZCh(ZA[889], ZA[884], ZA[879]) + ZA[892]) + ZR26(ZA[889]);
+    ZA[890] = ZMa(ZA[881], ZA[876], ZA[886]) + ZR30(ZA[886]);
+    ZA[715] = ZA[688] + ZA[714];
+    ZA[713] = ZA[712] + 0x4ed8aa4aU;
+    
+    ZA[718] = ZR25(ZA[655]) + ZA[651];
+    ZA[716] = ZR15(ZA[708]) + ZA[715];
+    ZA[897] = ZA[879] + ZA[713];
+    ZA[894] = ZA[876] + ZA[893];
+    ZA[891] = ZA[888] + ZA[890];
+    
+    ZA[898] = (ZCh(ZA[894], ZA[889], ZA[884]) + ZA[897]) + ZR26(ZA[894]);
+    ZA[895] = ZMa(ZA[886], ZA[881], ZA[891]) + ZR30(ZA[891]);
+    ZA[719] = ZA[692] + ZA[718];
+    ZA[717] = ZA[716] + 0x5b9cca4fU;
+    
+    ZA[722] = ZR25(ZA[659]) + ZA[655];
+    ZA[720] = ZR15(ZA[712]) + ZA[719];
+    ZA[902] = ZA[884] + ZA[717];
+    ZA[899] = ZA[881] + ZA[898];
+    ZA[896] = ZA[893] + ZA[895];
+    
+    ZA[903] = (ZCh(ZA[899], ZA[894], ZA[889]) + ZA[902]) + ZR26(ZA[899]);
+    ZA[900] = ZMa(ZA[891], ZA[886], ZA[896]) + ZR30(ZA[896]);
+    ZA[723] = ZA[696] + ZA[722];
+    ZA[721] = ZA[720] + 0x682e6ff3U;
+    
+    ZA[672] = ZR25(ZA[663]) + ZA[659];
+    ZA[724] = ZR15(ZA[716]) + ZA[723];
+    ZA[907] = ZA[889] + ZA[721];
+    ZA[904] = ZA[886] + ZA[903];
+    ZA[901] = ZA[898] + ZA[900];
+    
+    ZA[908] = (ZCh(ZA[904], ZA[899], ZA[894]) + ZA[907]) + ZR26(ZA[904]);
+    ZA[905] = ZMa(ZA[896], ZA[891], ZA[901]) + ZR30(ZA[901]);
+    ZA[673] = ZR25(ZA[667]) + ZA[663];
+    ZA[726] = ZA[700] + ZA[672];
+    ZA[725] = ZA[724] + 0x748f82eeU;
+    
+    ZA[727] = ZR15(ZA[720]) + ZA[726];
+    ZA[912] = ZA[894] + ZA[725];
+    ZA[909] = ZA[891] + ZA[908];
+    ZA[906] = ZA[903] + ZA[905];
+    ZA[675] = ZA[667] + 0x8cc70208U;
+    ZA[729] = ZA[704] + ZA[673];
+    
+    ZA[913] = (ZCh(ZA[909], ZA[904], ZA[899]) + ZA[912]) + ZR26(ZA[909]);
+    ZA[910] = ZMa(ZA[901], ZA[896], ZA[906]) + ZR30(ZA[906]);
+    ZA[674] = ZR25(ZA[671]) + ZA[675];
+    ZA[730] = ZR15(ZA[724]) + ZA[729];
+    ZA[728] = ZA[727] + 0x78a5636fU;
+    
+    ZA[681] = ZR25(ZA[679]) + ZA[671];
+    ZA[917] = ZA[899] + ZA[901] + ZA[728];
+    ZA[914] = ZA[896] + ZA[913];
+    ZA[911] = ZA[908] + ZA[910];
+    ZA[732] = ZA[708] + ZA[674];
+    ZA[731] = ZA[730] + 0x84c87814U;
+    
+    ZA[918] = (ZCh(ZA[914], ZA[909], ZA[904]) + ZA[917]) + ZR26(ZA[914]);
+    ZA[915] = ZMa(ZA[906], ZA[901], ZA[911]) + ZR30(ZA[911]);
+    ZA[733] = ZR15(ZA[727]) + ZA[732];
+    ZA[919] = ZA[906] + ZA[904] + ZA[731];
+    ZA[734] = ZA[712] + ZA[681];
+    
+    ZA[920] = (ZCh(ZA[918], ZA[914], ZA[909]) + ZA[919]) + ZR26(ZA[918]);
+    ZA[735] = ZR15(ZA[730]) + ZA[734];
+    ZA[921] = ZA[911] + ZA[909] + ZA[733];
+    ZA[916] = ZA[913] + ZA[915];
+    
+    ZA[922] = (ZCh(ZA[920], ZA[918], ZA[914]) + ZA[921]) + ZR26(ZA[920]);
+    ZA[923] = ZA[916] + ZA[914] + ZA[735];
+    
+    ZA[924] = (ZCh(ZA[922], ZA[920], ZA[918]) + ZA[923]) + ZR26(ZA[922]);
+    
+#define FOUND (0x80)
+#define NFLAG (0x7F)
+
+#if defined(VECTORS4)
+	bool result = any(ZA[924] == 0x136032EDU);
+
+	if (result) {
+		if (ZA[924].x == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.x] =  Znonce.x;
+		if (ZA[924].y == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.y] =  Znonce.y;
+		if (ZA[924].z == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.z] =  Znonce.z;
+		if (ZA[924].w == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.w] =  Znonce.w;
+	}
+#elif defined(VECTORS2)
+	bool result = any(ZA[924] == 0x136032EDU);
+
+	if (result) {
+		if (ZA[924].x == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.x] =  Znonce.x;
+		if (ZA[924].y == 0x136032EDU)
+			output[FOUND] = output[NFLAG & Znonce.y] =  Znonce.y;
+	}
+#else
+	if (ZA[924] == 0x136032EDU)
+		output[FOUND] = output[NFLAG & Znonce] =  Znonce;
+#endif
+}

+ 581 - 0
diakgcn120223.cl

@@ -0,0 +1,581 @@
+// DiaKGCN 16-03-2012 - OpenCL kernel by Diapolo
+//
+// Parts and / or ideas for this kernel are based upon the public-domain poclbm project, the phatk kernel by Phateus and the DiabloMiner kernel by DiabloD3.
+// The kernel was rewritten by me (Diapolo) and is still public-domain!
+
+// Working type `u`: chosen at build time by the VECTORS4 / VECTORS2 defines.
+// uint4 and uint2 hold four and two 32-bit unsigned lanes respectively; with
+// neither define set the kernel works on a single scalar uint.
+#if defined(VECTORS4)
+	typedef uint4 u;
+#elif defined(VECTORS2)
+	typedef uint2 u;
+#else
+	typedef uint u;
+#endif
+
+// SHA-256 bitwise selection functions, with three interchangeable definitions:
+//   ch(x, y, z): per bit, y where x is 1, otherwise z      (Ch in FIPS 180-4)
+//   ma(x, y, z): per bit, the majority value of x, y and z (Maj in FIPS 180-4)
+// Every macro parameter is parenthesized so that a compound argument
+// expression cannot be regrouped by operator precedence after expansion, and
+// all variants use the same (x, y, z) parameter order.
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	#ifdef BFI_INT
+		// NOTE(review): amd_bytealign does not compute Ch/Maj by itself; these
+		// definitions presumably rely on the host rewriting the emitted
+		// instruction to BFI_INT after compilation - confirm in the host-side
+		// kernel loader before changing them.
+		#define ch(x, y, z) amd_bytealign((x), (y), (z))
+		#define ma(x, y, z) amd_bytealign((z) ^ (x), (y), (x))
+	#else
+		// bitselect(a, b, c) takes each bit from b where c is 1, else from a.
+		#define ch(x, y, z) bitselect((z), (y), (x))
+		// Where x and y agree the majority is x; where they differ it is z.
+		#define ma(x, y, z) bitselect((x), (z), (x) ^ (y))
+	#endif
+#else
+	// Portable forms using only bitwise operators.
+	#define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z))))
+	#define ma(x, y, z) (((x) & (z)) | ((y) & ((x) | (z))))
+#endif
+
+// SHA-256 sigma functions. OpenCL rotate() rotates LEFT, so the number in each
+// macro name is the left-rotate amount of its first term, not a right-rotate
+// count (left by k equals right by 32 - k on 32-bit lanes):
+//   rotr15(n) = ROTR17 ^ ROTR19 ^ SHR10   (small sigma1, message schedule)
+//   rotr25(n) = ROTR7  ^ ROTR18 ^ SHR3    (small sigma0, message schedule)
+//   rotr26(n) = ROTR6  ^ ROTR11 ^ ROTR25  (big Sigma1, round function)
+//   rotr30(n) = ROTR2  ^ ROTR13 ^ ROTR22  (big Sigma0, round function)
+// `n` is parenthesized everywhere: callers pass compound expressions such as
+// `state1 + V[1]`, and the shift term must not depend on the argument's
+// operators binding tighter than `>>`.
+#define rotr15(n) (rotate((n), 15U) ^ rotate((n), 13U) ^ ((n) >> 10U))
+#define rotr25(n) (rotate((n), 25U) ^ rotate((n), 14U) ^ ((n) >> 3U))
+#define rotr26(n) (rotate((n), 26U) ^ rotate((n), 21U) ^ rotate((n), 7U))
+#define rotr30(n) (rotate((n), 30U) ^ rotate((n), 19U) ^ rotate((n), 10U))
+
+__kernel
+	__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+	void search(	
+			const u base,
+			const uint PreVal0, const uint PreVal4,
+			const uint H1, const uint D1A, const uint B1, const uint C1,
+			const uint F1, const uint G1, const uint C1addK5, const uint B1addK6, const uint PreVal0addK7,
+			const uint W16addK16, const uint W17addK17,
+			const uint PreW18, const uint PreW19,
+			const uint W16, const uint W17,
+			const uint PreW31, const uint PreW32,
+			const uint state0, const uint state1, const uint state2, const uint state3,
+			const uint state4, const uint state5, const uint state6, const uint state7,
+			const uint state0A, const uint state0B,
+			const uint state1A, const uint state2A, const uint state3A, const uint state4A,
+			const uint state5A, const uint state6A, const uint state7A,
+			__global uint * output)
+{
+	u V[8];
+	u W[16];
+
+#ifdef VECTORS4
+	const u nonce = (uint)(get_local_id(0)) * 4U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base;
+#elif defined VECTORS2
+	const u nonce = (uint)(get_local_id(0)) * 2U + (uint)(get_group_id(0)) * (uint)(WORKVEC) + base;
+#else
+	const u nonce = (uint)(get_local_id(0)) + (uint)(get_group_id(0)) * (uint)(WORKSIZE) + base;
+#endif
+
+	V[0] = PreVal0 + nonce;
+	V[1] = B1;
+	V[2] = C1;
+	V[3] = D1A;
+	V[4] = PreVal4 + nonce;
+	V[5] = F1;
+	V[6] = G1;
+	V[7] = H1;
+
+	V[7] += V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  C1addK5 + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  B1addK6 + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  PreVal0addK7 + nonce + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xd807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0xc19bf3f4U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  W16addK16 + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  W17addK17 + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+//----------------------------------------------------------------------------------
+
+#ifdef VECTORS4
+	 W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U, rotr25(nonce.x) ^ 0x4008000U, rotr25(nonce.x) ^ 0x600c000U);
+#elif defined VECTORS2
+	 W[0] = PreW18 + (u)(rotr25(nonce.x), rotr25(nonce.x) ^ 0x2004000U);
+#else
+	 W[0] = PreW18 + rotr25(nonce);
+#endif
+	 W[1] = PreW19 + nonce;
+	 W[2] = 0x80000000U + rotr15(W[0]);
+	 W[3] = rotr15(W[1]);
+	 W[4] = 0x00000280U + rotr15(W[2]);
+	 W[5] = W16 + rotr15(W[3]);
+	 W[6] = W17 + rotr15(W[4]);
+	 W[7] = W[0] + rotr15(W[5]);
+	 W[8] = W[1] + rotr15(W[6]);
+	 W[9] = W[2] + rotr15(W[7]);
+	W[10] = W[3] + rotr15(W[8]);
+	W[11] = W[4] + rotr15(W[9]);
+	W[12] = W[5] + 0x00a00055U + rotr15(W[10]);
+	W[13] = W[6] + PreW31 + rotr15(W[11]);
+	W[14] = W[7] + PreW32 + rotr15(W[12]);
+	W[15] = W[8] + W17 + rotr15(W[13]) + rotr25(W[0]);
+
+	V[1] += 0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0];
+	V[5] =  0x0fc19dc6U + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + W[0] + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x240ca1ccU + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x2de92c6fU + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4a7484aaU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5cb0a9dcU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x76f988daU + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x983e5152U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa831c66dU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xb00327c8U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xbf597fc7U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xc6e00bf3U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd5a79147U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x06ca6351U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x14292967U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x27b70a85U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x2e1b2138U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+	W[13] = W[13] +  W[6] + rotr15(W[11]) + rotr25(W[14]);
+	W[14] = W[14] +  W[7] + rotr15(W[12]) + rotr25(W[15]);
+	W[15] = W[15] +  W[8] + rotr15(W[13]) + rotr25( W[0]);
+
+	V[1] += 0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x4d2c6dfcU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x53380d13U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x650a7354U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x766a0abbU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x81c2c92eU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x92722c85U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xa2bfe8a1U + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa81a664bU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xc24b8b70U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xc76c51a3U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xd192e819U + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd6990624U + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0xf40e3585U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x106aa070U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x19a4c116U + V[7] + W[14] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x1e376c08U + V[6] + W[15] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+	W[13] = W[13] +  W[6] + rotr15(W[11]) + rotr25(W[14]);
+
+	V[1] += 0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x2748774cU + V[5] + W[0] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x34b0bcb5U + V[4] + W[1] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x391c0cb3U + V[3] + W[2] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4ed8aa4aU + V[2] + W[3] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5b9cca4fU + V[1] + W[4] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x682e6ff3U + V[0] + W[5] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x748f82eeU + V[7] + W[6] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x78a5636fU + V[6] + W[7] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x84c87814U + V[5] + W[8] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x8cc70208U + V[4] + W[9] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x90befffaU + V[3] + W[10] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xa4506cebU + V[2] + W[11] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0xbef9a3f7U + V[1] + W[12] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0xc67178f2U + V[0] + W[13] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] = state0 + V[0] + rotr25(state1 + V[1]);
+	 W[1] = state1 + V[1] + 0x00a00000U + rotr25(state2 + V[2]);
+	 W[2] = state2 + V[2] + rotr15(W[0]) + rotr25(state3 + V[3]);
+	 W[3] = state3 + V[3] + rotr15(W[1]) + rotr25(state4 + V[4]);
+	 W[4] = state4 + V[4] + rotr15(W[2]) + rotr25(state5 + V[5]);
+	 W[5] = state5 + V[5] + rotr15(W[3]) + rotr25(state6 + V[6]);
+	 W[6] = state6 + V[6] + 0x00000100U + rotr15(W[4]) + rotr25(state7 + V[7]);	
+	 W[7] = state7 + V[7] + W[0] + 0x11002000U + rotr15(W[5]);
+	 W[8] = W[1] + 0x80000000U + rotr15(W[6]);	
+	 W[9] = W[2] + rotr15(W[7]);
+	W[10] = W[3] + rotr15(W[8]);
+	W[11] = W[4] + rotr15(W[9]);
+	W[12] = W[5] + rotr15(W[10]);
+	W[13] = W[6] + rotr15(W[11]);
+	W[14] = W[7] + 0x00400022U + rotr15(W[12]);
+	W[15] = W[8] + 0x00000100U + rotr15(W[13]) + rotr25(W[0]);
+
+	// 0x71374491U + 0x1f83d9abU + state1
+	const u state1AaddV1 = state1A + V[1];
+	// 0xb5c0fbcfU + 0x9b05688cU + state2
+	const u state2AaddV2 = state2A + V[2];
+	// 0x510e527fU + 0xe9b5dba5U + state3
+	const u state3AaddV3 = state3A + V[3];
+	// 0x3956c25bU + state4
+	const u state4AaddV4 = state4A + V[4];
+	// 0x59f111f1U + state5
+	const u state5AaddV5 = state5A + V[5];
+	// 0x923f82a4U + state6
+	const u state6AaddV6 = state6A + V[6];
+	// 0xab1c5ed5U + state7
+	const u state7AaddV7 = state7A + V[7];
+
+	// 0x98c7e2a2U + state0	
+	V[3] = state0A + V[0];
+	// 0xfc08884dU + state0
+	V[7] = state0B + V[0];
+	V[0] = 0x6a09e667U;
+	V[1] = 0xbb67ae85U;
+	V[2] = 0x3c6ef372U;
+	V[4] = 0x510e527fU;
+	V[5] = 0x9b05688cU;
+	V[6] = 0x1f83d9abU;
+
+	V[2] += state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  state1AaddV1 + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  state2AaddV2 + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  state3AaddV3 + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  state4AaddV4 + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  state5AaddV5 + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  state6AaddV6 + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  state7AaddV7 + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x5807aa98U + V[7] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x12835b01U + V[6] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x243185beU + V[5] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x550c7dc3U + V[4] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x72be5d74U + V[3] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x80deb1feU + V[2] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x9bdc06a7U + V[1] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0xc19bf274U + V[0] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xe49b69c1U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xefbe4786U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x0fc19dc6U + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x240ca1ccU + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x2de92c6fU + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4a7484aaU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5cb0a9dcU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x76f988daU + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x983e5152U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa831c66dU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xb00327c8U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xbf597fc7U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xc6e00bf3U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd5a79147U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x06ca6351U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x14292967U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+	W[13] = W[13] +  W[6] + rotr15(W[11]) + rotr25(W[14]);
+	W[14] = W[14] +  W[7] + rotr15(W[12]) + rotr25(W[15]);
+	W[15] = W[15] +  W[8] + rotr15(W[13]) + rotr25( W[0]);
+
+	V[3] += 0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x27b70a85U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x2e1b2138U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x4d2c6dfcU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x53380d13U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x650a7354U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x766a0abbU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x81c2c92eU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x92722c85U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0xa2bfe8a1U + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0xa81a664bU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0xc24b8b70U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0xc76c51a3U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0xd192e819U + V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0xd6990624U + V[2] + W[13] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0xf40e3585U + V[1] + W[14] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x106aa070U + V[0] + W[15] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+//----------------------------------------------------------------------------------
+
+	 W[0] =  W[0] +  W[9] + rotr15(W[14]) + rotr25( W[1]);
+	 W[1] =  W[1] + W[10] + rotr15(W[15]) + rotr25( W[2]);
+	 W[2] =  W[2] + W[11] + rotr15( W[0]) + rotr25( W[3]);
+	 W[3] =  W[3] + W[12] + rotr15( W[1]) + rotr25( W[4]);
+	 W[4] =  W[4] + W[13] + rotr15( W[2]) + rotr25( W[5]);
+	 W[5] =  W[5] + W[14] + rotr15( W[3]) + rotr25( W[6]);
+	 W[6] =  W[6] + W[15] + rotr15( W[4]) + rotr25( W[7]);
+	 W[7] =  W[7] +  W[0] + rotr15( W[5]) + rotr25( W[8]);
+	 W[8] =  W[8] +  W[1] + rotr15( W[6]) + rotr25( W[9]);
+	 W[9] =  W[9] +  W[2] + rotr15( W[7]) + rotr25(W[10]);
+	W[10] = W[10] +  W[3] + rotr15( W[8]) + rotr25(W[11]);
+	W[11] = W[11] +  W[4] + rotr15( W[9]) + rotr25(W[12]);
+	W[12] = W[12] +  W[5] + rotr15(W[10]) + rotr25(W[13]);
+
+	V[3] += 0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x19a4c116U + V[7] + W[0] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+	V[6] =  0x1e376c08U + V[6] + W[1] + ch(V[3], V[4], V[5]) + rotr26(V[3]) + rotr30(V[7]) + ma(V[0], V[1], V[7]);
+
+	V[1] += 0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+	V[5] =  0x2748774cU + V[5] + W[2] + ch(V[2], V[3], V[4]) + rotr26(V[2]) + rotr30(V[6]) + ma(V[7], V[0], V[6]);
+
+	V[0] += 0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+	V[4] =  0x34b0bcb5U + V[4] + W[3] + ch(V[1], V[2], V[3]) + rotr26(V[1]) + rotr30(V[5]) + ma(V[6], V[7], V[5]);
+
+	V[7] += 0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+	V[3] =  0x391c0cb3U + V[3] + W[4] + ch(V[0], V[1], V[2]) + rotr26(V[0]) + rotr30(V[4]) + ma(V[5], V[6], V[4]);
+
+	V[6] += 0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]);
+	V[2] =  0x4ed8aa4aU + V[2] + W[5] + ch(V[7], V[0], V[1]) + rotr26(V[7]) + rotr30(V[3]) + ma(V[4], V[5], V[3]);
+
+	V[5] += 0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]);
+	V[1] =  0x5b9cca4fU + V[1] + W[6] + ch(V[6], V[7], V[0]) + rotr26(V[6]) + rotr30(V[2]) + ma(V[3], V[4], V[2]);
+
+	V[4] += 0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]);
+	V[0] =  0x682e6ff3U + V[0] + W[7] + ch(V[5], V[6], V[7]) + rotr26(V[5]) + rotr30(V[1]) + ma(V[2], V[3], V[1]);
+
+	V[3] += 0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]);
+	V[7] =  0x748f82eeU + V[7] + W[8] + ch(V[4], V[5], V[6]) + rotr26(V[4]) + rotr30(V[0]) + ma(V[1], V[2], V[0]);
+
+	V[2] += 0x78a5636fU + V[6] + W[9] + ch(V[3], V[4], V[5]) + rotr26(V[3]);
+
+	V[1] += 0x84c87814U + V[5] + W[10] + ch(V[2], V[3], V[4]) + rotr26(V[2]);
+
+	V[0] += 0x8cc70208U + V[4] + W[11] + ch(V[1], V[2], V[3]) + rotr26(V[1]);
+
+	V[7] += V[3] + W[12] + ch(V[0], V[1], V[2]) + rotr26(V[0]);
+
+#define FOUND (0x80)
+#define NFLAG (0x7F)
+
+#ifdef VECTORS4
+	if ((V[7].x == 0x136032edU) ^ (V[7].y == 0x136032edU) ^ (V[7].z == 0x136032edU) ^ (V[7].w == 0x136032edU))
+		output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : ((V[7].y == 0x136032edU) ? nonce.y : ((V[7].z == 0x136032edU) ? nonce.z : nonce.w));
+#elif defined VECTORS2
+	if ((V[7].x == 0x136032edU) + (V[7].y == 0x136032edU))
+		output[FOUND] = output[NFLAG & nonce.x] = (V[7].x == 0x136032edU) ? nonce.x : nonce.y;
+#else
+	if (V[7] == 0x136032edU)
+		output[FOUND] = output[NFLAG & nonce] = nonce;
+#endif
+}

+ 164 - 58
bitforce.c → driver-bitforce.c

@@ -1,15 +1,17 @@
 /*
  * Copyright 2012 Luke Dashjr
+ * Copyright 2012 Con Kolivas
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
+ * Software Foundation; either version 3 of the License, or (at your option)
  * any later version.  See COPYING for more details.
  */
 
 #include <limits.h>
 #include <pthread.h>
 #include <stdio.h>
+#include <strings.h>
 #include <sys/time.h>
 #include <sys/types.h>
 #include <dirent.h>
@@ -17,36 +19,56 @@
 #include <termios.h>
 #include <sys/stat.h>
 #include <fcntl.h>
+#ifndef O_CLOEXEC
+#define O_CLOEXEC 0
+#endif
 #else
 #include <windows.h>
 #include <io.h>
 #endif
 #include <unistd.h>
 
+#include "config.h"
+
+#ifdef HAVE_LIBUDEV
+#include <libudev.h>
+#endif
+
 #include "elist.h"
 #include "miner.h"
 
 
 struct device_api bitforce_api;
 
-#ifdef WIN32
-
 static int BFopen(const char *devpath)
 {
+#ifdef WIN32
 	HANDLE hSerial = CreateFile(devpath, GENERIC_READ | GENERIC_WRITE, 0, NULL, OPEN_EXISTING, 0, NULL);
 	if (unlikely(hSerial == INVALID_HANDLE_VALUE))
 		return -1;
+	
+	COMMTIMEOUTS cto = {30000, 0, 30000, 0, 30000};
+	SetCommTimeouts(hSerial, &cto);
+	
 	return _open_osfhandle((LONG)hSerial, 0);
-}
-
 #else
-
-static int BFopen(const char *devpath)
-{
-	return open(devpath, O_CLOEXEC | O_NOCTTY);
-}
-
+	int fdDev = open(devpath, O_RDWR | O_CLOEXEC | O_NOCTTY);
+	if (likely(fdDev != -1)) {
+		struct termios pattr;
+		
+		tcgetattr(fdDev, &pattr);
+		pattr.c_iflag &= ~(IGNBRK | BRKINT | PARMRK | ISTRIP | INLCR | IGNCR | ICRNL | IXON);
+		pattr.c_oflag &= ~OPOST;
+		pattr.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN);
+		pattr.c_cflag &= ~(CSIZE | PARENB);
+		pattr.c_cflag |= CS8;
+		tcsetattr(fdDev, TCSANOW, &pattr);
+	}
+	tcflush(fdDev, TCOFLUSH);
+	tcflush(fdDev, TCIFLUSH);
+	return fdDev;
 #endif
+}
 
 static void BFgets(char *buf, size_t bufLen, int fd)
 {
@@ -57,34 +79,37 @@ static void BFgets(char *buf, size_t bufLen, int fd)
 	buf[0] = '\0';
 }
 
-#define BFwrite(fd, buf, bufLen) write(fd, buf, bufLen)
-#define BFclose(fd) close(fd)
+/*
+ * Write bufLen bytes from buf to fd in a single write() call.
+ * Anything other than a complete write ends the program via quit().
+ */
+static void BFwrite(int fd, const void *buf, ssize_t bufLen)
+{
+	if (likely(write(fd, buf, bufLen) == bufLen))
+		return;
 
+	quit(1, "BFwrite failed");
+}
+
+#define BFclose(fd) close(fd)
 
 static bool bitforce_detect_one(const char *devpath)
 {
+	char *s;
 	char pdevbuf[0x100];
-	int i = 0;
 
 	if (total_devices == MAX_DEVICES)
 		return false;
 
 	int fdDev = BFopen(devpath);
-	if (unlikely(fdDev == -1))
-	{
+	if (unlikely(fdDev == -1)) {
 		applog(LOG_DEBUG, "BitForce Detect: Failed to open %s", devpath);
 		return false;
 	}
 	BFwrite(fdDev, "ZGX", 3);
 	BFgets(pdevbuf, sizeof(pdevbuf), fdDev);
-	if (unlikely(!pdevbuf[0]))
-	{
+	if (unlikely(!pdevbuf[0])) {
 		applog(LOG_ERR, "Error reading from BitForce (ZGX)");
 		return 0;
 	}
 	BFclose(fdDev);
-	if (unlikely(!strstr(pdevbuf, "SHA256")))
-	{
+	if (unlikely(!strstr(pdevbuf, "SHA256"))) {
 		applog(LOG_DEBUG, "BitForce Detect: Didn't recognise BitForce on %s", devpath);
 		return false;
 	}
@@ -92,17 +117,56 @@ static bool bitforce_detect_one(const char *devpath)
 	// We have a real BitForce!
 	struct cgpu_info *bitforce;
 	bitforce = calloc(1, sizeof(*bitforce));
-	devices[total_devices++] = bitforce;
 	bitforce->api = &bitforce_api;
-	bitforce->device_id = i++;
 	bitforce->device_path = strdup(devpath);
-	bitforce->enabled = true;
+	bitforce->deven = DEV_ENABLED;
 	bitforce->threads = 1;
+	if (likely((!memcmp(pdevbuf, ">>>ID: ", 7)) && (s = strstr(pdevbuf + 3, ">>>"))))
+	{
+		s[0] = '\0';
+		bitforce->name = strdup(pdevbuf + 7);
+	}
 
-	return true;
+	return add_cgpu(bitforce);
 }
 
-static void bitforce_detect_auto()
+/*
+ * Probe for BitForce devices through libudev.
+ *
+ * Enumerates devices in the "tty" subsystem whose ID_MODEL property matches
+ * "BitFORCE*SHA256" and hands each device node path to bitforce_detect_one().
+ *
+ * Returns true when at least one matching device node was seen (the result
+ * of bitforce_detect_one() itself is not taken into account), false
+ * otherwise. Always returns false when built without HAVE_LIBUDEV.
+ */
+static bool bitforce_detect_auto_udev()
+{
+#ifdef HAVE_LIBUDEV
+	struct udev *udev = udev_new();
+	struct udev_enumerate *enumerate = udev_enumerate_new(udev);
+	struct udev_list_entry *list_entry;
+	bool foundany = false;
+	
+	/* Restrict the scan to serial ttys advertising the BitForce model string */
+	udev_enumerate_add_match_subsystem(enumerate, "tty");
+	udev_enumerate_add_match_property(enumerate, "ID_MODEL", "BitFORCE*SHA256");
+	udev_enumerate_scan_devices(enumerate);
+	udev_list_entry_foreach(list_entry, udev_enumerate_get_list_entry(enumerate)) {
+		/* Each list entry name is used as the syspath to open the device */
+		struct udev_device *device = udev_device_new_from_syspath(
+			udev_enumerate_get_udev(enumerate),
+			udev_list_entry_get_name(list_entry)
+		);
+		if (!device)
+			continue;
+		
+		/* Entries without a device node are skipped and not counted */
+		const char *devpath = udev_device_get_devnode(device);
+		if (devpath) {
+			foundany = true;
+			bitforce_detect_one(devpath);
+		}
+		
+		udev_device_unref(device);
+	}
+	/* Drop the enumeration and the udev context taken at the top */
+	udev_enumerate_unref(enumerate);
+	udev_unref(udev);
+	
+	return foundany;
+#else
+	return false;
+#endif
+}
+
+static bool bitforce_detect_auto_devserial()
 {
 #ifndef WIN32
 	DIR *D;
@@ -110,32 +174,66 @@ static void bitforce_detect_auto()
 	const char udevdir[] = "/dev/serial/by-id";
 	char devpath[sizeof(udevdir) + 1 + NAME_MAX];
 	char *devfile = devpath + sizeof(udevdir);
-
+	bool foundany = false;
+	
 	D = opendir(udevdir);
 	if (!D)
-		return;
+		return false;
 	memcpy(devpath, udevdir, sizeof(udevdir) - 1);
 	devpath[sizeof(udevdir) - 1] = '/';
 	while ( (de = readdir(D)) ) {
 		if (!strstr(de->d_name, "BitFORCE_SHA256"))
 			continue;
+		foundany = true;
 		strcpy(devfile, de->d_name);
 		bitforce_detect_one(devpath);
 	}
 	closedir(D);
+	
+	return foundany;
+#else
+	return false;
 #endif
 }
 
+/*
+ * Auto-detect BitForce devices.
+ *
+ * The udev probe runs first; the devserial directory probe is only tried
+ * when the udev probe reports that it found nothing.
+ */
+static void bitforce_detect_auto()
+{
+	/*
+	 * Plain short-circuit in standard C. This replaces an expression
+	 * statement built on the GNU "a ?: b" omitted-operand extension, which
+	 * is not portable and whose value was discarded anyway.
+	 */
+	if (bitforce_detect_auto_udev())
+		return;
+	bitforce_detect_auto_devserial();
+}
+
 static void bitforce_detect()
 {
 	struct string_elist *iter, *tmp;
+	const char*s;
+	bool found = false;
+	bool autoscan = false;
 
 	list_for_each_entry_safe(iter, tmp, &scan_devices, list) {
-		if (bitforce_detect_one(iter->string))
+		s = iter->string;
+		if (!strncmp("bitforce:", iter->string, 9))
+			s += 9;
+		if (!strcmp(s, "auto"))
+			autoscan = true;
+		else if (bitforce_detect_one(s)) {
 			string_elist_del(iter);
+			found = true;
+		}
 	}
 
-	bitforce_detect_auto();
+	if (autoscan || !found)
+		bitforce_detect_auto();
+}
+
+/*
+ * Append the leading status-line columns for a BitForce device to buf.
+ *
+ * Emits the last temperature stored in bitforce->temp as "%5.1fC " when it
+ * is positive, or seven blanks of the same width otherwise, followed by a
+ * blank-padded column separator.
+ */
+static void get_bitforce_statline_before(char *buf, struct cgpu_info *bitforce)
+{
+	float gt = bitforce->temp;
+	if (gt > 0)
+		tailsprintf(buf, "%5.1fC ", gt);
+	else
+		/* Padding only: no conversion in the format, so no argument is passed */
+		tailsprintf(buf, "       ");
+	tailsprintf(buf, "        | ");
 }
 
 static bool bitforce_thread_prepare(struct thr_info *thr)
@@ -145,25 +243,11 @@ static bool bitforce_thread_prepare(struct thr_info *thr)
 	struct timeval now;
 
 	int fdDev = BFopen(bitforce->device_path);
-	if (unlikely(-1 == fdDev))
-	{
+	if (unlikely(-1 == fdDev)) {
 		applog(LOG_ERR, "Failed to open BitForce on %s", bitforce->device_path);
 		return false;
 	}
 
-#ifndef WIN32
-	{
-		struct termios pattr;
-
-		tcgetattr(fdDev, &pattr);
-		pattr.c_iflag &= ~(IGNBRK | BRKINT | PARMRK | ISTRIP | INLCR | IGNCR | ICRNL | IXON);
-		pattr.c_oflag &= ~OPOST;
-		pattr.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN);
-		pattr.c_cflag &= ~(CSIZE | PARENB);
-		pattr.c_cflag |= CS8;
-		tcsetattr(fdDev, TCSANOW, &pattr);
-	}
-#endif
 	bitforce->device_fd = fdDev;
 
 	applog(LOG_INFO, "Opened BitForce on %s", bitforce->device_path);
@@ -173,7 +257,7 @@ static bool bitforce_thread_prepare(struct thr_info *thr)
 	return true;
 }
 
-static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint64_t max_nonce)
+static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint64_t __maybe_unused max_nonce)
 {
 	struct cgpu_info *bitforce = thr->cgpu;
 	int fdDev = bitforce->device_fd;
@@ -182,6 +266,7 @@ static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint6
 	unsigned char ob[61] = ">>>>>>>>12345678901234567890123456789012123456789012>>>>>>>>";
 	int i;
 	char *pnoncebuf;
+	char *s;
 	uint32_t nonce;
 
 	BFwrite(fdDev, "ZDX", 3);
@@ -190,8 +275,7 @@ static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint6
 		applog(LOG_ERR, "Error reading from BitForce (ZDX)");
 		return 0;
 	}
-	if (unlikely(pdevbuf[0] != 'O' || pdevbuf[1] != 'K'))
-	{
+	if (unlikely(pdevbuf[0] != 'O' || pdevbuf[1] != 'K')) {
 		applog(LOG_ERR, "BitForce ZDX reports: %s", pdevbuf);
 		return 0;
 	}
@@ -199,27 +283,49 @@ static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint6
 	memcpy(ob + 8, work->midstate, 32);
 	memcpy(ob + 8 + 32, work->data + 64, 12);
 	BFwrite(fdDev, ob, 60);
-	applog(LOG_DEBUG, "BitForce block data: %s", bin2hex(ob + 8, 44));
+	if (opt_debug) {
+		s = bin2hex(ob + 8, 44);
+		applog(LOG_DEBUG, "BitForce block data: %s", s);
+		free(s);
+	}
 
 	BFgets(pdevbuf, sizeof(pdevbuf), fdDev);
-	if (unlikely(!pdevbuf[0]))
-	{
+	if (unlikely(!pdevbuf[0])) {
 		applog(LOG_ERR, "Error reading from BitForce (block data)");
 		return 0;
 	}
-	if (unlikely(pdevbuf[0] != 'O' || pdevbuf[1] != 'K'))
-	{
+	if (unlikely(pdevbuf[0] != 'O' || pdevbuf[1] != 'K')) {
 		applog(LOG_ERR, "BitForce block data reports: %s", pdevbuf);
 		return 0;
 	}
 
+	BFwrite(fdDev, "ZLX", 3);
+	BFgets(pdevbuf, sizeof(pdevbuf), fdDev);
+	if (unlikely(!pdevbuf[0])) {
+		applog(LOG_ERR, "Error reading from BitForce (ZKX)");
+		return 0;
+	}
+	if ((!strncasecmp(pdevbuf, "TEMP", 4)) && (s = strchr(pdevbuf + 4, ':'))) {
+		float temp = strtof(s + 1, NULL);
+		if (temp > 0) {
+			bitforce->temp = temp;
+			if (temp > bitforce->cutofftemp) {
+				applog(LOG_WARNING, "Hit thermal cutoff limit on %s %d, disabling!", bitforce->api->name, bitforce->device_id);
+				bitforce->deven = DEV_RECOVER;
+
+				bitforce->device_last_not_well = time(NULL);
+				bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
+				bitforce->dev_thermal_cutoff_count++;
+			}
+		}
+	}
+
 	usleep(4500000);
 	i = 4500;
 	while (1) {
 		BFwrite(fdDev, "ZFX", 3);
 		BFgets(pdevbuf, sizeof(pdevbuf), fdDev);
-		if (unlikely(!pdevbuf[0]))
-		{
+		if (unlikely(!pdevbuf[0])) {
 			applog(LOG_ERR, "Error reading from BitForce (ZFX)");
 			return 0;
 		}
@@ -232,8 +338,7 @@ static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint6
 	work->blk.nonce = 0xffffffff;
 	if (pdevbuf[2] == '-')
 		return 0xffffffff;
-	else
-	if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) {
+	else if (strncasecmp(pdevbuf, "NONCE-FOUND", 11)) {
 		applog(LOG_ERR, "BitForce result reports: %s", pdevbuf);
 		return 0;
 	}
@@ -256,9 +361,10 @@ static uint64_t bitforce_scanhash(struct thr_info *thr, struct work *work, uint6
 }
 
 struct device_api bitforce_api = {
-	.name = "BFL",
+	.dname = "bitforce",
+	.name = "PGA",
 	.api_detect = bitforce_detect,
-	// .reinit_device = TODO
+	.get_statline_before = get_bitforce_statline_before,
 	.thread_prepare = bitforce_thread_prepare,
 	.scanhash = bitforce_scanhash,
 };

+ 842 - 0
driver-cpu.c

@@ -0,0 +1,842 @@
+/*
+ * Copyright 2011-2012 Con Kolivas
+ * Copyright 2011-2012 Luke Dashjr
+ * Copyright 2010 Jeff Garzik
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
+ */
+
+#include "config.h"
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <signal.h>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#ifndef WIN32
+#include <sys/wait.h>
+#include <sys/resource.h>
+#endif
+#include <libgen.h>
+
+#include "compat.h"
+#include "miner.h"
+#include "bench_block.h"
+#include "driver-cpu.h"
+
+#if defined(unix)
+	#include <errno.h>
+	#include <fcntl.h>
+#endif
+
+#if defined(__linux) && defined(cpu_set_t) /* Linux specific policy and affinity management */
+#include <sched.h>
+/*
+ * Lower the scheduling policy of the caller: try SCHED_IDLE when the
+ * headers define it and fall back to SCHED_BATCH if that call fails.
+ * Does nothing when SCHED_BATCH is not defined. Failures of the final
+ * call are ignored.
+ */
+static inline void drop_policy(void)
+{
+	struct sched_param param;
+
+	/*
+	 * SCHED_IDLE and SCHED_BATCH only accept a static priority of 0;
+	 * leaving the field uninitialised can make the calls fail with EINVAL.
+	 */
+	param.sched_priority = 0;
+
+#ifdef SCHED_BATCH
+#ifdef SCHED_IDLE
+	if (unlikely(sched_setscheduler(0, SCHED_IDLE, &param) == -1))
+#endif
+		sched_setscheduler(0, SCHED_BATCH, &param);
+#endif
+}
+
+/*
+ * Restrict the caller's CPU affinity to a single CPU.
+ *
+ * id  - mining thread number, used only in the log message
+ * cpu - index of the CPU placed in the affinity mask
+ *
+ * The return value of sched_setaffinity() is not checked.
+ */
+static inline void affine_to_cpu(int id, int cpu)
+{
+	cpu_set_t set;
+
+	CPU_ZERO(&set);
+	CPU_SET(cpu, &set);
+	/* The size argument must describe the mask itself, not a pointer to it */
+	sched_setaffinity(0, sizeof(set), &set);
+	applog(LOG_INFO, "Binding cpu mining thread %d to cpu %d", id, cpu);
+}
+#else
+/*
+ * No-op fallback used when the Linux scheduler branch of the enclosing
+ * #if is not compiled.
+ * NOTE(review): that guard tests defined(cpu_set_t); if cpu_set_t is only a
+ * typedef (not a macro) on the target libc, this stub is what gets built
+ * even on Linux - confirm this is intended.
+ */
+static inline void drop_policy(void)
+{
+}
+
+/*
+ * No-op fallback used when the Linux affinity branch of the enclosing #if
+ * is not compiled; both arguments are ignored.
+ */
+static inline void affine_to_cpu(int id, int cpu)
+{
+}
+#endif
+
+
+
+/* TODO: resolve externals */
+extern bool submit_work_sync(struct thr_info *thr, const struct work *work_in);
+extern char *set_int_range(const char *arg, int *i, int min, int max);
+extern int dev_from_id(int thr_id);
+
+
+/* chipset-optimized hash functions */
+extern bool ScanHash_4WaySSE2(int, const unsigned char *pmidstate,
+	unsigned char *pdata, unsigned char *phash1, unsigned char *phash,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
+
+extern bool ScanHash_altivec_4way(int thr_id, const unsigned char *pmidstate,
+	unsigned char *pdata,
+	unsigned char *phash1, unsigned char *phash,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
+
+extern bool scanhash_via(int, const unsigned char *pmidstate,
+	unsigned char *pdata,
+	unsigned char *phash1, unsigned char *phash,
+	const unsigned char *target,
+	uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
+
+extern bool scanhash_c(int, const unsigned char *midstate, unsigned char *data,
+	      unsigned char *hash1, unsigned char *hash,
+	      const unsigned char *target,
+	      uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
+
+extern bool scanhash_cryptopp(int, const unsigned char *midstate,unsigned char *data,
+	      unsigned char *hash1, unsigned char *hash,
+	      const unsigned char *target,
+	      uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
+
+extern bool scanhash_asm32(int, const unsigned char *midstate,unsigned char *data,
+	      unsigned char *hash1, unsigned char *hash,
+	      const unsigned char *target,
+	      uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
+
+extern bool scanhash_sse2_64(int, const unsigned char *pmidstate, unsigned char *pdata,
+	unsigned char *phash1, unsigned char *phash,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, uint32_t *last_nonce,
+	uint32_t nonce);
+
+extern bool scanhash_sse4_64(int, const unsigned char *pmidstate, unsigned char *pdata,
+	unsigned char *phash1, unsigned char *phash,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, uint32_t *last_nonce,
+	uint32_t nonce);
+
+extern bool scanhash_sse2_32(int, const unsigned char *pmidstate, unsigned char *pdata,
+	unsigned char *phash1, unsigned char *phash,
+	const unsigned char *ptarget,
+	uint32_t max_nonce, uint32_t *last_nonce,
+	uint32_t nonce);
+
+
+
+
+#ifdef WANT_CPUMINE
+static size_t max_name_len = 0;
+static char *name_spaces_pad = NULL;
+/*
+ * Names of the CPU hashing algorithms, indexed by their ALGO_* value.
+ * Entries for optional implementations exist only when the matching WANT_*
+ * macro is defined; any index without an initialiser is left NULL by the
+ * designated-initialiser rules.
+ */
+const char *algo_names[] = {
+	[ALGO_C]		= "c",
+#ifdef WANT_SSE2_4WAY
+	[ALGO_4WAY]		= "4way",
+#endif
+#ifdef WANT_VIA_PADLOCK
+	[ALGO_VIA]		= "via",
+#endif
+	[ALGO_CRYPTOPP]		= "cryptopp",
+#ifdef WANT_CRYPTOPP_ASM32
+	[ALGO_CRYPTOPP_ASM32]	= "cryptopp_asm32",
+#endif
+#ifdef WANT_X8632_SSE2
+	[ALGO_SSE2_32]		= "sse2_32",
+#endif
+#ifdef WANT_X8664_SSE2
+	[ALGO_SSE2_64]		= "sse2_64",
+#endif
+#ifdef WANT_X8664_SSE4
+	[ALGO_SSE4_64]		= "sse4_64",
+#endif
+#ifdef WANT_ALTIVEC_4WAY
+    [ALGO_ALTIVEC_4WAY] = "altivec_4way",
+#endif
+};
+
+/*
+ * Scan function for each CPU hashing algorithm, indexed by ALGO_* value.
+ * Every entry is cast to the common sha256_func pointer type; optional
+ * implementations are present only when their WANT_* macro is defined, and
+ * any index without an initialiser is a NULL function pointer.
+ */
+static const sha256_func sha256_funcs[] = {
+	[ALGO_C]		= (sha256_func)scanhash_c,
+#ifdef WANT_SSE2_4WAY
+	[ALGO_4WAY]		= (sha256_func)ScanHash_4WaySSE2,
+#endif
+#ifdef WANT_ALTIVEC_4WAY
+    [ALGO_ALTIVEC_4WAY] = (sha256_func) ScanHash_altivec_4way,
+#endif
+#ifdef WANT_VIA_PADLOCK
+	[ALGO_VIA]		= (sha256_func)scanhash_via,
+#endif
+	[ALGO_CRYPTOPP]		=  (sha256_func)scanhash_cryptopp,
+#ifdef WANT_CRYPTOPP_ASM32
+	[ALGO_CRYPTOPP_ASM32]	= (sha256_func)scanhash_asm32,
+#endif
+#ifdef WANT_X8632_SSE2
+	[ALGO_SSE2_32]		= (sha256_func)scanhash_sse2_32,
+#endif
+#ifdef WANT_X8664_SSE2
+	[ALGO_SSE2_64]		= (sha256_func)scanhash_sse2_64,
+#endif
+#ifdef WANT_X8664_SSE4
+	[ALGO_SSE4_64]		= (sha256_func)scanhash_sse4_64
+#endif
+};
+#endif
+
+
+
+#ifdef WANT_CPUMINE
+/*
+ * Compile-time default hashing algorithm: the 64-bit SSE2 implementation
+ * when it was requested and the compiler targets SSE2, else the 32-bit SSE2
+ * one under the same conditions, else the plain C implementation.
+ */
+#if defined(WANT_X8664_SSE2) && defined(__SSE2__)
+enum sha256_algos opt_algo = ALGO_SSE2_64;
+#elif defined(WANT_X8632_SSE2) && defined(__SSE2__)
+enum sha256_algos opt_algo = ALGO_SSE2_32;
+#else
+enum sha256_algos opt_algo = ALGO_C;
+#endif
+/* CPU mining is off unless something sets this flag */
+bool opt_usecpu = false;
+/* File-local state; zero-initialised as statics */
+static int cpur_thr_id;
+static bool forced_n_threads;
+#endif
+
+
+
+
+#ifdef WANT_CPUMINE
+// Algo benchmark, crash-prone, system independent stage.
+//
+// Calls the scan function registered for `algo` once on a fixed benchmark
+// block with max_nonce = 1<<22, timing the call with gettimeofday().
+// Returns (last_nonce + 1) divided by the elapsed microseconds, or -1.0
+// when no time elapsed.
+double bench_algo_stage3(
+	enum sha256_algos algo
+)
+{
+	// Use a random work block pulled from a pool
+	static uint8_t bench_block[] = { CGMINER_BENCHMARK_BLOCK };
+	struct work work __attribute__((aligned(128)));
+
+	// Copy as much of the benchmark block as fits; the rest stays zeroed
+	size_t bench_size = sizeof(work);
+	size_t work_size = sizeof(bench_block);
+	size_t min_size = (work_size < bench_size ? work_size : bench_size);
+	memset(&work, 0, sizeof(work));
+	memcpy(&work, &bench_block, min_size);
+
+	// Publish a placeholder restart slot for the duration of the run. It is
+	// zeroed first so that whatever reads it through work_restart
+	// (presumably the scan functions - confirm) never sees stack garbage.
+	struct work_restart dummy;
+	memset(&dummy, 0, sizeof(dummy));
+	work_restart = &dummy;
+
+	struct timeval end;
+	struct timeval start;
+	uint32_t max_nonce = (1<<22);
+	uint32_t last_nonce = 0;
+
+	gettimeofday(&start, 0);
+	{
+		sha256_func func = sha256_funcs[algo];
+		(*func)(
+			0,
+			work.midstate,
+			work.data,
+			work.hash1,
+			work.hash,
+			work.target,
+			max_nonce,
+			&last_nonce,
+			work.blk.nonce
+		);
+	}
+	gettimeofday(&end, 0);
+	// The placeholder goes out of scope on return; unpublish it first
+	work_restart = NULL;
+
+	uint64_t usec_end = ((uint64_t)end.tv_sec)*1000*1000 + end.tv_usec;
+	uint64_t usec_start = ((uint64_t)start.tv_sec)*1000*1000 + start.tv_usec;
+	uint64_t usec_elapsed = usec_end - usec_start;
+
+	double rate = -1.0;
+	if (0<usec_elapsed) {
+		rate = (1.0*(last_nonce+1))/usec_elapsed;
+	}
+	return rate;
+}
+
+#if defined(unix)
+
+	// Turn O_NONBLOCK on (yes != 0) or off (yes == 0) for fd.
+	// A failing fcntl() is reported with perror() and ends the process.
+	static void set_non_blocking(
+		int fd,
+		int yes
+	)
+	{
+		int cur_flags, new_flags;
+
+		cur_flags = fcntl(fd, F_GETFL, 0);
+		if (cur_flags < 0) {
+			perror("fcntl(GET) failed");
+			exit(1);
+		}
+
+		if (yes)
+			new_flags = cur_flags | O_NONBLOCK;
+		else
+			new_flags = cur_flags & ~O_NONBLOCK;
+
+		if (fcntl(fd, F_SETFL, new_flags) < 0) {
+			perror("fcntl(SET) failed");
+			exit(1);
+		}
+	}
+
+#endif // defined(unix)
+
+// Algo benchmark, crash-safe, system-dependent stage
+//
+// Runs bench_algo_stage3(algo) where a crash cannot take the caller down:
+//   - unix:  in a forked child that sends its rate back through a pipe
+//   - WIN32: in a debugged copy of this executable that is given the name
+//            of a shared-memory block through the environment (presumably
+//            the child stores its rate there; that side is not in this
+//            function -- TODO confirm)
+//   - other: directly in-process, without any crash protection
+//
+// Returns the rate that was collected, or a negative value when the child
+// crashed, timed out or never delivered a result.
+static double bench_algo_stage2(
+	enum sha256_algos algo
+)
+{
+	// Here, the gig is to safely run a piece of code that potentially
+	// crashes. Unfortunately, the Right Way (tm) to do this is rather
+	// heavily platform dependent :(
+
+	// Negative sentinel: stays in place when no result is collected
+	double rate = -1.23457;
+
+	#if defined(unix)
+
+		// Make a pipe: [readFD, writeFD]
+		int pfd[2];
+		int r = pipe(pfd);
+		if (r<0) {
+			perror("pipe - failed to create pipe for --algo auto");
+			exit(1);
+		}
+
+		// Make pipe non blocking
+		set_non_blocking(pfd[0], 1);
+		set_non_blocking(pfd[1], 1);
+
+		// Don't allow a crashing child to kill the main process
+		sighandler_t sr0 = signal(SIGPIPE, SIG_IGN);
+		if (SIG_ERR==sr0) {
+			perror("signal - failed to edit signal mask for --algo auto");
+			exit(1);
+		}
+
+		// Fork a child to do the actual benchmarking
+		pid_t child_pid = fork();
+		if (child_pid<0) {
+			perror("fork - failed to create a child process for --algo auto");
+			exit(1);
+		}
+
+		// Do the dangerous work in the child, knowing we might crash
+		if (0==child_pid) {
+
+			// TODO: some umask trickery to prevent coredumps
+
+			// Benchmark this algorithm
+			double child_rate = bench_algo_stage3(algo);
+
+			// We survived, send result to parent and bail
+			int loop_count = 0;
+			while (1) {
+				ssize_t bytes_written = write(pfd[1], &child_rate, sizeof(child_rate));
+				int try_again = (0==bytes_written || (bytes_written<0 && EAGAIN==errno));
+				int success = (sizeof(child_rate)==(size_t)bytes_written);
+
+				if (success)
+					break;
+
+				if (!try_again) {
+					perror("write - child failed to write benchmark result to pipe");
+					exit(1);
+				}
+
+				if (5<loop_count) {
+					applog(LOG_ERR, "child tried %d times to communicate with parent, giving up", loop_count);
+					exit(1);
+				}
+				++loop_count;
+				sleep(1);
+			}
+			exit(0);
+		}
+
+		// Parent waits for a result from child
+		int loop_count = 0;
+		while (1) {
+
+			// Wait for child to die
+			int status;
+			int wr = waitpid(child_pid, &status, WNOHANG);
+			if ((child_pid==wr) || (wr<0 && ECHILD==errno)) {
+
+				// Child died somehow. Grab result and bail
+				// (a short or failed read leaves the sentinel in place)
+				double tmp;
+				ssize_t bytes_read = read(pfd[0], &tmp, sizeof(tmp));
+				if (sizeof(tmp)==(size_t)bytes_read)
+					rate = tmp;
+				break;
+
+			} else if (wr<0) {
+				perror("bench_algo: waitpid failed. giving up.");
+				exit(1);
+			}
+
+			// Give up on child after a ~60s
+			if (60<loop_count) {
+				kill(child_pid, SIGKILL);
+				waitpid(child_pid, &status, 0);
+				break;
+			}
+
+			// Wait a bit longer
+			++loop_count;
+			sleep(1);
+		}
+
+		// Close pipe
+		r = close(pfd[0]);
+		if (r<0) {
+			perror("close - failed to close read end of pipe for --algo auto");
+			exit(1);
+		}
+		r = close(pfd[1]);
+		if (r<0) {
+			perror("close - failed to close write end of pipe for --algo auto");
+			exit(1);
+		}
+
+	#elif defined(WIN32)
+
+		// Get handle to current exe
+		HINSTANCE module = GetModuleHandle(0);
+		if (!module) {
+			applog(LOG_ERR, "failed to retrieve module handle");
+			exit(1);
+		}
+
+		// Create a unique name
+		// (zero-filled so the name is terminated even if snprintf truncates)
+		char unique_name[32] = {0};
+		snprintf(
+			unique_name,
+			sizeof(unique_name)-1,
+			"cgminer-%p",
+			(void*)module
+		);
+
+		// Create and init a chunked of shared memory
+		HANDLE map_handle = CreateFileMapping(
+			INVALID_HANDLE_VALUE,   // use paging file
+			NULL,                   // default security attributes
+			PAGE_READWRITE,         // read/write access
+			0,                      // size: high 32-bits
+			4096,			// size: low 32-bits
+			unique_name		// name of map object
+		);
+		if (NULL==map_handle) {
+			applog(LOG_ERR, "could not create shared memory");
+			exit(1);
+		}
+
+		void *shared_mem = MapViewOfFile(
+			map_handle,	// object to map view of
+			FILE_MAP_WRITE, // read/write access
+			0,              // high offset:  map from
+			0,              // low offset:   beginning
+			0		// default: map entire file
+		);
+		if (NULL==shared_mem) {
+			applog(LOG_ERR, "could not map shared memory");
+			exit(1);
+		}
+		SetEnvironmentVariable("CGMINER_SHARED_MEM", unique_name);
+		// Seed the block with the sentinel so "no answer" reads back negative
+		CopyMemory(shared_mem, &rate, sizeof(rate));
+
+		// Get path to current exe
+		// (zero-filled; a return value of n means the path was truncated
+		// and may not be terminated, so that is treated as a failure too)
+		char cmd_line[256 + MAX_PATH] = {0};
+		const size_t n = sizeof(cmd_line)-200;
+		DWORD size = GetModuleFileName(module, cmd_line, n);
+		if (0==size || n<=size) {
+			applog(LOG_ERR, "failed to retrieve module path");
+			exit(1);
+		}
+
+		// Construct new command line based on that
+		// (the 200 bytes held back above leave room for the suffix)
+		char *p = strlen(cmd_line) + cmd_line;
+		sprintf(p, " --bench-algo %d", algo);
+		SetEnvironmentVariable("CGMINER_BENCH_ALGO", "1");
+
+		// Launch a debug copy of cgminer
+		STARTUPINFO startup_info;
+		PROCESS_INFORMATION process_info;
+		ZeroMemory(&startup_info, sizeof(startup_info));
+		ZeroMemory(&process_info, sizeof(process_info));
+		startup_info.cb = sizeof(startup_info);
+
+		BOOL ok = CreateProcess(
+			NULL,			// No module name (use command line)
+			cmd_line,		// Command line
+			NULL,			// Process handle not inheritable
+			NULL,			// Thread handle not inheritable
+			FALSE,			// Set handle inheritance to FALSE
+			DEBUG_ONLY_THIS_PROCESS,// We're going to debug the child
+			NULL,			// Use parent's environment block
+			NULL,			// Use parent's starting directory
+			&startup_info,		// Pointer to STARTUPINFO structure
+			&process_info		// Pointer to PROCESS_INFORMATION structure
+		);
+		if (!ok) {
+			// GetLastError() returns a DWORD, which is an unsigned long
+			applog(LOG_ERR, "CreateProcess failed with error %lu\n", (unsigned long)GetLastError() );
+			exit(1);
+		}
+
+		// Debug the child (only clean way to catch exceptions)
+		while (1) {
+
+			// Wait for child to do something
+			DEBUG_EVENT debug_event;
+			ZeroMemory(&debug_event, sizeof(debug_event));
+
+			BOOL got_event = WaitForDebugEvent(&debug_event, 60 * 1000);
+			if (!got_event)
+				break;
+
+			// Decide if event is "normal"
+			int go_on =
+				CREATE_PROCESS_DEBUG_EVENT== debug_event.dwDebugEventCode	||
+				CREATE_THREAD_DEBUG_EVENT == debug_event.dwDebugEventCode	||
+				EXIT_THREAD_DEBUG_EVENT   == debug_event.dwDebugEventCode	||
+				EXCEPTION_DEBUG_EVENT     == debug_event.dwDebugEventCode	||
+				LOAD_DLL_DEBUG_EVENT      == debug_event.dwDebugEventCode	||
+				OUTPUT_DEBUG_STRING_EVENT == debug_event.dwDebugEventCode	||
+				UNLOAD_DLL_DEBUG_EVENT    == debug_event.dwDebugEventCode;
+			if (!go_on)
+				break;
+
+			// Some exceptions are also "normal", apparently.
+			if (EXCEPTION_DEBUG_EVENT== debug_event.dwDebugEventCode) {
+
+				int benign =
+					EXCEPTION_BREAKPOINT== debug_event.u.Exception.ExceptionRecord.ExceptionCode;
+				if (!benign)
+					break;
+			}
+
+			// If nothing unexpected happened, let child proceed
+			ContinueDebugEvent(
+				debug_event.dwProcessId,
+				debug_event.dwThreadId,
+				DBG_CONTINUE
+			);
+		}
+
+		// Clean up child process
+		TerminateProcess(process_info.hProcess, 1);
+		CloseHandle(process_info.hProcess);
+		CloseHandle(process_info.hThread);
+
+		// Reap return value and cleanup
+		CopyMemory(&rate, shared_mem, sizeof(rate));
+		(void)UnmapViewOfFile(shared_mem);
+		(void)CloseHandle(map_handle);
+
+	#else
+
+		// Not linux, not unix, not WIN32 ... do our best
+		rate = bench_algo_stage3(algo);
+
+	#endif // defined(unix)
+
+	// Done
+	return rate;
+}
+
+// Benchmark one algorithm, log the outcome, and record it in
+// *best_rate / *best_algo when it beats the best rate seen so far.
+// A negative rate from stage 2 is logged as a failure and never recorded.
+static void bench_algo(
+	double            *best_rate,
+	enum sha256_algos *best_algo,
+	enum sha256_algos algo
+)
+{
+	const char *name = algo_names[algo];
+	double rate;
+
+	// Build the run of spaces that lines this name up with the longest one
+	size_t pad = max_name_len - strlen(name);
+	memset(name_spaces_pad, ' ', pad);
+	name_spaces_pad[pad] = 0;
+
+	applog(LOG_ERR, "\"%s\"%s : benchmarking algorithm ...", name, name_spaces_pad);
+
+	rate = bench_algo_stage2(algo);
+	if (rate<0.0) {
+		applog(LOG_ERR, "\"%s\"%s : algorithm fails on this platform", name, name_spaces_pad);
+		return;
+	}
+
+	applog(LOG_ERR, "\"%s\"%s : algorithm runs at %.5f MH/s", name, name_spaces_pad, rate);
+	if (*best_rate<rate) {
+		*best_rate = rate;
+		*best_algo = algo;
+	}
+}
+
+// Find the longest entry in algo_names (NULL entries are ignored) and
+// allocate the scratch buffer used to pad shorter names in log output.
+// Exits the process if the allocation fails.
+void init_max_name_len()
+{
+	const size_t count = sizeof(algo_names)/sizeof(algo_names[0]);
+	size_t idx = 0;
+
+	while (idx<count) {
+		const char *name = algo_names[idx++];
+		if (name) {
+			size_t len = strlen(name);
+			if (max_name_len<len)
+				max_name_len = len;
+		}
+	}
+
+	name_spaces_pad = (char*) malloc(max_name_len+16);
+	if (!name_spaces_pad) {
+		perror("malloc failed");
+		exit(1);
+	}
+}
+
+// Pick the fastest CPU hasher
+//
+// Benchmarks every algorithm that was compiled in, in a fixed order, and
+// returns the one with the highest rate. If nothing yields a non-negative
+// rate the initial choice (algorithm 0) is returned with a rate of -1.0.
+static enum sha256_algos pick_fastest_algo()
+{
+	// Benchmark order; the WANT_* switches decide what is available
+	static const enum sha256_algos candidates[] = {
+		ALGO_C,
+	#if defined(WANT_SSE2_4WAY)
+		ALGO_4WAY,
+	#endif
+	#if defined(WANT_VIA_PADLOCK)
+		ALGO_VIA,
+	#endif
+		ALGO_CRYPTOPP,
+	#if defined(WANT_CRYPTOPP_ASM32)
+		ALGO_CRYPTOPP_ASM32,
+	#endif
+	#if defined(WANT_X8632_SSE2)
+		ALGO_SSE2_32,
+	#endif
+	#if defined(WANT_X8664_SSE2)
+		ALGO_SSE2_64,
+	#endif
+	#if defined(WANT_X8664_SSE4)
+		ALGO_SSE4_64,
+	#endif
+	#if defined(WANT_ALTIVEC_4WAY)
+		ALGO_ALTIVEC_4WAY,
+	#endif
+	};
+	const size_t nb_candidates = sizeof(candidates)/sizeof(candidates[0]);
+
+	double top_rate = -1.0;
+	enum sha256_algos winner = 0;
+	size_t k;
+	size_t pad;
+
+	applog(LOG_ERR, "benchmarking all sha256 algorithms ...");
+
+	for (k=0; k<nb_candidates; ++k)
+		bench_algo(&top_rate, &winner, candidates[k]);
+
+	// Pad the winner's name the same way the per-algorithm lines are padded
+	pad = max_name_len - strlen(algo_names[winner]);
+	memset(name_spaces_pad, ' ', pad);
+	name_spaces_pad[pad] = 0;
+	applog(
+		LOG_ERR,
+		"\"%s\"%s : is fastest algorithm at %.5f MH/s",
+		algo_names[winner],
+		name_spaces_pad,
+		top_rate
+	);
+	return winner;
+}
+
+/* FIXME: Use asprintf for better errors. */
+/* Option callback: store in *algo the algorithm named by arg. "auto"
+ * benchmarks the available algorithms and picks the fastest. Returns NULL
+ * on success or a static error string when the name is not recognised. */
+char *set_algo(const char *arg, enum sha256_algos *algo)
+{
+	enum sha256_algos candidate;
+
+	if (strcmp(arg, "auto") == 0) {
+		*algo = pick_fastest_algo();
+		return NULL;
+	}
+
+	for (candidate = 0; candidate < ARRAY_SIZE(algo_names); candidate++) {
+		const char *name = algo_names[candidate];
+
+		/* slots without a name can never match */
+		if (!name)
+			continue;
+		if (strcmp(arg, name) != 0)
+			continue;
+
+		*algo = candidate;
+		return NULL;
+	}
+	return "Unknown algorithm";
+}
+
+/* Option callback: copy the name of *algo into buf, using at most
+ * OPT_SHOW_LEN bytes (strncpy semantics: a name of that length or longer
+ * is copied without a terminating NUL). */
+void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo)
+{
+	const char *name = algo_names[*algo];
+
+	strncpy(buf, name, OPT_SHOW_LEN);
+}
+#endif
+
+#ifdef WANT_CPUMINE
+/* Option callback for an explicit CPU thread count: records that the
+ * count was forced, then parses arg into *i with an accepted range of
+ * 0..9999. Returns whatever set_int_range reports. */
+char *force_nthreads_int(const char *arg, int *i)
+{
+	char *err;
+
+	forced_n_threads = true;
+	err = set_int_range(arg, i, 0, 9999);
+	return err;
+}
+#endif
+
+#ifdef WANT_CPUMINE
+/* Detect the CPUs to mine on and register one cgpu_info per mining thread.
+ *
+ * num_processors is taken from the system affinity mask on WIN32 and from
+ * sysconf(_SC_NPROCESSORS_ONLN) elsewhere. Unless a thread count was forced
+ * (and is not negative), opt_n_threads defaults to 0 when other devices
+ * exist and CPU mining was not requested, and to num_processors otherwise.
+ * The count is then clamped so total_devices never exceeds MAX_DEVICES. */
+static void cpu_detect()
+{
+	int i;
+
+	// Reckon number of cores in the box
+	#if defined(WIN32)
+	{
+		// The API wants DWORD_PTR (pointer-sized) masks; a plain DWORD
+		// would be overrun on 64-bit Windows. The process mask comes
+		// first in the argument list, the system mask second.
+		DWORD_PTR process_am;
+		DWORD_PTR system_am;
+		BOOL ok = GetProcessAffinityMask(
+			GetCurrentProcess(),
+			&process_am,
+			&system_am
+		);
+		if (!ok) {
+			applog(LOG_ERR, "couldn't figure out number of processors :(");
+			num_processors = 1;
+		} else {
+			// Count the processors configured in the system, one bit
+			// each. Shifting the mask itself avoids the undefined
+			// signed shift 1<<31.
+			num_processors = 0;
+			for (; system_am; system_am >>= 1)
+				if (system_am & 1)
+					++num_processors;
+		}
+	}
+	#else
+		num_processors = sysconf(_SC_NPROCESSORS_ONLN);
+	#endif /* !WIN32 */
+
+	if (opt_n_threads < 0 || !forced_n_threads) {
+		if (total_devices && !opt_usecpu)
+			opt_n_threads = 0;
+		else
+			opt_n_threads = num_processors;
+	}
+	if (num_processors < 1)
+		return;
+
+	if (total_devices + opt_n_threads > MAX_DEVICES)
+		opt_n_threads = MAX_DEVICES - total_devices;
+	cpus = calloc(opt_n_threads, sizeof(struct cgpu_info));
+	// calloc(0, ...) is allowed to return NULL; that is not a failure
+	if (unlikely(!cpus && opt_n_threads))
+		quit(1, "Failed to calloc cpus");
+	for (i = 0; i < opt_n_threads; ++i) {
+		struct cgpu_info *cgpu;
+
+		cgpu = &cpus[i];
+		cgpu->api = &cpu_api;
+		cgpu->deven = DEV_ENABLED;
+		cgpu->threads = 1;
+		cgpu->kname = algo_names[opt_algo];
+		add_cgpu(cgpu);
+	}
+}
+
+/* Request a re-initialisation of a CPU device by pushing it onto the queue
+ * owned by the thread at index cpur_thr_id (presumably the CPU
+ * re-initialisation thread; it is not visible here -- TODO confirm). */
+static void reinit_cpu_device(struct cgpu_info *cpu)
+{
+	tq_push(thr_info[cpur_thr_id].q, cpu);
+}
+
+/* thread_prepare hook for CPU threads: reports the thread in via
+ * thread_reportin() and always succeeds. */
+static bool cpu_thread_prepare(struct thr_info *thr)
+{
+	thread_reportin(thr);
+
+	return true;
+}
+
+/* can_limit_work hook: always returns the constant 0xfffff, whatever the
+ * thread. NOTE(review): presumably the largest nonce count handed to one
+ * scanhash call -- confirm against the caller of this hook. */
+static uint64_t cpu_can_limit_work(struct thr_info *thr)
+{
+	return 0xfffff;
+}
+
+/* thread_init hook for CPU threads: lowers the scheduling priority and,
+ * when the thread count divides evenly over the processors, pins the
+ * thread to one of them. Always returns true. */
+static bool cpu_thread_init(struct thr_info *thr)
+{
+	const int thr_id = thr->id;
+
+	/* Drop to nice 19, then prefer SCHED_IDLE and fall back to
+	 * SCHED_BATCH if that is refused. A failure here is harmless, so
+	 * nothing is checked. */
+	setpriority(PRIO_PROCESS, 0, 19);
+	drop_policy();
+
+	/* Pinning is only useful when the number of threads is a multiple
+	 * of the number of CPUs */
+	if (opt_n_threads % num_processors)
+		return true;
+
+	affine_to_cpu(dev_from_id(thr_id), dev_from_id(thr_id) % num_processors);
+	return true;
+}
+
+/* scanhash hook for CPU threads.
+ *
+ * Runs the selected SHA-256 scanner over the work item starting at
+ * work->blk.nonce. Every time the scanner reports a hit the work is
+ * submitted and scanning resumes just after the last nonce tried. Returns
+ * the distance covered as (last nonce - first nonce + 1), or 0 when the
+ * final pass did not advance past the first nonce. */
+static uint64_t cpu_scanhash(struct thr_info *thr, struct work *work, uint64_t max_nonce)
+{
+	const int thr_id = thr->id;
+	const uint32_t first_nonce = work->blk.nonce;
+	uint32_t last_nonce;
+	bool found;
+
+	for (;;) {
+		sha256_func scan = sha256_funcs[opt_algo];
+
+		/* every pass reports its progress relative to the first nonce */
+		last_nonce = first_nonce;
+
+		/* scan nonces for a proof-of-work hash */
+		found = (*scan)(
+			thr_id,
+			work->midstate,
+			work->data,
+			work->hash1,
+			work->hash,
+			work->target,
+			max_nonce,
+			&last_nonce,
+			work->blk.nonce
+		);
+
+		/* if nonce found, submit work and keep scanning */
+		if (unlikely(found)) {
+			applog(LOG_DEBUG, "CPU %d found something?", dev_from_id(thr_id));
+			if (unlikely(!submit_work_sync(thr, work))) {
+				applog(LOG_ERR, "Failed to submit_work_sync in miner_thread %d", thr_id);
+			}
+			work->blk.nonce = last_nonce + 1;
+			continue;
+		}
+		break;
+	}
+
+	if (unlikely(last_nonce == first_nonce))
+		return 0;
+
+	work->blk.nonce = last_nonce + 1;
+	return last_nonce - first_nonce + 1;
+}
+
+/* Driver table for CPU mining: binds the hooks defined in this file to the
+ * generic device_api interface. Hooks that are not listed stay zero
+ * (standard C designated-initializer behaviour). */
+struct device_api cpu_api = {
+	.dname = "cpu",
+	.name = "CPU",
+	.api_detect = cpu_detect,
+	.reinit_device = reinit_cpu_device,
+	.thread_prepare = cpu_thread_prepare,
+	.can_limit_work = cpu_can_limit_work,
+	.thread_init = cpu_thread_init,
+	.scanhash = cpu_scanhash,
+};
+#endif
+
+
+

+ 59 - 0
driver-cpu.h

@@ -0,0 +1,59 @@
+/* Public interface of the CPU mining driver: build-time selection of the
+ * SHA-256 implementations, the algorithm enumeration, and the option
+ * callbacks and driver table exported to the rest of the miner. */
+#ifndef __DEVICE_CPU_H__
+#define __DEVICE_CPU_H__
+
+#include "miner.h" /* for work_restart, TODO: re-factor dependency */
+
+#include "config.h"
+#include <stdbool.h>
+
+/* Size of the buffer handed to show_algo(); defined here only if nothing
+ * else has defined it already */
+#ifndef OPT_SHOW_LEN
+#define OPT_SHOW_LEN 80
+#endif
+
+/* WANT_* switches: each one is defined when the compiler target (and, for
+ * the assembly versions, HAS_YASM) allows that implementation. */
+#ifdef __SSE2__
+#define WANT_SSE2_4WAY 1
+#endif
+
+#ifdef __ALTIVEC__
+#define WANT_ALTIVEC_4WAY 1
+#endif
+
+#if defined(__i386__) && defined(HAS_YASM) && defined(__SSE2__)
+#define WANT_X8632_SSE2 1
+#endif
+
+#if (defined(__i386__) || defined(__x86_64__)) &&  !defined(__APPLE__)
+#define WANT_VIA_PADLOCK 1
+#endif
+
+#if defined(__x86_64__) && defined(HAS_YASM)
+#define WANT_X8664_SSE2 1
+#endif
+
+/* NOTE(review): same condition as WANT_X8664_SSE2 above, so the SSE4
+ * variant is enabled by the target and assembler alone, with no
+ * compile-time SSE4 check. */
+#if defined(__x86_64__) && defined(HAS_YASM)
+#define WANT_X8664_SSE4 1
+#endif
+
+/* Available SHA-256 implementations. The values are used as indexes into
+ * algo_names[], so the order must match that table. */
+enum sha256_algos {
+	ALGO_C,			/* plain C */
+	ALGO_4WAY,		/* parallel SSE2 */
+	ALGO_VIA,		/* VIA padlock */
+	ALGO_CRYPTOPP,		/* Crypto++ (C) */
+	ALGO_CRYPTOPP_ASM32,	/* Crypto++ 32-bit assembly */
+	ALGO_SSE2_32,		/* SSE2 for x86_32 */
+	ALGO_SSE2_64,		/* SSE2 for x86_64 */
+	ALGO_SSE4_64,		/* SSE4 for x86_64 */
+	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
+};
+
+/* Names of the algorithms, indexed by enum sha256_algos */
+extern const char *algo_names[];
+extern bool opt_usecpu;
+/* Driver table of the CPU miner */
+extern struct device_api cpu_api;
+
+/* Option callbacks and benchmark entry points */
+extern char *set_algo(const char *arg, enum sha256_algos *algo);
+extern void show_algo(char buf[OPT_SHOW_LEN], const enum sha256_algos *algo);
+extern char *force_nthreads_int(const char *arg, int *i);
+extern void init_max_name_len();
+extern double bench_algo_stage3(enum sha256_algos algo);
+
+#endif /* __DEVICE_CPU_H__ */

+ 341 - 0
driver-icarus.c

@@ -0,0 +1,341 @@
+/*
+ * Copyright 2012 Luke Dashjr
+ * Copyright 2012 Xiangfu <xiangfu@openmobilefree.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
+ */
+
+/*
+ * This code should work with the V2 and V3 bitstreams of Icarus.
+ * Operation:
+ *   No detection is implemented.
+ *   Input: 64B = 32B midstate + 20B fill bytes + last 12 bytes of block header.
+ *   Return: Icarus sends back 32 bits immediately when it finds a valid nonce.
+ *           No query protocol is implemented here; if no data is sent back
+ *           within ~11.3 seconds (the time needed to cover the full 32-bit
+ *           nonce range at 380MH/s), just send another work item.
+ * Notice:
+ *   1. Icarus starts calculating as soon as work is pushed to it, even if
+ *      it is busy.
+ *   2. The 2 FPGAs on Icarus split the job: one covers 0 ~ 7FFFFFFF, the
+ *      other covers 80000000 ~ FFFFFFFF.
+ *   3. Both FPGAs may find a valid nonce at about the same time; in that
+ *      case both nonces are sent back.
+ *   4. Icarus stops working when a valid nonce has been found or the 32-bit
+ *      nonce range has been completely searched.
+ */
+
+#include <limits.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <unistd.h>
+#ifndef WIN32
+  #include <termios.h>
+  #include <sys/stat.h>
+  #include <fcntl.h>
+  #ifndef O_CLOEXEC
+    #define O_CLOEXEC 0
+  #endif
+#else
+  #include <windows.h>
+  #include <io.h>
+#endif
+
+#include "elist.h"
+#include "miner.h"
+
+#define ICARUS_READ_FAULT_COUNT	(8)
+
+struct device_api icarus_api;
+
+/* Reverse the l bytes at s in place. Buffers of length 0 or 1 are left
+ * untouched. */
+static void rev(unsigned char *s, size_t l)
+{
+	size_t i, j;
+	unsigned char t;
+
+	/* l - 1 wraps around for l == 0 and would send the loop far out of
+	 * bounds; nothing needs swapping below two bytes anyway */
+	if (l < 2)
+		return;
+
+	for (i = 0, j = l - 1; i < j; i++, j--) {
+		t = s[i];
+		s[i] = s[j];
+		s[j] = t;
+	}
+}
+
+/* Open the serial device at devpath and configure it for Icarus:
+ * 115200 baud, 8 data bits, no parity, raw mode, reads that time out after
+ * one second. Returns a file descriptor, or -1 if the device cannot be
+ * opened. */
+static int icarus_open(const char *devpath)
+{
+#ifndef WIN32
+	struct termios my_termios;
+
+	int serialfd = open(devpath, O_RDWR | O_CLOEXEC | O_NOCTTY);
+
+	if (serialfd == -1)
+		return -1;
+
+	/* Start from zeroes so a failing tcgetattr cannot leave stack
+	 * garbage in the settings applied below */
+	memset(&my_termios, 0, sizeof(my_termios));
+	tcgetattr(serialfd, &my_termios);
+
+	/* 8 data bits, receiver on, ignore modem control lines. c_cflag is
+	 * assigned outright, so parity and every other control flag is off.
+	 * (Clearing CSIZE after OR-ing in CS8 would wipe the character size
+	 * again and leave the port at 5 bits.) */
+	my_termios.c_cflag = CS8 | CREAD | CLOCAL;
+
+	/* The baud rate goes through the POSIX setters: B115200 is not a
+	 * c_cflag bit pattern on every platform */
+	cfsetispeed(&my_termios, B115200);
+	cfsetospeed(&my_termios, B115200);
+
+	my_termios.c_iflag &= ~(IGNBRK | BRKINT | PARMRK |
+				ISTRIP | INLCR | IGNCR | ICRNL | IXON);
+	my_termios.c_oflag &= ~OPOST;
+	my_termios.c_lflag &= ~(ECHO | ECHONL | ICANON | ISIG | IEXTEN);
+	my_termios.c_cc[VTIME] = 10; /* block 1 second */
+	my_termios.c_cc[VMIN] = 0;
+	tcsetattr(serialfd, TCSANOW, &my_termios);
+
+	tcflush(serialfd, TCOFLUSH);
+	tcflush(serialfd, TCIFLUSH);
+
+	return serialfd;
+#else
+	HANDLE hSerial = CreateFile(devpath, GENERIC_READ | GENERIC_WRITE, 0,
+				    NULL, OPEN_EXISTING, 0, NULL);
+	if (unlikely(hSerial == INVALID_HANDLE_VALUE))
+		return -1;
+
+	COMMTIMEOUTS cto = {1000, 0, 1000, 0, 1000};
+	SetCommTimeouts(hSerial, &cto);
+
+	/* _open_osfhandle takes an intptr_t; going through LONG would
+	 * truncate the handle on 64-bit Windows */
+	return _open_osfhandle((intptr_t)hSerial, 0);
+#endif
+}
+
+/* Read exactly bufLen bytes from fd into buf, one byte per read() call.
+ * Every read that does not deliver a byte counts as a miss; after
+ * ICARUS_READ_FAULT_COUNT misses in total the attempt is abandoned.
+ * Returns 0 when the buffer was filled and 1 when it gave up. */
+static int icarus_gets(unsigned char *buf, size_t bufLen, int fd)
+{
+	int misses = 0;
+
+	while (bufLen > 0) {
+		if (read(fd, buf, 1) == 1) {
+			buf++;
+			bufLen--;
+			continue;
+		}
+
+		if (++misses == ICARUS_READ_FAULT_COUNT) {
+			applog(LOG_DEBUG,
+			       "Icarus Read: No data in %d seconds", misses);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/* Write bufLen bytes from buf to fd with a single write() call. Returns 0
+ * when everything was written and 1 on an error or a short write. */
+static int icarus_write(int fd, const void *buf, size_t bufLen)
+{
+	/* an error return of -1 becomes a huge size_t, so it can never
+	 * compare equal to bufLen */
+	const size_t sent = write(fd, buf, bufLen);
+
+	if (unlikely(sent != bufLen))
+		return 1;
+	return 0;
+}
+
+#define icarus_close(fd) close(fd)
+
+/* Probe devpath for an Icarus: send a known work item and check that the
+ * device answers with the matching ("golden") nonce. On success a new
+ * cgpu_info is registered through add_cgpu() and true is returned; any
+ * failure returns false and registers nothing. */
+static bool icarus_detect_one(const char *devpath)
+{
+	int fd;
+
+	const char golden_ob[] =
+		"2db907f9cb4eb938ded904f4832c4331"
+		"0380e3aeb54364057e7fec5157bfc533"
+		"00000000000000000000000080000000"
+		"00000000a58e091ac342724e7c3dc346";
+	const char golden_nonce[] = "063c5e01";
+
+	unsigned char ob_bin[64], nonce_bin[4];
+	char *nonce_hex;
+
+	if (total_devices == MAX_DEVICES)
+		return false;
+
+	fd = icarus_open(devpath);
+	if (unlikely(fd == -1)) {
+		applog(LOG_ERR, "Icarus Detect: Failed to open %s", devpath);
+		return false;
+	}
+
+	hex2bin(ob_bin, golden_ob, sizeof(ob_bin));
+	if (icarus_write(fd, ob_bin, sizeof(ob_bin))) {
+		/* nothing was sent, so there is no answer worth waiting for */
+		applog(LOG_ERR, "Icarus Detect: Failed to write to %s", devpath);
+		icarus_close(fd);
+		return false;
+	}
+
+	memset(nonce_bin, 0, sizeof(nonce_bin));
+	icarus_gets(nonce_bin, sizeof(nonce_bin), fd);
+
+	icarus_close(fd);
+
+	nonce_hex = bin2hex(nonce_bin, sizeof(nonce_bin));
+	if (nonce_hex) {
+		if (strncmp(nonce_hex, golden_nonce, 8)) {
+			applog(LOG_ERR,
+			       "Icarus Detect: "
+			       "Test failed at %s: get %s, should: %s",
+			       devpath, nonce_hex, golden_nonce);
+			free(nonce_hex);
+			return false;
+		}
+		free(nonce_hex);
+	} else
+		return false;
+
+	/* We have a real Icarus! */
+	struct cgpu_info *icarus;
+	icarus = calloc(1, sizeof(struct cgpu_info));
+	if (unlikely(!icarus)) {
+		applog(LOG_ERR, "Icarus Detect: Failed to calloc device for %s", devpath);
+		return false;
+	}
+	icarus->api = &icarus_api;
+	icarus->device_path = strdup(devpath);
+	if (unlikely(!icarus->device_path)) {
+		applog(LOG_ERR, "Icarus Detect: Failed to copy path %s", devpath);
+		free(icarus);
+		return false;
+	}
+	icarus->threads = 1;
+	add_cgpu(icarus);
+
+	applog(LOG_INFO, "Found Icarus at %s, mark as %d",
+	       devpath, icarus->device_id);
+
+	return true;
+}
+
+/* api_detect hook: probe every entry of scan_devices (an optional
+ * "icarus:" prefix is skipped) and remove from the list each entry that
+ * turned out to be an Icarus. */
+static void icarus_detect()
+{
+	struct string_elist *entry, *next;
+
+	list_for_each_entry_safe(entry, next, &scan_devices, list) {
+		const char *path = entry->string;
+
+		if (strncmp("icarus:", path, 7) == 0)
+			path += 7;
+
+		if (!icarus_detect_one(path))
+			continue;
+		string_elist_del(entry);
+	}
+}
+
+/* thread_prepare hook: open the serial port of the thread's device, keep
+ * the descriptor in device_fd and stamp the device with the current time.
+ * Returns false if the port cannot be opened. */
+static bool icarus_prepare(struct thr_info *thr)
+{
+	struct cgpu_info *dev = thr->cgpu;
+	struct timeval opened_at;
+	int fd;
+
+	fd = icarus_open(dev->device_path);
+	if (unlikely(fd == -1)) {
+		applog(LOG_ERR, "Failed to open Icarus on %s",
+		       dev->device_path);
+		return false;
+	}
+	dev->device_fd = fd;
+
+	applog(LOG_INFO, "Opened Icarus on %s", dev->device_path);
+
+	gettimeofday(&opened_at, NULL);
+	get_datestamp(dev->init, &opened_at);
+
+	return true;
+}
+
+/* scanhash hook: send one work item to the Icarus and wait for a nonce.
+ *
+ * The 64-byte request is the byte-reversed midstate followed by 20 zero
+ * bytes and the byte-reversed last 12 bytes taken from work->data + 64.
+ * Returns 0 if the request could not be written, 0xffffffff if no nonce
+ * arrived before the read gave up, and otherwise an estimate of the
+ * hashes done, derived from the nonce that came back. max_nonce is
+ * unused. */
+static uint64_t icarus_scanhash(struct thr_info *thr, struct work *work,
+				__maybe_unused uint64_t max_nonce)
+{
+	struct cgpu_info *icarus;
+	int fd;
+	int ret;
+
+	unsigned char ob_bin[64], nonce_bin[4];
+	char *ob_hex, *nonce_hex;
+	uint32_t nonce;
+	uint32_t hash_count;
+	time_t sent_at;
+
+	icarus = thr->cgpu;
+	fd = icarus->device_fd;
+
+	memset(ob_bin, 0, sizeof(ob_bin));
+	memcpy(ob_bin, work->midstate, 32);
+	memcpy(ob_bin + 52, work->data + 64, 12);
+	rev(ob_bin, 32);
+	rev(ob_bin + 52, 12);
+#ifndef WIN32
+	tcflush(fd, TCOFLUSH);
+#endif
+	ret = icarus_write(fd, ob_bin, sizeof(ob_bin));
+	if (ret)
+		return 0;	/* This should never happen */
+
+	/* Taken unconditionally so the elapsed time logged below is right
+	 * even when the hex dump of the request cannot be produced */
+	sent_at = time(NULL);
+
+	ob_hex = bin2hex(ob_bin, sizeof(ob_bin));
+	if (ob_hex) {
+		/* device_id is an int: %d, not %s */
+		applog(LOG_DEBUG, "Icarus %d send: %s",
+		       icarus->device_id, ob_hex);
+		free(ob_hex);
+	}
+
+	/* Icarus will return a 4 byte nonce or nothing */
+	memset(nonce_bin, 0, sizeof(nonce_bin));
+	ret = icarus_gets(nonce_bin, sizeof(nonce_bin), fd);
+
+	nonce_hex = bin2hex(nonce_bin, sizeof(nonce_bin));
+	if (nonce_hex) {
+		/* time_t is not necessarily an int; cast to match %d */
+		applog(LOG_DEBUG, "Icarus %d return (elapse %d seconds): %s",
+		       icarus->device_id, (int)(time(NULL) - sent_at),
+		       nonce_hex);
+		free(nonce_hex);
+	}
+
+	memcpy((char *)&nonce, nonce_bin, sizeof(nonce_bin));
+
+	/* Read gave up and nothing came in: report the whole range as done */
+	if (nonce == 0 && ret)
+		return 0xffffffff;
+
+#ifndef __BIG_ENDIAN__
+	nonce = swab32(nonce);
+#endif
+	work->blk.nonce = 0xffffffff;
+	submit_nonce(thr, work, nonce);
+
+	/* The two FPGAs each cover half of the nonce range, so the position
+	 * inside one half, doubled, estimates the hashes done */
+	hash_count = (nonce & 0x7fffffff);
+	if (hash_count == 0)
+		hash_count = 2;
+	else {
+		if (hash_count++ == 0x7fffffff)
+			hash_count = 0xffffffff;
+		else
+			hash_count <<= 1;
+	}
+
+	return hash_count;
+}
+
+/* thread_shutdown hook: release everything owned by the thread's device:
+ * the path string, the open descriptor, its slot in devices[] and the
+ * cgpu_info itself. Does nothing if the thread has no device. */
+static void icarus_shutdown(struct thr_info *thr)
+{
+	struct cgpu_info *dev = thr->cgpu;
+
+	if (!dev)
+		return;
+
+	/* free(NULL) is a no-op, so a missing path needs no special case */
+	free(dev->device_path);
+
+	close(dev->device_fd);
+
+	devices[dev->device_id] = NULL;
+	free(dev);
+
+	thr->cgpu = NULL;
+}
+
+/* Driver table for Icarus boards: binds the hooks defined in this file to
+ * the generic device_api interface. Hooks that are not listed stay zero
+ * (standard C designated-initializer behaviour). */
+struct device_api icarus_api = {
+	.dname = "icarus",
+	.name = "PGA",
+	.api_detect = icarus_detect,
+	.thread_prepare = icarus_prepare,
+	.scanhash = icarus_scanhash,
+	.thread_shutdown = icarus_shutdown,
+};

+ 1447 - 0
driver-opencl.c

@@ -0,0 +1,1447 @@
+/*
+ * Copyright 2011-2012 Con Kolivas
+ * Copyright 2011-2012 Luke Dashjr
+ * Copyright 2010 Jeff Garzik
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
+ */
+
+#include "config.h"
+
+#ifdef HAVE_CURSES
+#include <curses.h>
+#endif
+
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <sys/types.h>
+
+#ifndef WIN32
+#include <sys/resource.h>
+#endif
+#include <ccan/opt/opt.h>
+
+#include "compat.h"
+#include "miner.h"
+#include "driver-opencl.h"
+#include "findnonce.h"
+#include "ocl.h"
+#include "adl.h"
+
+/* TODO: cleanup externals ********************/
+
+#ifdef HAVE_CURSES
+extern WINDOW *mainwin, *statuswin, *logwin;
+extern void enable_curses(void);
+#endif
+
+extern int mining_threads;
+extern double total_secs;
+extern int opt_g_threads;
+extern bool ping;
+extern bool opt_loginput;
+extern char *opt_kernel_path;
+extern int gpur_thr_id;
+extern bool opt_noadl;
+extern bool have_opencl;
+
+
+
+extern void *miner_thread(void *userdata);
+extern int dev_from_id(int thr_id);
+extern void tailsprintf(char *f, const char *fmt, ...);
+extern void wlog(const char *f, ...);
+extern void decay_time(double *f, double fadd);
+
+
+/**********************************************/
+
+#ifdef HAVE_ADL
+extern float gpu_temp(int gpu);
+extern int gpu_fanspeed(int gpu);
+extern int gpu_fanpercent(int gpu);
+#endif
+
+
+#ifdef HAVE_OPENCL
+/* Option callback: parse a comma-separated list of vector widths (1, 2 or
+ * 4), one per GPU in order. A single value applies to every GPU. arg is
+ * modified by strtok. Returns NULL on success or a static error string;
+ * values parsed before an error stay applied. */
+char *set_vector(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set vector";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_vector";
+
+		val = atoi(nextptr);
+		if (val != 1 && val != 2 && val != 4)
+			return "Invalid value passed to set_vector";
+
+		gpus[device++].vwidth = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].vwidth = gpus[0].vwidth;
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of work sizes (1..9999),
+ * one per GPU in order. A single value applies to every GPU. arg is
+ * modified by strtok. Returns NULL on success or a static error string;
+ * values parsed before an error stay applied. */
+char *set_worksize(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set work size";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_worksize";
+
+		val = atoi(nextptr);
+		if (val < 1 || val > 9999)
+			return "Invalid value passed to set_worksize";
+
+		gpus[device++].work_size = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].work_size = gpus[0].work_size;
+	}
+
+	return NULL;
+}
+
+/* Map a kernel name given on the command line to its cl_kernels value.
+ * The comparison is exact and case-sensitive; unknown names yield
+ * KL_NONE. */
+static enum cl_kernels select_kernel(char *arg)
+{
+	static const struct {
+		const char *name;
+		enum cl_kernels kernel;
+	} known[] = {
+		{ "diablo",  KL_DIABLO  },
+		{ "diakgcn", KL_DIAKGCN },
+		{ "poclbm",  KL_POCLBM  },
+		{ "phatk",   KL_PHATK   },
+	};
+	size_t k;
+
+	for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+		if (strcmp(arg, known[k].name) == 0)
+			return known[k].kernel;
+	}
+	return KL_NONE;
+}
+
+/* Option callback: parse a comma-separated list of kernel names, one per
+ * GPU in order. A single name applies to every GPU. arg is modified by
+ * strtok. Returns NULL on success or a static error string; values parsed
+ * before an error stay applied. */
+char *set_kernel(char *arg)
+{
+	enum cl_kernels kern;
+	int i, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set kernel";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_kernel";
+
+		kern = select_kernel(nextptr);
+		if (kern == KL_NONE)
+			return "Invalid parameter to set_kernel";
+
+		gpus[device++].kernel = kern;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].kernel = gpus[0].kernel;
+	}
+
+	return NULL;
+}
+#endif
+
+#ifdef HAVE_ADL
+/* Parse "A-B" into *val1 = A and *val2 = B. A lone number "B" is read as
+ * the range 0-B. If nothing can be parsed, both outputs keep the values
+ * they had on entry. */
+void get_intrange(char *arg, int *val1, int *val2)
+{
+	int matched = sscanf(arg, "%d-%d", val1, val2);
+
+	if (matched != 1)
+		return;
+
+	/* only one number given: it is the upper bound, the lower one is 0 */
+	*val2 = *val1;
+	*val1 = 0;
+}
+
+/* Option callback: parse a comma-separated list of engine clocks, one per
+ * GPU in order. Each entry is either "max" or "min-max", both 0..9999. A
+ * single entry applies to every GPU. arg is modified by strtok. Returns
+ * NULL on success or a static error string; values parsed before an error
+ * stay applied. */
+char *set_gpu_engine(char *arg)
+{
+	int i, val1 = 0, val2 = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set gpu engine";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_gpu_engine";
+
+		get_intrange(nextptr, &val1, &val2);
+		if (val1 < 0 || val1 > 9999 || val2 < 0 || val2 > 9999)
+			return "Invalid value passed to set_gpu_engine";
+
+		gpus[device].min_engine = val1;
+		gpus[device].gpu_engine = val2;
+		device++;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = 1; i < MAX_GPUDEVICES; i++) {
+			gpus[i].min_engine = gpus[0].min_engine;
+			gpus[i].gpu_engine = gpus[0].gpu_engine;
+		}
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of fan speeds, one per GPU
+ * in order. Each entry is either "max" or "min-max", both 0..100. A single
+ * entry applies to every GPU. arg is modified by strtok. Returns NULL on
+ * success or a static error string; values parsed before an error stay
+ * applied. */
+char *set_gpu_fan(char *arg)
+{
+	int i, val1 = 0, val2 = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set gpu fan";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_gpu_fan";
+
+		get_intrange(nextptr, &val1, &val2);
+		if (val1 < 0 || val1 > 100 || val2 < 0 || val2 > 100)
+			return "Invalid value passed to set_gpu_fan";
+
+		gpus[device].min_fan = val1;
+		gpus[device].gpu_fan = val2;
+		device++;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = 1; i < MAX_GPUDEVICES; i++) {
+			gpus[i].min_fan = gpus[0].min_fan;
+			gpus[i].gpu_fan = gpus[0].gpu_fan;
+		}
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of memory clocks
+ * (0..9998), one per GPU in order. A single value applies to every GPU.
+ * arg is modified by strtok. Returns NULL on success or a static error
+ * string; values parsed before an error stay applied. */
+char *set_gpu_memclock(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set gpu memclock";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_gpu_memclock";
+
+		val = atoi(nextptr);
+		if (val < 0 || val >= 9999)
+			return "Invalid value passed to set_gpu_memclock";
+
+		gpus[device++].gpu_memclock = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].gpu_memclock = gpus[0].gpu_memclock;
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of memory clock offsets
+ * (-9999..9999), one per GPU in order. A single value applies to every
+ * GPU. arg is modified by strtok. Returns NULL on success or a static
+ * error string; values parsed before an error stay applied. */
+char *set_gpu_memdiff(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set gpu memdiff";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_gpu_memdiff";
+
+		val = atoi(nextptr);
+		if (val < -9999 || val > 9999)
+			return "Invalid value passed to set_gpu_memdiff";
+
+		gpus[device++].gpu_memdiff = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].gpu_memdiff = gpus[0].gpu_memdiff;
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of powertune values
+ * (-99..99), one per GPU in order. A single value applies to every GPU.
+ * arg is modified by strtok. Returns NULL on success or a static error
+ * string; values parsed before an error stay applied. */
+char *set_gpu_powertune(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set gpu powertune";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_gpu_powertune";
+
+		val = atoi(nextptr);
+		if (val < -99 || val > 99)
+			return "Invalid value passed to set_gpu_powertune";
+
+		gpus[device++].gpu_powertune = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].gpu_powertune = gpus[0].gpu_powertune;
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of core voltages (floating
+ * point, 0 up to but excluding 9999), one per GPU in order. A single value
+ * applies to every GPU. arg is modified by strtok. Returns NULL on success
+ * or a static error string; values parsed before an error stay applied. */
+char *set_gpu_vddc(char *arg)
+{
+	int i, device = 0;
+	float val = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set gpu vddc";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set_gpu_vddc";
+
+		val = atof(nextptr);
+		if (val < 0 || val >= 9999)
+			return "Invalid value passed to set_gpu_vddc";
+
+		gpus[device++].gpu_vddc = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].gpu_vddc = gpus[0].gpu_vddc;
+	}
+
+	return NULL;
+}
+
+/* Option callback: parse a comma-separated list of overheat temperatures
+ * (0..200), one per GPU in order. A single value applies to every GPU.
+ * arg is modified by strtok. Returns NULL on success or a static error
+ * string; values parsed before an error stay applied. */
+char *set_temp_overheat(char *arg)
+{
+	int i, val = 0, device = 0;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set temp overheat";
+
+	do {
+		/* gpus[] is only ever indexed below MAX_GPUDEVICES; an
+		 * over-long list must not write past that */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set temp overheat";
+
+		val = atoi(nextptr);
+		if (val < 0 || val > 200)
+			return "Invalid value passed to set temp overheat";
+
+		gpus[device++].adl.overtemp = val;
+	} while ((nextptr = strtok(NULL, ",")) != NULL);
+
+	if (device == 1) {
+		/* val still holds the one value that was given */
+		for (i = device; i < MAX_GPUDEVICES; i++)
+			gpus[i].adl.overtemp = val;
+	}
+
+	return NULL;
+}
+
+/* Parse a comma separated list of target temperatures from arg and store
+ * them in gpus[n].adl.targettemp in order. Each value must lie in [0, 200].
+ * When exactly one value is given it is copied to every remaining GPU slot.
+ * arg is modified in place by strtok().
+ * Returns NULL on success or a static error string on failure. */
+char *set_temp_target(char *arg)
+{
+	int i, val = 0, device = 0, *tt;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set temp target";
+	val = atoi(nextptr);
+	if (val < 0 || val > 200)
+		return "Invalid value passed to set temp target";
+
+	tt = &gpus[device++].adl.targettemp;
+	*tt = val;
+
+	while ((nextptr = strtok(NULL, ",")) != NULL) {
+		/* gpus[] only has MAX_GPUDEVICES slots; refuse to write past it */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set temp target";
+
+		val = atoi(nextptr);
+		if (val < 0 || val > 200)
+			return "Invalid value passed to set temp target";
+
+		tt = &gpus[device++].adl.targettemp;
+		*tt = val;
+	}
+	/* A single value applies to all devices; val still holds it here
+	 * because the loop above never ran */
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++) {
+			tt = &gpus[i].adl.targettemp;
+			*tt = val;
+		}
+	}
+
+	return NULL;
+}
+#endif
+#ifdef HAVE_OPENCL
+/* Parse a comma separated list of intensities from arg. Each entry is either
+ * a token starting with 'd'/'D' (dynamic mode) or an integer within
+ * [MIN_INTENSITY, MAX_INTENSITY]. Entries are applied to gpus[0], gpus[1], ...
+ * in order. When exactly one entry is given, its dynamic flag and intensity
+ * are copied to every remaining GPU slot.
+ * arg is modified in place by strtok().
+ * Returns NULL on success or a static error string on failure. */
+char *set_intensity(char *arg)
+{
+	/* val must be an int: a char would truncate atoi()'s result before
+	 * the range check, letting out-of-range input wrap into range */
+	int i, device = 0, val = 0, *tt;
+	char *nextptr;
+
+	nextptr = strtok(arg, ",");
+	if (nextptr == NULL)
+		return "Invalid parameters for set intensity";
+	if (!strncasecmp(nextptr, "d", 1))
+		gpus[device].dynamic = true;
+	else {
+		gpus[device].dynamic = false;
+		val = atoi(nextptr);
+		if (val < MIN_INTENSITY || val > MAX_INTENSITY)
+			return "Invalid value passed to set intensity";
+		tt = &gpus[device].intensity;
+		*tt = val;
+	}
+
+	device++;
+
+	while ((nextptr = strtok(NULL, ",")) != NULL) {
+		/* gpus[] only has MAX_GPUDEVICES slots; refuse to write past it */
+		if (device >= MAX_GPUDEVICES)
+			return "Too many values passed to set intensity";
+
+		if (!strncasecmp(nextptr, "d", 1))
+			gpus[device].dynamic = true;
+		else {
+			gpus[device].dynamic = false;
+			val = atoi(nextptr);
+			if (val < MIN_INTENSITY || val > MAX_INTENSITY)
+				return "Invalid value passed to set intensity";
+
+			tt = &gpus[device].intensity;
+			*tt = val;
+		}
+		device++;
+	}
+	/* A single entry applies to all devices */
+	if (device == 1) {
+		for (i = device; i < MAX_GPUDEVICES; i++) {
+			gpus[i].dynamic = gpus[0].dynamic;
+			gpus[i].intensity = gpus[0].intensity;
+		}
+	}
+
+	return NULL;
+}
+#endif
+
+
+#ifdef HAVE_OPENCL
+struct device_api opencl_api;
+
+/* Run OpenCL device detection with log output forced on, log the value
+ * pointed to by ndevs and terminate the process using that value as the
+ * exit status. Although declared to return char *, this function never
+ * returns: it always ends in exit(). */
+char *print_ndevs_and_exit(int *ndevs)
+{
+	opt_log_output = true;
+	opencl_api.api_detect();
+	/* *ndevs is read only after detection has run */
+	clear_adl(*ndevs);
+	applog(LOG_INFO, "%i GPU devices max detected", *ndevs);
+	exit(*ndevs);
+}
+#endif
+
+
+struct cgpu_info gpus[MAX_GPUDEVICES]; /* Maximum number apparently possible */
+struct cgpu_info *cpus;
+
+
+
+#ifdef HAVE_OPENCL
+
+/* In dynamic mode, only the first thread of each device will be in use.
+ * This potentially could start a thread that was stopped with the start-stop
+ * options if one were to disable dynamic from the menu on a paused GPU */
+void pause_dynamic_threads(int gpu)
+{
+	struct cgpu_info *dev = &gpus[gpu];
+	bool seen_first = false;
+	int idx;
+
+	/* Walk every mining thread, acting only on those owned by this GPU.
+	 * The first such thread is always left alone; every later one has its
+	 * pause flag synchronised with the device's dynamic setting. */
+	for (idx = 0; idx < mining_threads; idx++) {
+		struct thr_info *worker = &thr_info[idx];
+
+		if (worker->cgpu == dev) {
+			if (seen_first) {
+				/* Only warn on the running -> paused transition */
+				if (dev->dynamic && !worker->pause) {
+					applog(LOG_WARNING, "Disabling extra threads due to dynamic mode.");
+					applog(LOG_WARNING, "Tune dynamic intensity with --gpu-dyninterval");
+				}
+
+				worker->pause = dev->dynamic;
+				/* Wake the thread if it is now allowed to run */
+				if (dev->deven != DEV_DISABLED && !dev->dynamic)
+					tq_push(worker->q, &ping);
+			} else
+				seen_first = true;
+		}
+	}
+}
+
+
+struct device_api opencl_api;
+
+#endif /* HAVE_OPENCL */
+
+#if defined(HAVE_OPENCL) && defined(HAVE_CURSES)
+/* Interactive curses menu for GPU management. Prints a status summary for
+ * every GPU and its threads, then reads one key press:
+ *   e - enable a GPU          d - disable a GPU
+ *   i - change intensity      r - restart a GPU
+ *   c - change ADL settings (only offered when adl_active)
+ * Any other key leaves the menu. Each handled command redraws the summary
+ * and prompts again. Does nothing when opt_g_threads is zero. */
+void manage_gpu(void)
+{
+	struct thr_info *thr;
+	int selected, gpu, i;
+	char checkin[40];
+	char input;
+
+	if (!opt_g_threads)
+		return;
+
+	opt_loginput = true;
+	immedok(logwin, true);
+	clear_logwin();
+retry:
+
+	for (gpu = 0; gpu < nDevs; gpu++) {
+		struct cgpu_info *cgpu = &gpus[gpu];
+
+		wlog("GPU %d: %.1f / %.1f Mh/s | A:%d  R:%d  HW:%d  U:%.2f/m  I:%d\n",
+			gpu, cgpu->rolling, cgpu->total_mhashes / total_secs,
+			cgpu->accepted, cgpu->rejected, cgpu->hw_errors,
+			cgpu->utility, cgpu->intensity);
+#ifdef HAVE_ADL
+		if (gpus[gpu].has_adl) {
+			int engineclock = 0, memclock = 0, activity = 0, fanspeed = 0, fanpercent = 0, powertune = 0;
+			float temp = 0, vddc = 0;
+
+			if (gpu_stats(gpu, &temp, &engineclock, &memclock, &vddc, &activity, &fanspeed, &fanpercent, &powertune)) {
+				char logline[255];
+
+				strcpy(logline, ""); // In case it has no data
+				if (temp != -1)
+					sprintf(logline, "%.1f C  ", temp);
+				if (fanspeed != -1 || fanpercent != -1) {
+					tailsprintf(logline, "F: ");
+					if (fanpercent != -1)
+						tailsprintf(logline, "%d%% ", fanpercent);
+					if (fanspeed != -1)
+						tailsprintf(logline, "(%d RPM) ", fanspeed);
+					tailsprintf(logline, " ");
+				}
+				if (engineclock != -1)
+					tailsprintf(logline, "E: %d MHz  ", engineclock);
+				if (memclock != -1)
+					tailsprintf(logline, "M: %d Mhz  ", memclock);
+				if (vddc != -1)
+					tailsprintf(logline, "V: %.3fV  ", vddc);
+				if (activity != -1)
+					tailsprintf(logline, "A: %d%%  ", activity);
+				if (powertune != -1)
+					tailsprintf(logline, "P: %d%%", powertune);
+				tailsprintf(logline, "\n");
+				/* logline now holds literal '%' characters, so it
+				 * must be passed as data and never as the format */
+				wlog("%s", logline);
+			}
+		}
+#endif
+		wlog("Last initialised: %s\n", cgpu->init);
+		wlog("Intensity: ");
+		if (gpus[gpu].dynamic)
+			wlog("Dynamic (only one thread in use)\n");
+		else
+			wlog("%d\n", gpus[gpu].intensity);
+		for (i = 0; i < mining_threads; i++) {
+			thr = &thr_info[i];
+			if (thr->cgpu != cgpu)
+				continue;
+			get_datestamp(checkin, &thr->last);
+			wlog("Thread %d: %.1f Mh/s %s ", i, thr->rolling, cgpu->deven != DEV_DISABLED ? "Enabled" : "Disabled");
+			switch (cgpu->status) {
+				default:
+				case LIFE_WELL:
+					wlog("ALIVE");
+					break;
+				case LIFE_SICK:
+					wlog("SICK reported in %s", checkin);
+					break;
+				case LIFE_DEAD:
+					wlog("DEAD reported in %s", checkin);
+					break;
+				case LIFE_NOSTART:
+					wlog("Never started");
+					break;
+			}
+			if (thr->pause)
+				wlog(" paused");
+			wlog("\n");
+		}
+		wlog("\n");
+	}
+
+	wlogprint("[E]nable [D]isable [I]ntensity [R]estart GPU %s\n",adl_active ? "[C]hange settings" : "");
+
+	wlogprint("Or press any other key to continue\n");
+	input = getch();
+
+	/* With a single GPU there is nothing to choose; otherwise -1 makes
+	 * each command prompt for a device number */
+	if (nDevs == 1)
+		selected = 0;
+	else
+		selected = -1;
+	if (!strncasecmp(&input, "e", 1)) {
+		struct cgpu_info *cgpu;
+
+		if (selected)
+			selected = curses_int("Select GPU to enable");
+		if (selected < 0 || selected >= nDevs) {
+			wlogprint("Invalid selection\n");
+			goto retry;
+		}
+		if (gpus[selected].deven != DEV_DISABLED) {
+			wlogprint("Device already enabled\n");
+			goto retry;
+		}
+		gpus[selected].deven = DEV_ENABLED;
+		/* Wake every OpenCL thread belonging to the selected device */
+		for (i = 0; i < mining_threads; ++i) {
+			thr = &thr_info[i];
+			cgpu = thr->cgpu;
+			if (cgpu->api != &opencl_api)
+				continue;
+			if (dev_from_id(i) != selected)
+				continue;
+			if (cgpu->status != LIFE_WELL) {
+				wlogprint("Must restart device before enabling it\n");
+				goto retry;
+			}
+			applog(LOG_DEBUG, "Pushing ping to thread %d", thr->id);
+
+			tq_push(thr->q, &ping);
+		}
+		goto retry;
+	} else if (!strncasecmp(&input, "d", 1)) {
+		if (selected)
+			selected = curses_int("Select GPU to disable");
+		if (selected < 0 || selected >= nDevs) {
+			wlogprint("Invalid selection\n");
+			goto retry;
+		}
+		if (gpus[selected].deven == DEV_DISABLED) {
+			wlogprint("Device already disabled\n");
+			goto retry;
+		}
+		gpus[selected].deven = DEV_DISABLED;
+		goto retry;
+	} else if (!strncasecmp(&input, "i", 1)) {
+		int intensity;
+		char *intvar;
+
+		if (selected)
+			selected = curses_int("Select GPU to change intensity on");
+		if (selected < 0 || selected >= nDevs) {
+			wlogprint("Invalid selection\n");
+			goto retry;
+		}
+		intvar = curses_input("Set GPU scan intensity (d or " _MIN_INTENSITY_STR " -> " _MAX_INTENSITY_STR ")");
+		if (!intvar) {
+			wlogprint("Invalid input\n");
+			goto retry;
+		}
+		if (!strncasecmp(intvar, "d", 1)) {
+			wlogprint("Dynamic mode enabled on gpu %d\n", selected);
+			gpus[selected].dynamic = true;
+			pause_dynamic_threads(selected);
+			free(intvar);
+			goto retry;
+		}
+		intensity = atoi(intvar);
+		free(intvar);
+		if (intensity < MIN_INTENSITY || intensity > MAX_INTENSITY) {
+			wlogprint("Invalid selection\n");
+			goto retry;
+		}
+		gpus[selected].dynamic = false;
+		gpus[selected].intensity = intensity;
+		wlogprint("Intensity on gpu %d set to %d\n", selected, intensity);
+		pause_dynamic_threads(selected);
+		goto retry;
+	} else if (!strncasecmp(&input, "r", 1)) {
+		if (selected)
+			selected = curses_int("Select GPU to attempt to restart");
+		if (selected < 0 || selected >= nDevs) {
+			wlogprint("Invalid selection\n");
+			goto retry;
+		}
+		wlogprint("Attempting to restart threads of GPU %d\n", selected);
+		reinit_device(&gpus[selected]);
+		goto retry;
+	} else if (adl_active && (!strncasecmp(&input, "c", 1))) {
+		if (selected)
+			selected = curses_int("Select GPU to change settings on");
+		if (selected < 0 || selected >= nDevs) {
+			wlogprint("Invalid selection\n");
+			goto retry;
+		}
+		change_gpusettings(selected);
+		goto retry;
+	} else
+		clear_logwin();
+
+	immedok(logwin, false);
+	opt_loginput = false;
+}
+#else
+/* Stub used when OpenCL or curses support is not compiled in: there is no
+ * interactive GPU menu, so this does nothing. */
+void manage_gpu(void)
+{
+}
+#endif
+
+
+#ifdef HAVE_OPENCL
+/* OpenCL state per mining thread; the code in this file indexes it by
+ * thread id */
+static _clState *clStates[MAX_GPUDEVICES];
+
+/* Helpers for the queue_*_kernel() functions below. Each one sets the next
+ * kernel argument and expects these locals in the enclosing scope:
+ *   kernel (cl_kernel *), num (running argument index, post-incremented),
+ *   status (cl_int, results are OR-ed together) and, for CL_SET_BLKARG,
+ *   blk (dev_blk_ctx *).
+ * CL_SET_BLKARG passes one uint member of *blk, CL_SET_ARG passes a variable
+ * by its own size, CL_SET_VARG passes an array of `args` uints. */
+#define CL_SET_BLKARG(blkvar) status |= clSetKernelArg(*kernel, num++, sizeof(uint), (void *)&blk->blkvar)
+#define CL_SET_ARG(var) status |= clSetKernelArg(*kernel, num++, sizeof(var), (void *)&var)
+#define CL_SET_VARG(args, var) status |= clSetKernelArg(*kernel, num++, args * sizeof(uint), (void *)var)
+
+/* Set all kernel arguments for the poclbm kernel from the precalculated
+ * block context. Arguments are numbered in the order the CL_SET_* lines
+ * appear, so this order must not be changed independently of the kernel.
+ * threads is the global work size; it is only used to space the base nonces
+ * when the kernel is not using a global offset (clState->goffset unset).
+ * Returns the OR of all clSetKernelArg() results (0 when every call
+ * succeeded). */
+static cl_int queue_poclbm_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads)
+{
+	cl_kernel *kernel = &clState->kernel;
+	unsigned int num = 0;
+	cl_int status = 0;
+
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+
+	
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+	CL_SET_BLKARG(cty_h);
+
+	if (!clState->goffset) {
+		cl_uint vwidth = clState->vwidth;
+		uint *nonces = alloca(sizeof(uint) * vwidth);
+		unsigned int i;
+
+		/* One base nonce per vector lane, each lane offset by a
+		 * full set of threads */
+		for (i = 0; i < vwidth; i++)
+			nonces[i] = blk->nonce + (i * threads);
+		CL_SET_VARG(vwidth, nonces);
+	}
+
+	CL_SET_BLKARG(fW0);
+	CL_SET_BLKARG(fW1);
+	CL_SET_BLKARG(fW2);
+	CL_SET_BLKARG(fW3);
+	CL_SET_BLKARG(fW15);
+	CL_SET_BLKARG(fW01r);
+
+	CL_SET_BLKARG(D1A);
+	CL_SET_BLKARG(C1addK5);
+	CL_SET_BLKARG(B1addK6);
+	CL_SET_BLKARG(W16addK16);
+	CL_SET_BLKARG(W17addK17);
+	CL_SET_BLKARG(PreVal4addT1);
+	CL_SET_BLKARG(PreVal0);
+
+	/* The result buffer is always the last argument */
+	CL_SET_ARG(clState->outputBuffer);
+
+	return status;
+}
+
+/* Set all kernel arguments for the phatk kernel from the precalculated
+ * block context. Arguments are numbered in the order the CL_SET_* lines
+ * appear, so this order must not be changed independently of the kernel.
+ * threads is unused here.
+ * Returns the OR of all clSetKernelArg() results (0 when every call
+ * succeeded). */
+static cl_int queue_phatk_kernel(_clState *clState, dev_blk_ctx *blk,
+				 __maybe_unused cl_uint threads)
+{
+	cl_kernel *kernel = &clState->kernel;
+	cl_uint vwidth = clState->vwidth;
+	unsigned int i, num = 0;
+	cl_int status = 0;
+	uint *nonces;
+
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+	CL_SET_BLKARG(cty_d);
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+	CL_SET_BLKARG(cty_h);
+
+	/* One base nonce per vector lane: consecutive values starting at
+	 * blk->nonce */
+	nonces = alloca(sizeof(uint) * vwidth);
+	for (i = 0; i < vwidth; i++)
+		nonces[i] = blk->nonce + i;
+	CL_SET_VARG(vwidth, nonces);
+
+	CL_SET_BLKARG(W16);
+	CL_SET_BLKARG(W17);
+	CL_SET_BLKARG(PreVal4_2);
+	CL_SET_BLKARG(PreVal0);
+	CL_SET_BLKARG(PreW18);
+	CL_SET_BLKARG(PreW19);
+	CL_SET_BLKARG(PreW31);
+	CL_SET_BLKARG(PreW32);
+
+	/* The result buffer is always the last argument */
+	CL_SET_ARG(clState->outputBuffer);
+
+	return status;
+}
+
+/* Set all kernel arguments for the diakgcn kernel from the precalculated
+ * block context. Arguments are numbered in the order the CL_SET_* lines
+ * appear, so this order must not be changed independently of the kernel.
+ * threads is unused here.
+ * Returns the OR of all clSetKernelArg() results (0 when every call
+ * succeeded). */
+static cl_int queue_diakgcn_kernel(_clState *clState, dev_blk_ctx *blk,
+				   __maybe_unused cl_uint threads)
+{
+	cl_kernel *kernel = &clState->kernel;
+	cl_uint vwidth = clState->vwidth;
+	unsigned int i, num = 0;
+	cl_int status = 0;
+	uint *nonces;
+
+	/* One base nonce per vector lane: consecutive values starting at
+	 * blk->nonce. For this kernel the nonces are the first argument. */
+	nonces = alloca(sizeof(uint) * vwidth);
+	for (i = 0; i < vwidth; i++)
+		nonces[i] = blk->nonce + i;
+	CL_SET_VARG(vwidth, nonces);
+
+	CL_SET_BLKARG(PreVal0);
+	CL_SET_BLKARG(PreVal4_2);
+	CL_SET_BLKARG(cty_h);
+	CL_SET_BLKARG(D1A);
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+	CL_SET_BLKARG(C1addK5);
+	CL_SET_BLKARG(B1addK6);
+	CL_SET_BLKARG(PreVal0addK7);
+	CL_SET_BLKARG(W16addK16);
+	CL_SET_BLKARG(W17addK17);
+	CL_SET_BLKARG(PreW18);
+	CL_SET_BLKARG(PreW19);
+	CL_SET_BLKARG(W16);
+	CL_SET_BLKARG(W17);
+	CL_SET_BLKARG(PreW31);
+	CL_SET_BLKARG(PreW32);
+
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+
+	CL_SET_BLKARG(zeroA);
+	CL_SET_BLKARG(zeroB);
+
+	CL_SET_BLKARG(oneA);
+	CL_SET_BLKARG(twoA);
+	CL_SET_BLKARG(threeA);
+	CL_SET_BLKARG(fourA);
+	CL_SET_BLKARG(fiveA);
+	CL_SET_BLKARG(sixA);
+	CL_SET_BLKARG(sevenA);
+
+	/* The result buffer is always the last argument */
+	CL_SET_ARG(clState->outputBuffer);
+
+	return status;
+}
+
+/* Set all kernel arguments for the diablo kernel from the precalculated
+ * block context. Arguments are numbered in the order the CL_SET_* lines
+ * appear, so this order must not be changed independently of the kernel.
+ * threads is the global work size; it is only used to space the base nonces
+ * when the kernel is not using a global offset (clState->goffset unset).
+ * Returns the OR of all clSetKernelArg() results (0 when every call
+ * succeeded). */
+static cl_int queue_diablo_kernel(_clState *clState, dev_blk_ctx *blk, cl_uint threads)
+{
+	cl_kernel *kernel = &clState->kernel;
+	unsigned int num = 0;
+	cl_int status = 0;
+
+	if (!clState->goffset) {
+		cl_uint vwidth = clState->vwidth;
+		uint *nonces = alloca(sizeof(uint) * vwidth);
+		unsigned int i;
+
+		/* One base nonce per vector lane, each lane offset by a
+		 * full set of threads */
+		for (i = 0; i < vwidth; i++)
+			nonces[i] = blk->nonce + (i * threads);
+		CL_SET_VARG(vwidth, nonces);
+	}
+
+
+	CL_SET_BLKARG(PreVal0);
+	CL_SET_BLKARG(PreVal0addK7);
+	CL_SET_BLKARG(PreVal4addT1);
+	CL_SET_BLKARG(PreW18);
+	CL_SET_BLKARG(PreW19);
+	CL_SET_BLKARG(W16);
+	CL_SET_BLKARG(W17);
+	CL_SET_BLKARG(W16addK16);
+	CL_SET_BLKARG(W17addK17);
+	CL_SET_BLKARG(PreW31);
+	CL_SET_BLKARG(PreW32);
+
+	CL_SET_BLKARG(D1A);
+	CL_SET_BLKARG(cty_b);
+	CL_SET_BLKARG(cty_c);
+	CL_SET_BLKARG(cty_h);
+	CL_SET_BLKARG(cty_f);
+	CL_SET_BLKARG(cty_g);
+
+	CL_SET_BLKARG(C1addK5);
+	CL_SET_BLKARG(B1addK6);
+
+	CL_SET_BLKARG(ctx_a);
+	CL_SET_BLKARG(ctx_b);
+	CL_SET_BLKARG(ctx_c);
+	CL_SET_BLKARG(ctx_d);
+	CL_SET_BLKARG(ctx_e);
+	CL_SET_BLKARG(ctx_f);
+	CL_SET_BLKARG(ctx_g);
+	CL_SET_BLKARG(ctx_h);
+
+	/* The result buffer is always the last argument */
+	CL_SET_ARG(clState->outputBuffer);
+
+	return status;
+}
+
+/* Derive the work sizes for one kernel launch from the intensity setting.
+ * The thread count is 2^(15 + intensity), raised to minthreads if smaller.
+ * Outputs: *threads and *globalThreads receive the thread count, *hashes
+ * receives the thread count multiplied by the vector width. */
+static void set_threads_hashes(unsigned int vectors, unsigned int *threads,
+			       unsigned int *hashes, size_t *globalThreads,
+			       unsigned int minthreads, int intensity)
+{
+	unsigned int count = 1 << (15 + intensity);
+
+	if (count < minthreads)
+		count = minthreads;
+
+	*threads = count;
+	*globalThreads = count;
+	*hashes = count * vectors;
+}
+#endif /* HAVE_OPENCL */
+
+
+#ifdef HAVE_OPENCL
+/* We have only one thread that ever re-initialises GPUs, thus if any GPU
+ * init command fails due to a completely wedged GPU, the thread will never
+ * return, unable to harm other GPUs. If it does return, it means we only had
+ * a soft failure and then the reinit_gpu thread is ready to tackle another
+ * GPU */
+/* Thread body of the GPU re-initialisation worker. userdata is this
+ * worker's own struct thr_info; devices to restart are popped from its
+ * queue as struct cgpu_info pointers. For each device, every OpenCL mining
+ * thread on it is cancelled, given a fresh queue and OpenCL state,
+ * recreated and pinged. The loop ends (returning NULL) when tq_pop() yields
+ * NULL, when the number of OpenCL devices no longer matches nDevs, or when a
+ * thread cannot be recreated. */
+void *reinit_gpu(void *userdata)
+{
+	struct thr_info *mythr = userdata;
+	/* sel_cgpu is the device being restarted; cgpu is scratch that the
+	 * per-thread loops below overwrite on every iteration */
+	struct cgpu_info *cgpu, *sel_cgpu;
+	struct thr_info *thr;
+	struct timeval now;
+	char name[256];
+	int thr_id;
+	int gpu;
+
+	pthread_detach(pthread_self());
+
+select_cgpu:
+	sel_cgpu = tq_pop(mythr->q, NULL);
+	if (!sel_cgpu)
+		goto out;
+
+	if (clDevicesNum() != nDevs) {
+		applog(LOG_WARNING, "Hardware not reporting same number of active devices, will not attempt to restart GPU");
+		goto out;
+	}
+
+	gpu = sel_cgpu->device_id;
+
+	/* Pass 1: cancel every OpenCL thread running on this device */
+	for (thr_id = 0; thr_id < mining_threads; ++thr_id) {
+		thr = &thr_info[thr_id];
+		cgpu = thr->cgpu;
+		if (cgpu->api != &opencl_api)
+			continue;
+		if (dev_from_id(thr_id) != gpu)
+			continue;
+
+		thr->rolling = thr->cgpu->rolling = 0;
+		/* Reports the last time we tried to revive a sick GPU */
+		gettimeofday(&thr->sick, NULL);
+		if (!pthread_cancel(thr->pth)) {
+			applog(LOG_WARNING, "Thread %d still exists, killing it off", thr_id);
+		} else
+			applog(LOG_WARNING, "Thread %d no longer exists", thr_id);
+	}
+
+	/* Pass 2: rebuild queue and OpenCL state, then restart each thread */
+	for (thr_id = 0; thr_id < mining_threads; ++thr_id) {
+		int virtual_gpu;
+
+		thr = &thr_info[thr_id];
+		cgpu = thr->cgpu;
+		if (cgpu->api != &opencl_api)
+			continue;
+		if (dev_from_id(thr_id) != gpu)
+			continue;
+
+		virtual_gpu = cgpu->virtual_gpu;
+		/* Lose this ram cause we may get stuck here! */
+		//tq_freeze(thr->q);
+
+		thr->q = tq_new();
+		if (!thr->q)
+			quit(1, "Failed to tq_new in reinit_gpu");
+
+		/* Lose this ram cause we may dereference in the dying thread! */
+		//free(clState);
+
+		applog(LOG_INFO, "Reinit GPU thread %d", thr_id);
+		clStates[thr_id] = initCl(virtual_gpu, name, sizeof(name));
+		if (!clStates[thr_id]) {
+			applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id);
+			goto select_cgpu;
+		}
+		applog(LOG_INFO, "initCl() finished. Found %s", name);
+
+		if (unlikely(thr_info_create(thr, NULL, miner_thread, thr))) {
+			applog(LOG_ERR, "thread %d create failed", thr_id);
+			return NULL;
+		}
+		applog(LOG_WARNING, "Thread %d restarted", thr_id);
+	}
+
+	/* Stamp the device that was actually restarted. Using the scratch
+	 * cgpu here would stamp whichever device owns the last mining
+	 * thread, which need not be this GPU. */
+	gettimeofday(&now, NULL);
+	get_datestamp(sel_cgpu->init, &now);
+
+	/* Pass 3: wake the restarted threads */
+	for (thr_id = 0; thr_id < mining_threads; ++thr_id) {
+		thr = &thr_info[thr_id];
+		cgpu = thr->cgpu;
+		if (cgpu->api != &opencl_api)
+			continue;
+		if (dev_from_id(thr_id) != gpu)
+			continue;
+
+		tq_push(thr->q, &ping);
+	}
+
+	goto select_cgpu;
+out:
+	return NULL;
+}
+#else
+/* Stub used when OpenCL support is not compiled in: there is nothing to
+ * reinitialise, so the thread returns immediately. userdata is ignored. */
+void *reinit_gpu(void *userdata)
+{
+	return NULL;
+}
+#endif
+
+
+#ifdef HAVE_OPENCL
+struct device_api opencl_api;
+
+/* Device detection hook for the OpenCL driver. Stores the device count in
+ * nDevs (an error from clDevicesNum() counts as zero, and the count is
+ * capped to the device slots still free), registers one cgpu per device
+ * and, unless opt_noadl is set, initialises ADL. */
+static void opencl_detect()
+{
+	int room, idx;
+
+	nDevs = clDevicesNum();
+	if (nDevs < 0) {
+		applog(LOG_ERR, "clDevicesNum returned error, no GPUs usable");
+		nDevs = 0;
+	}
+
+	/* Never register more devices than there is room left for */
+	room = MAX_DEVICES - total_devices;
+	if (nDevs > room)
+		nDevs = room;
+
+	if (nDevs == 0)
+		return;
+
+	for (idx = 0; idx < nDevs; ++idx) {
+		struct cgpu_info *dev = &gpus[idx];
+
+		dev->deven = DEV_ENABLED;
+		dev->api = &opencl_api;
+		dev->device_id = idx;
+		dev->threads = opt_g_threads;
+		dev->virtual_gpu = idx;
+		add_cgpu(dev);
+	}
+
+	if (opt_noadl)
+		return;
+
+	init_adl(nDevs);
+}
+
+/* reinit_device hook: hand the GPU over to the reinit thread by pushing it
+ * onto that thread's queue (thr_info[gpur_thr_id]). */
+static void reinit_opencl_device(struct cgpu_info *gpu)
+{
+	struct thr_info *reinit_thr = &thr_info[gpur_thr_id];
+
+	tq_push(reinit_thr->q, gpu);
+}
+
+#ifdef HAVE_ADL
+/* get_statline_before hook: for GPUs with ADL support, append the
+ * temperature and fan columns followed by "| " to buf. Each column is
+ * replaced by blanks when its reading is unavailable (-1). The fan column
+ * shows RPM when known and falls back to a percentage otherwise. Nothing is
+ * appended for GPUs without ADL. */
+static void get_opencl_statline_before(char *buf, struct cgpu_info *gpu)
+{
+	if (gpu->has_adl) {
+		int gpuid = gpu->device_id;
+		float gt = gpu_temp(gpuid);
+		int gf = gpu_fanspeed(gpuid);
+		int gp;
+
+		if (gt != -1)
+			tailsprintf(buf, "%5.1fC ", gt);
+		else
+			/* Blank padding only: no conversion, so no argument */
+			tailsprintf(buf, "       ");
+		if (gf != -1)
+			tailsprintf(buf, "%4dRPM ", gf);
+		else if ((gp = gpu_fanpercent(gpuid)) != -1)
+			tailsprintf(buf, "%3d%%    ", gp);
+		else
+			tailsprintf(buf, "        ");
+		tailsprintf(buf, "| ");
+	}
+}
+#endif
+
+/* get_statline hook: append the GPU's current intensity to buf */
+static void get_opencl_statline(char *buf, struct cgpu_info *gpu)
+{
+	const int level = gpu->intensity;
+
+	tailsprintf(buf, " I:%2d", level);
+}
+
+/* Per-thread driver state, allocated in opencl_thread_init() and stored in
+ * thr->cgpu_data */
+struct opencl_thread_data {
+	/* Kernel-specific function that sets all kernel arguments */
+	cl_int (*queue_kernel_parameters)(_clState *, dev_blk_ctx *, cl_uint);
+	/* Host copy of the kernel's output buffer (BUFFERSIZE bytes) */
+	uint32_t *res;
+	/* Set to &_last_work when results are pending for retired work,
+	 * NULL otherwise */
+	struct work *last_work;
+	/* Storage for the copy of the retired work item */
+	struct work _last_work;
+};
+
+static uint32_t *blank_res;
+
+/* thread_prepare hook: build the OpenCL state for one mining thread.
+ * Allocates the shared zeroed blank_res buffer on first use, calls initCl()
+ * for the thread's virtual GPU and records the device name, kernel name and
+ * init timestamp on the cgpu.
+ * Returns true on success. On initCl() failure the device is marked
+ * DEV_DISABLED / LIFE_NOSTART and false is returned; false is also returned
+ * when blank_res cannot be allocated. */
+static bool opencl_thread_prepare(struct thr_info *thr)
+{
+	char name[256];
+	struct timeval now;
+	struct cgpu_info *cgpu = thr->cgpu;
+	int gpu = cgpu->device_id;
+	int virtual_gpu = cgpu->virtual_gpu;
+	int i = thr->id;
+	/* Ensures the restart advice below is only shown once */
+	static bool failmessage = false;
+
+	if (!blank_res)
+		blank_res = calloc(BUFFERSIZE, 1);
+	if (!blank_res) {
+		applog(LOG_ERR, "Failed to calloc in opencl_thread_prepare");
+		return false;
+	}
+
+	strcpy(name, "");
+	applog(LOG_INFO, "Init GPU thread %i GPU %i virtual GPU %i", i, gpu, virtual_gpu);
+	clStates[i] = initCl(virtual_gpu, name, sizeof(name));
+	if (!clStates[i]) {
+		if (use_curses)
+			enable_curses();
+		applog(LOG_ERR, "Failed to init GPU thread %d, disabling device %d", i, gpu);
+		if (!failmessage) {
+			applog(LOG_ERR, "Restarting the GPU from the menu will not fix this.");
+			applog(LOG_ERR, "Try restarting cgminer.");
+			failmessage = true;
+#ifdef HAVE_CURSES
+			if (use_curses) {
+				/* Only the key press matters; discard the text */
+				char *buf = curses_input("Press enter to continue");
+
+				if (buf)
+					free(buf);
+			}
+#endif
+		}
+		cgpu->deven = DEV_DISABLED;
+		cgpu->status = LIFE_NOSTART;
+
+		cgpu->device_last_not_well = time(NULL);
+		cgpu->device_not_well_reason = REASON_DEV_NOSTART;
+		cgpu->dev_nostart_count++;
+
+		return false;
+	}
+	if (!cgpu->name)
+		cgpu->name = strdup(name);
+	if (!cgpu->kname)
+	{
+		switch (clStates[i]->chosen_kernel) {
+		case KL_DIABLO:
+			cgpu->kname = "diablo";
+			break;
+		case KL_DIAKGCN:
+			cgpu->kname = "diakgcn";
+			break;
+		case KL_PHATK:
+			cgpu->kname = "phatk";
+			break;
+		case KL_POCLBM:
+			cgpu->kname = "poclbm";
+			/* fall through */
+		default:
+			break;
+		}
+	}
+	applog(LOG_INFO, "initCl() finished. Found %s", name);
+	gettimeofday(&now, NULL);
+	get_datestamp(cgpu->init, &now);
+
+	have_opencl = true;
+
+	return true;
+}
+
+/* thread_init hook: allocate the per-thread driver data, select the
+ * argument-queueing function matching the chosen kernel, allocate the host
+ * result buffer and clear the device output buffer with a blocking write of
+ * blank_res. On success the device is marked LIFE_WELL.
+ * Returns true on success, false on allocation or OpenCL failure. */
+static bool opencl_thread_init(struct thr_info *thr)
+{
+	const int thr_id = thr->id;
+	struct cgpu_info *gpu = thr->cgpu;
+	struct opencl_thread_data *thrdata;
+	_clState *clState = clStates[thr_id];
+	cl_int status;
+	thrdata = calloc(1, sizeof(*thrdata));
+	thr->cgpu_data = thrdata;
+
+	if (!thrdata) {
+		applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
+		return false;
+	}
+
+	switch (clState->chosen_kernel) {
+		case KL_POCLBM:
+			thrdata->queue_kernel_parameters = &queue_poclbm_kernel;
+			break;
+		case KL_PHATK:
+			thrdata->queue_kernel_parameters = &queue_phatk_kernel;
+			break;
+		case KL_DIAKGCN:
+			thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
+			break;
+		default:
+		case KL_DIABLO:
+			thrdata->queue_kernel_parameters = &queue_diablo_kernel;
+			break;
+	}
+
+	thrdata->res = calloc(BUFFERSIZE, 1);
+
+	if (!thrdata->res) {
+		/* Do not leave thr->cgpu_data pointing at freed memory */
+		thr->cgpu_data = NULL;
+		free(thrdata);
+		applog(LOG_ERR, "Failed to calloc in opencl_thread_init");
+		return false;
+	}
+
+	/* Blocking write (CL_TRUE): the output buffer is zeroed on return */
+	status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0,
+			BUFFERSIZE, blank_res, 0, NULL, NULL);
+	if (unlikely(status != CL_SUCCESS)) {
+		applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
+		return false;
+	}
+
+	gpu->status = LIFE_WELL;
+
+	gpu->device_last_well = time(NULL);
+
+	return true;
+}
+
+/* free_work hook: wait for the command queue to drain. If the result buffer
+ * then flags a find, copy the retiring work item into the thread's own
+ * storage and point last_work at that copy. */
+static void opencl_free_work(struct thr_info *thr, struct work *work)
+{
+	struct opencl_thread_data *td = thr->cgpu_data;
+	_clState *cl = clStates[thr->id];
+
+	clFinish(cl->commandQueue);
+
+	if (!td->res[FOUND])
+		return;
+
+	td->last_work = &td->_last_work;
+	memcpy(&td->_last_work, work, sizeof(td->_last_work));
+}
+
+/* prepare_work hook: fill work->blk from the midstate and from the block
+ * data starting at byte offset 64. Always returns true. */
+static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work)
+{
+	uint32_t *midstate = (uint32_t *)(work->midstate);
+	uint32_t *tail = (uint32_t *)(work->data + 64);
+
+	precalc_hash(&work->blk, midstate, tail);
+
+	return true;
+}
+
+extern int opt_dynamic_interval;
+
+/* scanhash hook: run one kernel pass over the next nonce range of work.
+ * Sequence: wait for the previous pass to finish (timing the wait), adjust
+ * intensity in dynamic mode, set kernel arguments, hand any results read
+ * back by the previous pass to postcalc_hash_async(), enqueue the kernel
+ * and a non-blocking read of the output buffer, then advance
+ * work->blk.nonce.
+ * max_nonce is unused. Returns the number of hashes queued, or 0 when an
+ * OpenCL call fails. */
+static uint64_t opencl_scanhash(struct thr_info *thr, struct work *work,
+				uint64_t __maybe_unused max_nonce)
+{
+	const int thr_id = thr->id;
+	struct opencl_thread_data *thrdata = thr->cgpu_data;
+	struct cgpu_info *gpu = thr->cgpu;
+	_clState *clState = clStates[thr_id];
+	const cl_kernel *kernel = &clState->kernel;
+
+	/* NOTE(review): this is a plain local, so the average restarts from
+	 * 7 on every call - confirm whether it was meant to persist */
+	double gpu_ms_average = 7;
+	cl_int status;
+
+	size_t globalThreads[1];
+	size_t localThreads[1] = { clState->wsize };
+	unsigned int threads;
+	unsigned int hashes;
+
+
+	struct timeval tv_gpustart, tv_gpuend, diff;
+	suseconds_t gpu_us;
+
+	/* Time how long we block waiting for the GPU */
+	gettimeofday(&tv_gpustart, NULL);
+	/* This finish flushes the readbuffer set with CL_FALSE later */
+	clFinish(clState->commandQueue);
+	gettimeofday(&tv_gpuend, NULL);
+	timeval_subtract(&diff, &tv_gpuend, &tv_gpustart);
+	gpu_us = diff.tv_sec * 1000000 + diff.tv_usec;
+	decay_time(&gpu_ms_average, gpu_us / 1000);
+	if (gpu->dynamic) {
+		/* Try to not let the GPU be out for longer than
+		 * opt_dynamic_interval ms, but increase intensity when the
+		 * system is idle, unless dynamic is disabled. */
+		if (gpu_ms_average > opt_dynamic_interval) {
+			if (gpu->intensity > MIN_INTENSITY)
+				--gpu->intensity;
+		} else if (gpu_ms_average < ((opt_dynamic_interval / 2) ? : 1)) {
+			if (gpu->intensity < MAX_INTENSITY)
+				++gpu->intensity;
+		}
+	}
+	set_threads_hashes(clState->vwidth, &threads, &hashes, globalThreads,
+			   localThreads[0], gpu->intensity);
+	if (hashes > gpu->max_hashes)
+		gpu->max_hashes = hashes;
+	status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]);
+	if (unlikely(status != CL_SUCCESS)) {
+		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
+		return 0;
+	}
+
+	/* The FOUND entry is used as a flag to say nonces exist */
+	if (thrdata->res[FOUND]) {
+		/* Clear the buffer again */
+		status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
+				BUFFERSIZE, blank_res, 0, NULL, NULL);
+		if (unlikely(status != CL_SUCCESS)) {
+			applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed.");
+			return 0;
+		}
+		/* Results belong to retired work if opencl_free_work()
+		 * saved a copy, otherwise to the current work item */
+		if (unlikely(thrdata->last_work)) {
+			applog(LOG_DEBUG, "GPU %d found something in last work?", gpu->device_id);
+			postcalc_hash_async(thr, thrdata->last_work, thrdata->res);
+			thrdata->last_work = NULL;
+		} else {
+			applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id);
+			postcalc_hash_async(thr, work, thrdata->res);
+		}
+		memset(thrdata->res, 0, BUFFERSIZE);
+		clFinish(clState->commandQueue);
+	}
+
+	if (clState->goffset) {
+		size_t global_work_offset[1];
+
+		global_work_offset[0] = work->blk.nonce;
+		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset,
+						globalThreads, localThreads, 0,  NULL, NULL);
+	} else
+		status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL,
+						globalThreads, localThreads, 0,  NULL, NULL);
+	if (unlikely(status != CL_SUCCESS)) {
+		applog(LOG_ERR, "Error: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)");
+		return 0;
+	}
+
+	/* Non-blocking read (CL_FALSE): completed by the clFinish() at the
+	 * top of the next call */
+	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0,
+			BUFFERSIZE, thrdata->res, 0, NULL, NULL);
+	if (unlikely(status != CL_SUCCESS)) {
+		applog(LOG_ERR, "Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)");
+		return 0;
+	}
+
+	/* The amount of work scanned can fluctuate when intensity changes
+	 * and since we do this one cycle behind, we increment the work more
+	 * than enough to prevent repeating work */
+	work->blk.nonce += gpu->max_hashes;
+
+	return hashes;
+}
+
+/* thread_shutdown hook: release the OpenCL objects held by this thread's
+ * state, in the order queue, kernel, program, context. */
+static void opencl_thread_shutdown(struct thr_info *thr)
+{
+	_clState *cl = clStates[thr->id];
+
+	clReleaseCommandQueue(cl->commandQueue);
+	clReleaseKernel(cl->kernel);
+	clReleaseProgram(cl->program);
+	clReleaseContext(cl->context);
+}
+
+/* Driver table for OpenCL GPUs: binds the hooks implemented above to the
+ * generic device API. get_statline_before is only provided when ADL support
+ * is compiled in. */
+struct device_api opencl_api = {
+	.dname = "opencl",
+	.name = "GPU",
+	.api_detect = opencl_detect,
+	.reinit_device = reinit_opencl_device,
+#ifdef HAVE_ADL
+	.get_statline_before = get_opencl_statline_before,
+#endif
+	.get_statline = get_opencl_statline,
+	.thread_prepare = opencl_thread_prepare,
+	.thread_init = opencl_thread_init,
+	.free_work = opencl_free_work,
+	.prepare_work = opencl_prepare_work,
+	.scanhash = opencl_scanhash,
+	.thread_shutdown = opencl_thread_shutdown,
+};
+#endif
+
+
+

+ 29 - 0
driver-opencl.h

@@ -0,0 +1,29 @@
+/* Public interface of the OpenCL GPU driver */
+#ifndef __DEVICE_GPU_H__
+#define __DEVICE_GPU_H__
+
+#include "miner.h"
+
+
+extern char *print_ndevs_and_exit(int *ndevs);
+extern void *reinit_gpu(void *userdata);
+/* Option parsers: each takes the option argument string and returns a
+ * char * result */
+extern char *set_gpu_engine(char *arg);
+extern char *set_gpu_fan(char *arg);
+extern char *set_gpu_memclock(char *arg);
+extern char *set_gpu_memdiff(char *arg);
+extern char *set_gpu_powertune(char *arg);
+extern char *set_gpu_vddc(char *arg);
+extern char *set_temp_overheat(char *arg);
+extern char *set_temp_target(char *arg);
+extern char *set_intensity(char *arg);
+extern char *set_vector(char *arg);
+extern char *set_worksize(char *arg);
+extern char *set_kernel(char *arg);
+void manage_gpu(void);
+extern void pause_dynamic_threads(int gpu);
+
+extern bool have_opencl;
+extern int opt_platform_id;
+
+/* Driver table for OpenCL devices */
+extern struct device_api opencl_api;
+
+#endif /* __DEVICE_GPU_H__ */

+ 1 - 3
example.conf

@@ -27,7 +27,6 @@
 "temp-overheat" : "85,85,85,85",
 "temp-target" : "75,75,75,75",
 
-"algo" : "sse2_64",
 "auto-fan" : true,
 "auto-gpu" : true,
 "expiry" : "120",
@@ -40,7 +39,6 @@
 "temp-hysteresis" : "3",
 "worksize" : "0",
 
-"donation" : "1.00",
 "shares" : "0",
 "kernel-path" : "/usr/local/bin"
-}
+}

+ 30 - 22
findnonce.c

@@ -1,10 +1,10 @@
 /*
- * Copyright 2011 Con Kolivas
+ * Copyright 2011-2012 Con Kolivas
  * Copyright 2011 Nils Schneider
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
+ * Software Foundation; either version 3 of the License, or (at your option)
  * any later version.  See COPYING for more details.
  */
 
@@ -16,9 +16,7 @@
 #include <pthread.h>
 #include <string.h>
 
-#include "ocl.h"
 #include "findnonce.h"
-#include "miner.h"
 
 const uint32_t SHA256_K[64] = {
 	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
@@ -66,9 +64,6 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 	blk->cty_a = A;
 	blk->cty_b = B;
 	blk->cty_c = C;
-
-	blk->C1addK5 = C + 0x59f111f1;
-
 	blk->cty_d = D;
 
 	blk->D1A = D + 0xb956c25b;
@@ -93,12 +88,12 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 
 	blk->W16 = blk->fW0 = data[0] + (rotr(data[1], 7) ^ rotr(data[1], 18) ^ (data[1] >> 3));
 	blk->W17 = blk->fW1 = data[1] + (rotr(data[2], 7) ^ rotr(data[2], 18) ^ (data[2] >> 3)) + 0x01100000;
-	blk->PreVal4 = blk->fcty_e = E + (rotr(B, 6) ^ rotr(B, 11) ^ rotr(B, 25)) + (D ^ (B & (C ^ D))) + 0xe9b5dba5;
+	blk->PreVal4 = blk->fcty_e = blk->ctx_e + (rotr(B, 6) ^ rotr(B, 11) ^ rotr(B, 25)) + (D ^ (B & (C ^ D))) + 0xe9b5dba5;
 	blk->T1 = blk->fcty_e2 = (rotr(F, 2) ^ rotr(F, 13) ^ rotr(F, 22)) + ((F & G) | (H & (F | G)));
 	blk->PreVal4_2 = blk->PreVal4 + blk->T1;
-	blk->PreVal0 = blk->PreVal4 + state[0];
+	blk->PreVal0 = blk->PreVal4 + blk->ctx_a;
 	blk->PreW31 = 0x00000280 + (rotr(blk->W16,  7) ^ rotr(blk->W16, 18) ^ (blk->W16 >> 3));
-	blk->PreW32 = blk->W16 + ((rotr(blk->W17, 7) ^ rotr(blk->W17, 18) ^ (blk->W17 >> 3)));
+	blk->PreW32 = blk->W16 + (rotr(blk->W17, 7) ^ rotr(blk->W17, 18) ^ (blk->W17 >> 3));
 	blk->PreW18 = data[2] + (rotr(blk->W16, 17) ^ rotr(blk->W16, 19) ^ (blk->W16 >> 10));
 	blk->PreW19 = 0x11002000 + (rotr(blk->W17, 17) ^ rotr(blk->W17, 19) ^ (blk->W17 >> 10));
 
@@ -115,7 +110,23 @@ void precalc_hash(dev_blk_ctx *blk, uint32_t *state, uint32_t *data) {
 
 
 	blk->PreVal4addT1 = blk->PreVal4 + blk->T1;
-	blk->T1substate0 = state[0] - blk->T1;
+	blk->T1substate0 = blk->ctx_a - blk->T1;
+
+	blk->C1addK5 = blk->cty_c + SHA256_K[5];
+	blk->B1addK6 = blk->cty_b + SHA256_K[6];
+	blk->PreVal0addK7 = blk->PreVal0 + SHA256_K[7];
+	blk->W16addK16 = blk->W16 + SHA256_K[16];
+	blk->W17addK17 = blk->W17 + SHA256_K[17];
+
+	blk->zeroA = blk->ctx_a + 0x98c7e2a2;
+	blk->zeroB = blk->ctx_a + 0xfc08884d;
+	blk->oneA = blk->ctx_b + 0x90bb1e3c;
+	blk->twoA = blk->ctx_c + 0x50c6645b;
+	blk->threeA = blk->ctx_d + 0x3ac42e24;
+	blk->fourA = blk->ctx_e + SHA256_K[4];
+	blk->fiveA = blk->ctx_f + SHA256_K[5];
+	blk->sixA = blk->ctx_g + SHA256_K[6];
+	blk->sevenA = blk->ctx_h + SHA256_K[7];
 }
 
 #define P(t) (W[(t)&0xF] = W[(t-16)&0xF] + (rotate(W[(t-15)&0xF], 25) ^ rotate(W[(t-15)&0xF], 14) ^ (W[(t-15)&0xF] >> 3)) + W[(t-7)&0xF] + (rotate(W[(t-2)&0xF], 15) ^ rotate(W[(t-2)&0xF], 13) ^ (W[(t-2)&0xF] >> 10)))
@@ -174,7 +185,7 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce)
 	E = blk->cty_e; F = blk->cty_f;
 	G = blk->cty_g; H = blk->cty_h;
 	W[0] = blk->merkle; W[1] = blk->ntime;
-	W[2] = blk->nbits; W[3] = nonce;;
+	W[2] = blk->nbits; W[3] = nonce;
 	W[4] = 0x80000000; W[5] = 0x00000000; W[6] = 0x00000000; W[7] = 0x00000000;
 	W[8] = 0x00000000; W[9] = 0x00000000; W[10] = 0x00000000; W[11] = 0x00000000;
 	W[12] = 0x00000000; W[13] = 0x00000000; W[14] = 0x00000000; W[15] = 0x00000280;
@@ -198,12 +209,11 @@ static void send_nonce(struct pc_data *pcd, cl_uint nonce)
 	FR(32); FR(40);
 	FR(48); PFR(56);
 
-	if (likely(H == 0xA41F32E7)) {
+	if (likely(H == 0xa41f32e7)) {
 		if (unlikely(submit_nonce(thr, work, nonce) == false))
 			applog(LOG_ERR, "Failed to submit work, exiting");
 	} else {
-		if (opt_debug)
-			applog(LOG_DEBUG, "No best_g found! Error in OpenCL code?");
+		applog(LOG_DEBUG, "No best_g found! Error in OpenCL code?");
 		hw_errors++;
 		thr->cgpu->hw_errors++;
 	}
@@ -217,18 +227,19 @@ static void *postcalc_hash(void *userdata)
 
 	pthread_detach(pthread_self());
 
-	do {
+	/* Count only the result slots that actually hold a nonce, so that the
+	 * "no nonces" hardware-error check below can still trigger. */
+	for (entry = 0; entry < FOUND; entry++) {
 		if (pcd->res[entry]) {
 			send_nonce(pcd, pcd->res[entry]);
 			nonces++;
 		}
-	} while (++entry < FOUND);
+	}
 
 	free(pcd);
 
 	if (unlikely(!nonces)) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "No nonces found! Error in OpenCL code?");
+		applog(LOG_DEBUG, "No nonces found! Error in OpenCL code?");
 		hw_errors++;
 		thr->cgpu->hw_errors++;
 	}

+ 176 - 0
logging.c

@@ -0,0 +1,176 @@
+/*
+ * Copyright 2011-2012 Con Kolivas
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
+ */
+
+#include <unistd.h>
+
+#include "logging.h"
+#include "miner.h"
+
+bool opt_debug = false;
+bool opt_log_output = false;
+
+/* per default priorities higher than LOG_NOTICE are logged */
+int opt_log_level = LOG_NOTICE;
+
+/*
+ * Emit an already-prefixed format string to the screen.
+ *
+ * prio: log priority, passed through to log_curses()
+ * f:    writable format string; may be modified in place (see below)
+ * ap:   arguments for the format string
+ *
+ * With curses support compiled in and use_curses set, f is handed to
+ * log_curses() untouched.  Otherwise the last character of f is replaced by
+ * 20 spaces plus a newline before the string goes to log_curses() (curses
+ * builds) or vprintf() (non-curses builds).
+ */
+static void my_log_curses(int prio, char *f, va_list ap)
+{
+#ifdef HAVE_CURSES
+	extern bool use_curses;
+	if (use_curses)
+		log_curses(prio, f, ap);
+	else
+#endif
+	{
+		int len = strlen(f);
+
+		/* Writes 22 bytes (20 spaces, '\n', NUL) starting at the final
+		 * character of f, so the caller's buffer must have room for them.
+		 * NOTE(review): presumably f always ends in '\n' and the padding
+		 * blanks out stale text on the terminal line - confirm. */
+		strcpy(f + len - 1, "                    \n");
+
+#ifdef HAVE_CURSES
+		log_curses(prio, f, ap);
+#else
+		vprintf(f, ap);
+#endif
+	}
+}
+
+/*
+ * Core log routine: build a timestamped format string and send the message
+ * to syslog, or to stderr and the screen.
+ *
+ * prio: log priority (LOG_ERR ... LOG_DEBUG)
+ * fmt:  printf-style format string
+ * ap:   arguments for fmt; the caller still has to va_end() it
+ *
+ * LOG_DEBUG messages are dropped unless opt_debug is set.  When syslog
+ * support is compiled in and use_syslog is set, the message goes to
+ * vsyslog() only.  Otherwise it is printed only if opt_log_output is set or
+ * prio <= LOG_NOTICE.
+ */
+void vapplog(int prio, const char *fmt, va_list ap)
+{
+	if (!opt_debug && prio == LOG_DEBUG)
+		return;
+
+#ifdef HAVE_SYSLOG_H
+	if (use_syslog) {
+		vsyslog(prio, fmt, ap);
+	}
+#else
+	if (0) {}
+#endif
+	else if (opt_log_output || prio <= LOG_NOTICE) {
+		char *f;
+		int len;
+		struct timeval tv = {0, 0};
+		struct tm *tm;
+
+		gettimeofday(&tv, NULL);
+
+		tm = localtime(&tv.tv_sec);
+
+		/* Room for the timestamp prefix, fmt itself, and the 22 bytes
+		 * my_log_curses() may write over the end of the string. */
+		len = 40 + strlen(fmt) + 22;
+		f = alloca(len);
+		sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n",
+			tm->tm_year + 1900,
+			tm->tm_mon + 1,
+			tm->tm_mday,
+			tm->tm_hour,
+			tm->tm_min,
+			tm->tm_sec,
+			fmt);
+		/* Only output to stderr if it's not going to the screen as well */
+		if (!isatty(fileno((FILE *)stderr))) {
+			va_list apc;
+
+			/* ap is consumed again below, so stderr gets its own copy */
+			va_copy(apc, ap);
+			vfprintf(stderr, f, apc);	/* atomic write to stderr */
+			va_end(apc);	/* every va_copy() needs a matching va_end() */
+			fflush(stderr);
+		}
+
+		my_log_curses(prio, f, ap);
+	}
+}
+
+/* printf-style front end to vapplog(): packs the variadic arguments into a
+ * va_list, forwards them together with prio and fmt, then releases the list. */
+void applog(int prio, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vapplog(prio, fmt, ap);
+	va_end(ap);
+}
+
+
+/* high-level logging functions, based on global opt_log_level */
+
+/*
+ * generic log function used by priority specific ones
+ * equals vapplog() without additional priority checks
+ *
+ * prio: log priority, handed to vsyslog() or my_log_curses()
+ * fmt:  printf-style format string
+ * ap:   arguments for fmt; the caller still has to va_end() it
+ *
+ * Marked __maybe_unused because nothing in this file calls it yet.
+ */
+static void __maybe_unused log_generic(int prio, const char *fmt, va_list ap)
+{
+#ifdef HAVE_SYSLOG_H
+	if (use_syslog) {
+		vsyslog(prio, fmt, ap);
+	}
+#else
+	if (0) {}
+#endif
+	else {
+		char *f;
+		int len;
+		struct timeval tv = {0, 0};
+		struct tm *tm;
+
+		gettimeofday(&tv, NULL);
+
+		tm = localtime(&tv.tv_sec);
+
+		/* Room for the timestamp prefix, fmt itself, and the 22 bytes
+		 * my_log_curses() may write over the end of the string. */
+		len = 40 + strlen(fmt) + 22;
+		f = alloca(len);
+		sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n",
+			tm->tm_year + 1900,
+			tm->tm_mon + 1,
+			tm->tm_mday,
+			tm->tm_hour,
+			tm->tm_min,
+			tm->tm_sec,
+			fmt);
+		/* Only output to stderr if it's not going to the screen as well */
+		if (!isatty(fileno((FILE *)stderr))) {
+			va_list apc;
+
+			/* ap is consumed again below, so stderr gets its own copy */
+			va_copy(apc, ap);
+			vfprintf(stderr, f, apc);	/* atomic write to stderr */
+			va_end(apc);	/* every va_copy() needs a matching va_end() */
+			fflush(stderr);
+		}
+
+		my_log_curses(prio, f, ap);
+	}
+}
+/* we can not generalize variable argument list */
+#define LOG_TEMPLATE(PRIO)		\
+	if (PRIO <= opt_log_level) {	\
+		va_list ap;		\
+		va_start(ap, fmt);	\
+		vapplog(PRIO, fmt, ap);	\
+		va_end(ap);		\
+	}
+
+void log_error(const char *fmt, ...)
+{
+	LOG_TEMPLATE(LOG_ERR);
+}
+
+void log_warning(const char *fmt, ...)
+{
+	LOG_TEMPLATE(LOG_WARNING);
+}
+
+void log_notice(const char *fmt, ...)
+{
+	LOG_TEMPLATE(LOG_NOTICE);
+}
+
+void log_info(const char *fmt, ...)
+{
+	LOG_TEMPLATE(LOG_INFO);
+}
+
+void log_debug(const char *fmt, ...)
+{
+	LOG_TEMPLATE(LOG_DEBUG);
+}

+ 38 - 0
logging.h

@@ -0,0 +1,38 @@
+#ifndef __LOGGING_H__
+#define __LOGGING_H__
+
+#include "config.h"
+#include <stdbool.h>
+#include <stdarg.h>
+
+#ifdef HAVE_SYSLOG_H
+#include <syslog.h>
+#else
+/* Fallback priority names for builds without <syslog.h>, listed from most
+ * to least severe so numeric comparisons such as prio <= LOG_NOTICE work. */
+enum {
+	LOG_ERR,
+	LOG_WARNING,
+	LOG_NOTICE,
+	LOG_INFO,
+	LOG_DEBUG,
+};
+#endif
+
+/* original / legacy debug flags */
+extern bool opt_debug;
+extern bool opt_log_output;
+
+/* global log_level, messages with lower or equal prio are logged */
+extern int opt_log_level;
+
+/* low-level logging functions with priority parameter */
+/* vapplog() takes an existing va_list; applog() is its variadic front end */
+extern void vapplog(int prio, const char *fmt, va_list ap);
+extern void applog(int prio, const char *fmt, ...);
+
+/* high-level logging functions with implicit priority */
+/* each one is checked against opt_log_level before the message is passed on */
+extern void log_error(const char *fmt, ...);
+extern void log_warning(const char *fmt, ...);
+extern void log_notice(const char *fmt, ...);
+extern void log_info(const char *fmt, ...);
+extern void log_debug(const char *fmt, ...);
+
+#endif /* __LOGGING_H__ */

+ 110 - 130
miner.h

@@ -11,6 +11,7 @@
 #include <curl/curl.h>
 #include "elist.h"
 #include "uthash.h"
+#include "logging.h"
 
 #ifdef HAVE_OPENCL
 #ifdef __APPLE_CC__
@@ -60,30 +61,6 @@ void *alloca (size_t);
  #include "ADL_SDK/adl_sdk.h"
 #endif
 
-#ifdef __SSE2__
-#define WANT_SSE2_4WAY 1
-#endif
-
-#ifdef __ALTIVEC__
-#define WANT_ALTIVEC_4WAY 1
-#endif
-
-#if defined(__i386__) && defined(HAS_YASM) && defined(__SSE2__)
-#define WANT_X8632_SSE2 1
-#endif
-
-#if (defined(__i386__) || defined(__x86_64__)) &&  !defined(__APPLE__)
-#define WANT_VIA_PADLOCK 1
-#endif
-
-#if defined(__x86_64__) && defined(HAS_YASM)
-#define WANT_X8664_SSE2 1
-#endif
-
-#if defined(__x86_64__) && defined(HAS_YASM)
-#define WANT_X8664_SSE4 1
-#endif
-
 #if !defined(WIN32) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
 #define bswap_16 __builtin_bswap16
 #define bswap_32 __builtin_bswap32
@@ -126,18 +103,6 @@ void *alloca (size_t);
 #endif
 #endif
 
-#ifdef HAVE_SYSLOG_H
-#include <syslog.h>
-#else
-enum {
-	LOG_ERR,
-	LOG_WARNING,
-	LOG_NOTICE,
-	LOG_INFO,
-	LOG_DEBUG,
-};
-#endif
-
 #undef unlikely
 #undef likely
 #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
@@ -147,6 +112,7 @@ enum {
 #define unlikely(expr) (expr)
 #define likely(expr) (expr)
 #endif
+#define __maybe_unused		__attribute__((unused))
 
 #if defined(__i386__)
 #define WANT_CRYPTOPP_ASM32
@@ -156,19 +122,6 @@ enum {
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
 #endif
 
-enum sha256_algos {
-	ALGO_C,			/* plain C */
-	ALGO_4WAY,		/* parallel SSE2 */
-	ALGO_VIA,		/* VIA padlock */
-	ALGO_CRYPTOPP,		/* Crypto++ (C) */
-	ALGO_CRYPTOPP_ASM32,	/* Crypto++ 32-bit assembly */
-	ALGO_SSE2_32,		/* SSE2 for x86_32 */
-	ALGO_SSE2_64,		/* SSE2 for x86_64 */
-	ALGO_SSE4_64,		/* SSE4 for x86_64 */
-	ALGO_ALTIVEC_4WAY,	/* parallel Altivec */
-};
-
-
 enum alive {
 	LIFE_WELL,
 	LIFE_SICK,
@@ -197,6 +150,8 @@ struct gpu_adl {
 	ADLTemperature lpTemperature;
 	int iAdapterIndex;
 	int lpAdapterID;
+	int iBusNumber;
+	char strAdapterName[256];
 
 	ADLPMActivity lpActivity;
 	ADLODParameters lpOdParameters;
@@ -204,7 +159,6 @@ struct gpu_adl {
 	ADLFanSpeedInfo lpFanSpeedInfo;
 	ADLFanSpeedValue lpFanSpeedValue;
 	ADLFanSpeedValue DefFanSpeedValue;
-	ADLThermalControllerInfo lpThermalControllerInfo;
 
 	int iEngineClock;
 	int iMemoryClock;
@@ -215,11 +169,11 @@ struct gpu_adl {
 	bool autoengine;
 	bool managed; /* Were the values ever changed on this card */
 
+	int lastengine;
 	int lasttemp;
 	int targetfan;
 	int targettemp;
 	int overtemp;
-	int cutofftemp;
 	int minspeed;
 	int maxspeed;
 
@@ -233,6 +187,7 @@ struct thr_info;
 struct work;
 
 struct device_api {
+	char*dname;
 	char*name;
 
 	// API-global functions
@@ -253,15 +208,52 @@ struct device_api {
 	void (*thread_shutdown)(struct thr_info*);
 };
 
+/* Enable state of a device; stored in cgpu_info.deven. */
+enum dev_enable {
+	DEV_ENABLED,
+	DEV_DISABLED,
+	/* NOTE(review): presumably "disabled, waiting to be re-enabled
+	 * automatically" - confirm against the code that sets it */
+	DEV_RECOVER,
+};
+
+/* OpenCL kernel selection; stored per device in cgpu_info.kernel.
+ * NOTE(review): the names presumably correspond to the bundled poclbm,
+ * phatk, diakgcn and diablo .cl files - confirm. */
+enum cl_kernels {
+	KL_NONE,
+	KL_POCLBM,
+	KL_PHATK,
+	KL_DIAKGCN,
+	KL_DIABLO,
+};
+
+/* Reason codes kept in cgpu_info.device_not_well_reason.  Every code has a
+ * matching *_STR description below and a matching *_count field in
+ * struct cgpu_info. */
+enum dev_reason {
+	REASON_THREAD_FAIL_INIT,
+	REASON_THREAD_ZERO_HASH,
+	REASON_THREAD_FAIL_QUEUE,
+	REASON_DEV_SICK_IDLE_60,
+	REASON_DEV_DEAD_IDLE_600,
+	REASON_DEV_NOSTART,
+	REASON_DEV_OVER_HEAT,
+	REASON_DEV_THERMAL_CUTOFF,
+};
+
+/* Human-readable text for each dev_reason value, plus strings for the
+ * "no reason" and "unrecognised code" cases. */
+#define REASON_NONE			"None"
+#define REASON_THREAD_FAIL_INIT_STR	"Thread failed to init"
+#define REASON_THREAD_ZERO_HASH_STR	"Thread got zero hashes"
+#define REASON_THREAD_FAIL_QUEUE_STR	"Thread failed to queue work"
+#define REASON_DEV_SICK_IDLE_60_STR	"Device idle for 60s"
+#define REASON_DEV_DEAD_IDLE_600_STR	"Device dead - idle for 600s"
+#define REASON_DEV_NOSTART_STR		"Device failed to start"
+#define REASON_DEV_OVER_HEAT_STR	"Device over heated"
+#define REASON_DEV_THERMAL_CUTOFF_STR	"Device reached thermal cutoff"
+#define REASON_UNKNOWN_STR		"Unknown reason - code bug"
+
 struct cgpu_info {
 	int cgminer_id;
 	struct device_api *api;
 	int device_id;
+	char *name;
 	char *device_path;
 	FILE *device_file;
 	int device_fd;
 
-	bool enabled;
+	enum dev_enable deven;
 	int accepted;
 	int rejected;
 	int hw_errors;
@@ -275,8 +267,21 @@ struct cgpu_info {
 	int threads;
 	struct thr_info *thread;
 
-	bool dynamic;
+	unsigned int max_hashes;
+
+	int virtual_gpu;
 	int intensity;
+	bool dynamic;
+	char *kname;
+#ifdef HAVE_OPENCL
+	cl_uint vwidth;
+	size_t work_size;
+	enum cl_kernels kernel;
+#endif
+
+	float temp;
+	int cutofftemp;
+
 #ifdef HAVE_ADL
 	bool has_adl;
 	struct gpu_adl adl;
@@ -290,17 +295,23 @@ struct cgpu_info {
 	int gpu_powertune;
 	float gpu_vddc;
 #endif
+	int last_share_pool;
+	time_t last_share_pool_time;
+
+	time_t device_last_well;
+	time_t device_last_not_well;
+	enum dev_reason device_not_well_reason;
+	int thread_fail_init_count;
+	int thread_zero_hash_count;
+	int thread_fail_queue_count;
+	int dev_sick_idle_60_count;
+	int dev_dead_idle_600_count;
+	int dev_nostart_count;
+	int dev_over_heat_count;	// It's a warning but worth knowing
+	int dev_thermal_cutoff_count;
 };
 
-#ifndef WIN32
-#define PTH(thr) ((thr)->pth)
-#else
-#define PTH(thr) ((thr)->pth.p)
-static inline void nanosleep(struct timespec *rgtp, void *__unused)
-{
-	Sleep(rgtp->tv_nsec / 1000000);
-}
-#endif
+extern bool add_cgpu(struct cgpu_info*);
 
 struct thread_q {
 	struct list_head	q;
@@ -313,6 +324,8 @@ struct thread_q {
 
 struct thr_info {
 	int		id;
+	int		device_thread;
+
 	pthread_t	pth;
 	struct thread_q	*q;
 	struct cgpu_info *cgpu;
@@ -327,6 +340,7 @@ struct thr_info {
 
 extern int thr_info_create(struct thr_info *thr, pthread_attr_t *attr, void *(*start) (void *), void *arg);
 extern void thr_info_cancel(struct thr_info *thr);
+extern void thr_info_freeze(struct thr_info *thr);
 
 
 struct string_elist {
@@ -430,27 +444,27 @@ static inline void rwlock_init(pthread_rwlock_t *lock)
 
 struct pool;
 
-extern bool opt_debug;
 extern bool opt_protocol;
-extern bool opt_log_output;
 extern char *opt_kernel_path;
 extern char *opt_socks_proxy;
 extern char *cgminer_path;
 extern bool opt_autofan;
 extern bool opt_autoengine;
 extern bool use_curses;
+extern char *opt_api_allow;
 extern char *opt_api_description;
 extern int opt_api_port;
 extern bool opt_api_listen;
 extern bool opt_api_network;
 extern bool opt_delaynet;
+extern bool opt_restart;
 
 extern pthread_rwlock_t netacc_lock;
 
 extern const uint32_t sha256_init_state[];
 extern json_t *json_rpc_call(CURL *curl, const char *url, const char *userpass,
 			     const char *rpc_req, bool, bool, bool *,
-			     struct pool *pool);
+			     struct pool *pool, bool);
 extern char *bin2hex(const unsigned char *p, size_t len);
 extern bool hex2bin(unsigned char *p, const char *hexstr, size_t len);
 
@@ -462,56 +476,6 @@ typedef bool (*sha256_func)(int thr_id, const unsigned char *pmidstate,
 	uint32_t *last_nonce,
 	uint32_t nonce);
 
-extern bool ScanHash_4WaySSE2(int, const unsigned char *pmidstate,
-	unsigned char *pdata, unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
-
-extern bool ScanHash_altivec_4way(int thr_id, const unsigned char *pmidstate,
-	unsigned char *pdata,
-	unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
-
-extern bool scanhash_via(int, const unsigned char *pmidstate,
-	unsigned char *pdata,
-	unsigned char *phash1, unsigned char *phash,
-	const unsigned char *target,
-	uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
-
-extern bool scanhash_c(int, const unsigned char *midstate, unsigned char *data,
-	      unsigned char *hash1, unsigned char *hash,
-	      const unsigned char *target,
-	      uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
-
-extern bool scanhash_cryptopp(int, const unsigned char *midstate,unsigned char *data,
-	      unsigned char *hash1, unsigned char *hash,
-	      const unsigned char *target,
-	      uint32_t max_nonce, uint32_t *last_nonce, uint32_t n);
-
-extern bool scanhash_asm32(int, const unsigned char *midstate,unsigned char *data,
-	      unsigned char *hash1, unsigned char *hash,
-	      const unsigned char *target,
-	      uint32_t max_nonce, uint32_t *last_nonce, uint32_t nonce);
-
-extern bool scanhash_sse2_64(int, const unsigned char *pmidstate, unsigned char *pdata,
-	unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, uint32_t *last_nonce,
-	uint32_t nonce);
-
-extern bool scanhash_sse4_64(int, const unsigned char *pmidstate, unsigned char *pdata,
-	unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, uint32_t *last_nonce,
-	uint32_t nonce);
-
-extern bool scanhash_sse2_32(int, const unsigned char *pmidstate, unsigned char *pdata,
-	unsigned char *phash1, unsigned char *phash,
-	const unsigned char *ptarget,
-	uint32_t max_nonce, uint32_t *last_nonce,
-	uint32_t nonce);
-
 extern int
 timeval_subtract (struct timeval *result, struct timeval *x, struct timeval *y);
 
@@ -524,14 +488,13 @@ struct work_restart {
 	char			padding[128 - sizeof(unsigned long)];
 };
 
+extern void thread_reportin(struct thr_info *thr);
+
 extern void kill_work(void);
 
 extern void reinit_device(struct cgpu_info *cgpu);
 
 #ifdef HAVE_ADL
-extern float gpu_temp(int gpu);
-extern int gpu_fanspeed(int gpu);
-extern int gpu_fanpercent(int gpu);
 extern bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vddc, int *activity, int *fanspeed, int *fanpercent, int *powertune);
 extern int set_fanspeed(int gpu, int iFanSpeed);
 extern int set_vddc(int gpu, float fVddc);
@@ -539,12 +502,24 @@ extern int set_engineclock(int gpu, int iEngineClock);
 extern int set_memoryclock(int gpu, int iMemoryClock);
 #endif
 
-extern void api(void);
+extern void api(int thr_id);
+
+extern struct pool *current_pool(void);
+extern int active_pools(void);
+extern int add_pool_details(bool live, char *url, char *user, char *pass);
+
+#define ADD_POOL_MAXIMUM 1
+#define ADD_POOL_OK 0
 
 #define MAX_GPUDEVICES 16
-#define MAX_DEVICES 32
+#define MAX_DEVICES 64
 #define MAX_POOLS (32)
 
+#define MIN_INTENSITY -10
+#define _MIN_INTENSITY_STR "-10"
+#define MAX_INTENSITY 14
+#define _MAX_INTENSITY_STR "14"
+
 extern struct list_head scan_devices;
 extern int nDevs;
 extern int opt_n_threads;
@@ -575,6 +550,7 @@ extern int total_accepted, total_rejected;
 extern int total_getworks, total_stale, total_discarded;
 extern unsigned int local_work;
 extern unsigned int total_go, total_ro;
+extern const int opt_cutofftemp;
 extern int opt_log_interval;
 
 #ifdef HAVE_OPENCL
@@ -596,6 +572,11 @@ typedef struct {
 	cl_uint PreW19;
 	cl_uint PreW31;
 	cl_uint PreW32;
+
+	/* For diakgcn */
+	cl_uint B1addK6, PreVal0addK7, W16addK16, W17addK17;
+	cl_uint zeroA, zeroB;
+	cl_uint oneA, twoA, threeA, fourA, fiveA, sixA, sevenA;
 } dev_blk_ctx;
 #else
 typedef struct {
@@ -612,8 +593,12 @@ struct pool {
 	bool lagging;
 	bool probed;
 	bool enabled;
+	bool submit_old;
 
 	char *hdr_path;
+	char *lp_url;
+	bool lp_sent;
+	bool is_lp;
 
 	unsigned int getwork_requested;
 	unsigned int stale_shares;
@@ -650,30 +635,26 @@ struct work {
 	bool		clone;
 	bool		cloned;
 	bool		rolltime;
+	bool		longpoll;
 
 	unsigned int	work_block;
 	int		id;
 	UT_hash_handle hh;
 };
 
-enum cl_kernel {
-	KL_NONE,
-	KL_POCLBM,
-	KL_PHATK,
-};
-
 extern void get_datestamp(char *, struct timeval *);
 bool submit_nonce(struct thr_info *thr, struct work *work, uint32_t nonce);
+extern void tailsprintf(char *f, const char *fmt, ...);
 extern void wlogprint(const char *f, ...);
 extern int curses_int(const char *query);
 extern char *curses_input(const char *query);
 extern void kill_work(void);
 extern void switch_pools(struct pool *selected);
+extern void remove_pool(struct pool *pool);
 extern void write_config(FILE *fcfg);
 extern void log_curses(int prio, const char *f, va_list ap);
 extern void clear_logwin(void);
-extern void vapplog(int prio, const char *fmt, va_list ap);
-extern void applog(int prio, const char *fmt, ...);
+extern bool pool_tclear(struct pool *pool, bool *var);
 extern struct thread_q *tq_new(void);
 extern void tq_free(struct thread_q *tq);
 extern bool tq_push(struct thread_q *tq, void *data);
@@ -681,8 +662,7 @@ extern void *tq_pop(struct thread_q *tq, const struct timespec *abstime);
 extern void tq_freeze(struct thread_q *tq);
 extern void tq_thaw(struct thread_q *tq);
 extern bool successful_connect;
-extern enum cl_kernel chosen_kernel;
 extern void adl(void);
-extern bool get_dondata(char **url, char **userpass);
+extern void app_restart(void);
 
 #endif /* __MINER_H__ */

+ 535 - 92
miner.php

@@ -1,28 +1,79 @@
 <?php
 session_start();
 #
-global $miner, $port;
-$miner = '127.0.0.1'; # hostname or IP address
-$port = 4028;
+global $miner, $port, $readonly, $notify, $rigs;
+#
+# Don't touch these 2 - see $rigs below
+$miner = null;
+$port = null;
+#
+# Set $readonly to true to force miner.php to be readonly
+# Set $readonly to false then it will check cgminer 'privileged'
+$readonly = false;
+#
+# Set $notify to false to NOT attempt to display the notify command
+# Set $notify to true to attempt to display the notify command
+# If your older version of cgminer returns an 'Invalid command'
+#  coz it doesn't have notify - it just shows the error status table
+$notify = true;
+#
+# Set $rigs to an array of your cgminer rigs that are running
+#  format: 'IP:Port' or 'Host:Port'
+# If you only have one rig, it will just show the detail of that rig
+# If you have more than one rig it will show a summary of all the rigs
+#  with buttons to show the details of each rig
+# e.g. $rigs = array('127.0.0.1:4028','myrig.com:4028');
+$rigs = array('127.0.0.1:4028');
 #
 $here = $_SERVER['PHP_SELF'];
 #
+global $tablebegin, $tableend, $warnfont, $warnoff;
+$tablebegin = '<tr><td><table border=1 cellpadding=5 cellspacing=0>';
+$tableend = '</table></td></tr>';
+$warnfont = '<font color=red><b>';
+$warnoff = '</b></font>';
+
+#
+function htmlhead($checkapi)
+{
+ global $error, $readonly, $here;
+ if ($readonly === false && $checkapi === true)
+ {
+	$access = api('privileged');
+	if ($error != null
+	||  !isset($access['STATUS']['STATUS'])
+	||  $access['STATUS']['STATUS'] != 'S')
+		$readonly = true;
+ }
 ?>
 <html><head><title>Mine</title>
 <style type='text/css'>
-td { color:blue; font-family:verdana,arial,sans; font-size:14pt; }
-td.h { color:blue; font-family:verdana,arial,sans; font-size:14pt; background:#d0ffff }
-td.sta { color:green; font-family:verdana,arial,sans; font-size:14pt; }
+td { color:blue; font-family:verdana,arial,sans; font-size:13pt; }
+td.h { color:blue; font-family:verdana,arial,sans; font-size:13pt; background:#d0ffff }
+td.err { color:black; font-family:verdana,arial,sans; font-size:13pt; background:#ff3050 }
+td.warn { color:black; font-family:verdana,arial,sans; font-size:13pt; background:#ffb050 }
+td.sta { color:green; font-family:verdana,arial,sans; font-size:13pt; }
+td.tot { color:blue; font-family:verdana,arial,sans; font-size:13pt; background:#fff8f2 }
 </style>
 </head><body bgcolor=#ecffff>
 <script type='text/javascript'>
-function pr(a,m){if(m!=null){if(!confirm(m+'?'))return}window.location="<? echo $here ?>"+a}
-function prs(a){var c=a.substr(3);var z=c.split('|',2);var m=z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' GPU '+z[1];pr('?arg='+a,m)}
+function pr(a,m){if(m!=null){if(!confirm(m+'?'))return}window.location=<?php /* $here is PHP_SELF (request-derived): emit it as an escaped JS string literal, not raw text inside quotes */ echo json_encode($here, JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT) ?>+a}
+<?php
+ if ($readonly === false && $checkapi === true)
+ {
+?>
+function prc(a,m){pr('?arg='+a,m)}
+function prs(a,r){var c=a.substr(3);var z=c.split('|',2);var m=z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' GPU '+z[1];prc(a+'&rig='+r,m)}
+function prs2(a,n,r){var v=document.getElementById('gi'+n).value;var c=a.substr(3);var z=c.split('|',2);var m='Set GPU '+z[1]+' '+z[0].substr(0,1).toUpperCase()+z[0].substr(1)+' to '+v;prc(a+','+v+'&rig='+r,m)}
+<?php
+ }
+?>
 </script>
 <table width=100% height=100% border=0 cellpadding=0 cellspacing=0 summary='Mine'>
 <tr><td align=center valign=top>
 <table border=0 cellpadding=4 cellspacing=0 summary='Mine'>
-<?
+<?php
+}
 #
 global $error;
 $error = null;
@@ -152,170 +203,331 @@ function getparam($name, $both = false)
 #
 function fmt($section, $name, $value)
 {
+ $errorclass = ' class=err';
+ $warnclass = ' class=warn';
  $b = '&nbsp;';
 
+ $ret = $value;
+ $class = '';
+
  switch ($section.'.'.$name)
  {
+ case 'GPU.Last Share Time':
+ case 'PGA.Last Share Time':
+	$ret = date('H:i:s', $value);
+	break;
  case 'SUMMARY.Elapsed':
 	$s = $value % 60;
 	$value -= $s;
 	$value /= 60;
 	if ($value == 0)
-	{
-		return $s.'s';
-	}
+		$ret = $s.'s';
 	else
 	{
 		$m = $value % 60;
 		$value -= $m;
 		$value /= 60;
 		if ($value == 0)
-		{
-			return sprintf("%dm$b%02ds", $m, $s);
-		}
+			$ret = sprintf("%dm$b%02ds", $m, $s);
 		else
 		{
 			$h = $value % 24;
 			$value -= $h;
 			$value /= 24;
 			if ($value == 0)
-				return sprintf("%dh$b%02dm$b%02ds", $h, $m, $s);
+				$ret = sprintf("%dh$b%02dm$b%02ds", $h, $m, $s);
 			else
-				return sprintf("%ddays$b%02dh$b%02dm$b%02ds", $value, $h, $m, $s);
+			{
+				if ($value == 1)
+					$days = '';
+				else
+					$days = 's';
+
+				$ret = sprintf("%dday$days$b%02dh$b%02dm$b%02ds", $value, $h, $m, $s);
+			}
 		}
 	}
 	break;
- case 'GPU0.Utility':
+ case 'NOTIFY.Last Well':
+	if ($value == '0')
+	{
+		$ret = 'Never';
+		$class = $warnclass;
+	}
+	else
+		$ret = date('H:i:s', $value);
+	break;
+ case 'NOTIFY.Last Not Well':
+	if ($value == '0')
+		$ret = 'Never';
+	else
+	{
+		$ret = date('H:i:s', $value);
+		$class = $errorclass;
+	}
+	break;
+ case 'NOTIFY.Reason Not Well':
+	if ($value != 'None')
+		$class = $errorclass;
+	break;
+ case 'GPU.Utility':
+ case 'PGA.Utility':
  case 'SUMMARY.Utility':
-	return $value.'/m';
+	$ret = $value.'/m';
+	break;
+ case 'PGA.Temperature':
+	$ret = $value.'&deg;C';
+	break;
+ case 'GPU.Temperature':
+	$ret = $value.'&deg;C';
+ case 'GPU.Fan Speed':
+ case 'GPU.Fan Percent':
+ case 'GPU.GPU Clock':
+ case 'GPU.Memory Clock':
+ case 'GPU.GPU Voltage':
+ case 'GPU.GPU Activity':
+	if ($value == 0)
+		$class = $warnclass;
+	break;
+ case 'GPU.MHS av':
+ case 'PGA.MHS av':
+ case 'SUMMARY.MHS av':
+ case 'GPU.Total MH':
+ case 'PGA.Total MH':
+ case 'SUMMARY.Total MH':
+ case 'SUMMARY.Getworks':
+ case 'GPU.Accepted':
+ case 'PGA.Accepted':
+ case 'SUMMARY.Accepted':
+ case 'GPU.Rejected':
+ case 'PGA.Rejected':
+ case 'SUMMARY.Rejected':
+ case 'SUMMARY.Local Work':
+ case 'POOL.Getworks':
+ case 'POOL.Accepted':
+ case 'POOL.Rejected':
+ case 'POOL.Discarded':
+	$parts = explode('.', $value, 2);
+	if (count($parts) == 1)
+		$dec = '';
+	else
+		$dec = '.'.$parts[1];
+	$ret = number_format($parts[0]).$dec;
+	break;
+ case 'GPU.Status':
+ case 'PGA.Status':
+ case 'POOL.Status':
+	if ($value != 'Alive')
+		$class = $errorclass;
 	break;
- case 'GPU0.Temperature':
-	return $value.'&deg;C';
+ case 'GPU.Enabled':
+ case 'PGA.Enabled':
+	if ($value != 'Y')
+		$class = $warnclass;
 	break;
  }
 
- return $value;
+ if ($section == 'NOTIFY' && substr($name, 0, 1) == '*' && $value != '0')
+	$class = $errorclass;
+
+ return array($ret, $class);
 }
 #
-function details($list)
+global $poolcmd;
+$poolcmd = array(	'Switch to'	=> 'switchpool',
+			'Enable'	=> 'enablepool',
+			'Disable'	=> 'disablepool' );
+#
+function showhead($cmd, $item, $values)
 {
- $stas = array('S' => 'Success', 'W' => 'Warning', 'I' => 'Informational', 'E' => 'Error', 'F' => 'Fatal');
+ global $poolcmd, $readonly;
+
+ echo '<tr>';
+
+ foreach ($values as $name => $value)
+ {
+	if ($name == '0')
+		$name = '&nbsp;';
+	echo "<td valign=bottom class=h>$name</td>";
+ }
 
- $tb = '<tr><td><table border=1 cellpadding=5 cellspacing=0>';
- $te = '</table></td></tr>';
+ if ($cmd == 'pools' && $readonly === false)
+	foreach ($poolcmd as $name => $pcmd)
+		echo "<td valign=bottom class=h>$name</td>";
 
- echo $tb;
+ echo '</tr>';
+}
+#
+function details($cmd, $list, $rig)
+{
+ global $tablebegin, $tableend;
+ global $poolcmd, $readonly;
+
+ $dfmt = 'H:i:s j-M-Y \U\T\CP';
+
+ $stas = array('S' => 'Success', 'W' => 'Warning', 'I' => 'Informational', 'E' => 'Error', 'F' => 'Fatal');
 
- echo '<tr><td class=sta>Date: '.date('H:i:s j-M-Y \U\T\CP').'</td></tr>';
+ echo $tablebegin;
 
- echo $te.$tb;
+ echo '<tr><td class=sta>Date: '.date($dfmt).'</td></tr>';
+
+ echo $tableend.$tablebegin;
 
  if (isset($list['STATUS']))
  {
 	echo '<tr>';
 	echo '<td>Computer: '.$list['STATUS']['Description'].'</td>';
+	if (isset($list['STATUS']['When']))
+		echo '<td>When: '.date($dfmt, $list['STATUS']['When']).'</td>';
 	$sta = $list['STATUS']['STATUS'];
 	echo '<td>Status: '.$stas[$sta].'</td>';
 	echo '<td>Message: '.$list['STATUS']['Msg'].'</td>';
 	echo '</tr>';
  }
 
- echo $te.$tb;
 
  $section = '';
 
  foreach ($list as $item => $values)
  {
-	if ($item != 'STATUS')
-	{
-		$section = $item;
-
-		echo '<tr>';
-
-		foreach ($values as $name => $value)
-		{
-			if ($name == '0')
-				$name = '&nbsp;';
-			echo "<td valign=bottom class=h>$name</td>";
-		}
+	if ($item == 'STATUS')
+		continue;
 
-		echo '</tr>';
+	$sectionname = preg_replace('/\d/', '', $item);
 
-		break;
+	if ($sectionname != $section)
+	{
+		echo $tableend.$tablebegin;
+		showhead($cmd, $item, $values);
+		$section = $sectionname;
 	}
- }
 
- foreach ($list as $item => $values)
- {
-	if ($item == 'STATUS')
-		continue;
+	echo '<tr>';
 
 	foreach ($values as $name => $value)
-		echo '<td>'.fmt($section, $name, $value).'</td>';
+	{
+		list($showvalue, $class) = fmt($section, $name, $value);
+		echo "<td$class>$showvalue</td>";
+	}
+
+	if ($cmd == 'pools' && $readonly === false)
+	{
+		reset($values);
+		$pool = current($values);
+		foreach ($poolcmd as $name => $pcmd)
+		{
+			echo '<td>';
+			if ($pool === false)
+				echo '&nbsp;';
+			else
+			{
+				echo "<input type=button value='Pool $pool'";
+				echo " onclick='prc(\"$pcmd|$pool&rig=$rig\",\"$name Pool $pool\")'>";
+			}
+			echo '</td>';
+		}
+	}
 
 	echo '</tr>';
  }
- echo $te;
+
+ echo $tableend;
 }
 #
-function gpubuttons($count)
+global $devs;
+$devs = null;
+#
+function gpubuttons($count, $rig)
 {
- $tb = '<tr><td><table border=1 cellpadding=5 cellspacing=0>';
- $te = '</table></td></tr>';
+ global $tablebegin, $tableend;
+ global $devs;
 
- echo $tb.'<tr>';
+ $basic = array( 'GPU', 'Enable', 'Disable', 'Restart' );
 
- for ($i = 0; $i < 4; $i++)
- {
-	echo '<td>';
+ $options = array(	'intensity' => 'Intensity',
+			'fan' => 'Fan Percent',
+			'engine' => 'GPU Clock',
+			'mem' => 'Memory Clock',
+			'vddc' => 'GPU Voltage' );
 
-	if ($i == 0)
-		echo 'GPU';
-	else
-		echo '&nbsp;';
+ echo $tablebegin.'<tr>';
 
-	echo '</td>';
- }
+ foreach ($basic as $head)
+	echo "<td>$head</td>";
+
+ foreach ($options as $name => $des)
+	echo "<td nowrap>$des</td>";
 
+ $n = 0;
  for ($c = 0; $c < $count; $c++)
  {
 	echo '</tr><tr>';
 
-	echo "<td>$c</td>";
-	echo "<td><input type=button value='Enable $c' onclick='prs(\"gpuenable|$c\")'></td>";
-	echo "<td><input type=button value='Disable $c' onclick='prs(\"gpudisable|$c\")'></td>";
-	echo "<td><input type=button value='Restart $c' onclick='prs(\"gpurestart|$c\")'></td>";
+	foreach ($basic as $name)
+	{
+		echo '<td>';
+
+		if ($name == 'GPU')
+			echo $c;
+		else
+		{
+			echo "<input type=button value='$name $c' onclick='prs(\"gpu";
+			echo strtolower($name);
+			echo "|$c\",$rig)'>";
+		}
+
+		echo '</td>';
+	}
+
+	foreach ($options as $name => $des)
+	{
+		echo '<td>';
+		if (!isset($devs["GPU$c"][$des]))
+			echo '&nbsp;';
+		else
+		{
+			$value = $devs["GPU$c"][$des];
+			echo "<input type=button value='Set $c:' onclick='prs2(\"gpu$name|$c\",$n,$rig)'>";
+			echo "<input size=7 type=text name=gi$n value='$value' id=gi$n>";
+			$n++;
+		}
+
+		echo '</td>';
+	}
+
  }
 
- echo '</tr>'.$te;
+ echo '</tr>'.$tableend;
 }
 #
+# Ask the miner for its GPU count via api('gpucount') and show either an
+# error row, a 'No GPUs' row, or the per-GPU control buttons for rig $rig.
-function processgpus($rd, $ro)
+function processgpus($rig)
 {
  global $error;
+ global $warnfont, $warnoff;
 
  $gpus = api('gpucount');
 
  if ($error != null)
-	echo '<tr><td>Error getting GPU count: '.$rd.$error.$ro.'</td></tr>';
+	echo '<tr><td>Error getting GPU count: '.$warnfont.$error.$warnoff.'</td></tr>';
  else
  {
 	if (!isset($gpus['GPUS']['Count']))
-		echo '<tr><td>No GPU count returned: '.$rd.$gpus['STATUS']['STATUS'].' '.$gpus['STATUS']['Msg'].$ro.'</td></tr>';
+		echo '<tr><td>No GPU count returned: '.$warnfont.$gpus['STATUS']['STATUS'].' '.$gpus['STATUS']['Msg'].$warnoff.'</td></tr>';
 	else
 	{
 		$count = $gpus['GPUS']['Count'];
 		if ($count == 0)
 			echo '<tr><td>No GPUs</td></tr>';
 		else
-			gpubuttons($count);
+			gpubuttons($count, $rig);
 	}
  }
 }
 #
-function process($cmds, $rd, $ro)
+function process($cmds, $rig)
 {
- global $error;
+ global $error, $devs;
+ global $warnfont, $warnoff;
 
  foreach ($cmds as $cmd => $des)
  {
@@ -323,45 +535,276 @@ function process($cmds, $rd, $ro)
 
 	if ($error != null)
 	{
-		echo "<tr><td>Error getting $des: ";
-		echo $rd.$error.$ro.'</td></tr>';
+		echo "<tr><td colspan=100>Error getting $des: ";
+		echo $warnfont.$error.$warnoff.'</td></tr>';
 		break;
 	}
 	else
 	{
-		details($process);
+		details($cmd, $process, $rig);
 		echo '<tr><td><br><br></td></tr>';
+		if ($cmd == 'devs')
+			$devs = $process;
 	}
  }
 }
 #
-function display()
+# $head is a hack but this is just a demo anyway :)
+#
+# Send API command $cmd to every "host:port" entry in $rigs and print the
+# replies as table rows, one row per reply item other than STATUS.
+#  $des  - description of the data, used only in error messages
+#  $sum  - names of the fields added up into a trailing "Total:" row
+#          (no total row when null)
+#  $head - initial column array; whenever a row has more entries than the
+#          current header, the header is rebuilt from $head plus that row's
+#          field names
+function doforeach($cmd, $des, $sum, $head)
+{
+ global $miner, $port;
+ global $error, $readonly, $notify, $rigs;
+ global $tablebegin, $tableend, $warnfont, $warnoff;
+
+ # Pass 1: collect one reply per reachable rig, keyed by rig number.
+ $replies = array();
+ $count = 0;
+ foreach ($rigs as $rig)
+ {
+	$hostport = explode(':', $rig, 2);
+	if (count($hostport) == 2)
+	{
+		list($miner, $port) = $hostport;
+
+		$reply = api($cmd);
+
+		if ($error == null)
+			$replies[$count] = $reply;
+		else
+		{
+			echo "<tr><td colspan=100>Error on rig $count getting $des: ";
+			echo $warnfont.$error.$warnoff.'</td></tr>';
+			$error = null;
+		}
+	}
+	$count++;
+ }
+
+ if (count($replies) == 0)
+ {
+	echo "<tr><td>Failed to access any rigs successfully</td></tr>";
+	return;
+ }
+
+ # Pass 2: work out the column set and accumulate the totals.
+ $header = $head;
+ $total = array();
+ $dosum = ($sum != null);
+
+ foreach ($replies as $reply)
+ {
+	foreach ($reply as $key => $row)
+	{
+		if ($key == 'STATUS')
+			continue;
+
+		if (count($row) > count($header))
+		{
+			$header = $head;
+			foreach ($row as $name => $unused)
+				if (!isset($header[$name]))
+					$header[$name] = '';
+		}
+
+		if (!$dosum)
+			continue;
+
+		foreach ($sum as $name)
+		{
+			if (!isset($row[$name]))
+				continue;
+
+			if (isset($total[$name]))
+				$total[$name] += $row[$name];
+			else
+				$total[$name] = $row[$name];
+		}
+	}
+ }
+
+ # The totals are appended as a pseudo rig so pass 3 prints them last.
+ if ($dosum)
+	$replies['total']['total'] = $total;
+
+ showhead('', null, $header);
+
+ # Pass 3: one table row per item, one cell per header column.
+ $section = '';
+
+ foreach ($replies as $rig => $reply)
+ {
+	$istotal = ($rig === 'total');
+
+	foreach ($reply as $key => $row)
+	{
+		if ($key == 'STATUS')
+			continue;
+
+		echo '<tr>';
+
+		# Item name with digits removed; the total row keeps the last section.
+		$stem = preg_replace('/\d/', '', $key);
+		if ($stem != 'total')
+			$section = $stem;
+
+		foreach ($header as $name => $unused)
+		{
+			# The unnamed column holds the rig button (or the "Total:" label).
+			if ($name == '')
+			{
+				if ($istotal)
+					echo "<td align=right class=tot>Total:</td>";
+				else
+					echo "<td align=right><input type=button value='Rig $rig' onclick='pr(\"?rig=$rig\",null)'></td>";
+				continue;
+			}
+
+			$class = '';
+			$showvalue = '&nbsp;';
+			if (isset($row[$name]))
+				list($showvalue, $class) = fmt($section, $name, $row[$name]);
+
+			if ($istotal and $class == '')
+				$class = ' class=tot';
+
+			echo "<td$class align=right>$showvalue</td>";
+		}
+
+		echo '</tr>';
+	}
+ }
+}
+#
+function doOne($rig, $preprocess) # Render the detail page for rig number $rig
+{
+ global $error, $readonly, $notify;
+ global $rigs;
+
+ htmlhead(true);
+
+ $error = null; # clear any earlier error; it is tested again before the GPU section
 
  echo "<tr><td><table cellpadding=0 cellspacing=0 border=0><tr><td>";
- echo "<input type=button value='Refresh' onclick='pr(\"\",null)'>";
- echo "</td><td width=100%>&nbsp;</td><td>";
- echo "<input type=button value='Quit' onclick='pr(\"?arg=quit\",\"Quit CGMiner\")'>";
+ echo "<input type=button value='Refresh' onclick='pr(\"?rig=$rig\",null)'></td>";
+ if (count($rigs) > 1) # Summary button only when more than one rig is configured
+	echo "<td><input type=button value='Summary' onclick='pr(\"\",null)'></td>";
+ echo "<td width=100%>&nbsp;</td><td>";
+ if ($readonly === false) # Quit button is offered only when not read-only
+ {
+	$msg = 'Quit CGMiner';
+	if (count($rigs) > 1) # with several rigs, name the rig in the message handed to prc()
+		$msg .= " Rig $rig";
+	echo "<input type=button value='Quit' onclick='prc(\"quit&rig=$rig\",\"$msg\")'>";
+ }
  echo "</td></tr></table></td></tr>";
 
- $arg = trim(getparam('arg', true));
- if ($arg != null and $arg != '')
-	process(array($arg => $arg), $rd, $ro);
+ if ($preprocess != null) # run the requested command before the standard listings
+	process(array($preprocess => $preprocess), $rig);
 
  $cmds = array(	'devs'    => 'device list',
 		'summary' => 'summary information',
 		'pools'   => 'pool list');
 
- process($cmds, $rd, $ro);
+ if ($notify) # the 'notify' listing is optional, controlled by $notify
+	$cmds['notify'] = 'device status';
+
+ $cmds['config'] = 'cgminer config';
+
+ process($cmds, $rig);
+
+ if ($error == null && $readonly === false) # GPU controls only if nothing failed and not read-only
+	processgpus($rig);
+}
+#
+# Page entry point. Picks what to render from $rigs and the 'rig' / 'arg'
+# request parameters:
+#  - no rigs configured: an error row
+#  - exactly one rig, or a valid 'rig' index: the single-rig page (doOne)
+#  - otherwise: summary, device and pool tables across all rigs (doforeach)
+# A non-empty 'arg' is handed on as the command to process first, provided
+# the rig it applies to can be resolved.
+function display()
+{
+ global $tablebegin, $tableend;
+ global $miner, $port;
+ global $error, $readonly, $notify, $rigs;
+
+ $rig = trim(getparam('rig', true));
+
+ $arg = trim(getparam('arg', true));
+
+ # Checked first so that nothing below counts or indexes an empty $rigs.
+ if ($rigs == null or count($rigs) == 0)
+ {
+	echo "<tr><td>No rigs defined</td></tr>";
+	return;
+ }
+
+ $preprocess = null;
+ if ($arg != null and $arg != '')
+ {
+	$num = null;
+	if ($rig != null and $rig != '')
+	{
+		if ($rig >= 0 and $rig < count($rigs))
+			$num = $rig;
+	}
+	else
+		# No rig given: only unambiguous when exactly one rig is configured.
+		if (count($rigs) == 1)
+			$num = 0;
+
+	# Strict test: rig 0 is a valid choice and must not be read as "none".
+	if ($num !== null and isset($rigs[$num]))
+	{
+		$parts = explode(':', $rigs[$num], 2);
+		if (count($parts) == 2)
+		{
+			$miner = $parts[0];
+			$port = $parts[1];
+
+			$preprocess = $arg;
+		}
+	}
+ }
+
+ if (count($rigs) == 1)
+ {
+	$parts = explode(':', $rigs[0], 2);
+	if (count($parts) == 2)
+	{
+		$miner = $parts[0];
+		$port = $parts[1];
+
+		doOne(0, $preprocess);
+	}
+	else
+		echo '<tr><td>Invalid "$rigs" array</td></tr>';
+
+	return;
+ }
+
+ if ($rig != null and $rig != '' and $rig >= 0 and $rig < count($rigs))
+ {
+	$parts = explode(':', $rigs[$rig], 2);
+	if (count($parts) == 2)
+	{
+		$miner = $parts[0];
+		$port = $parts[1];
+
+		doOne($rig, $preprocess);
+	}
+	else
+		echo '<tr><td>Invalid "$rigs" array</td></tr>';
+
+	return;
+ }
+
+ # Several rigs and no valid selection: show the cross-rig summary page.
+ htmlhead(false);
+
+ echo "<tr><td><table cellpadding=0 cellspacing=0 border=0><tr><td>";
+ echo "<input type=button value='Refresh' onclick='pr(\"\",null)'>";
+ echo "</td></tr></table></td></tr>";
 
- if ($error == null)
-	processgpus($rd, $ro);
+ if ($preprocess != null)
+	process(array($preprocess => $preprocess), $rig);
+
+ echo $tablebegin;
+ $sum = array('MHS av', 'Getworks', 'Found Blocks', 'Accepted', 'Rejected', 'Discarded', 'Stale', 'Utility', 'Local Work', 'Total MH');
+ doforeach('summary', 'summary information', $sum, array());
+ echo $tableend;
+ echo '<tr><td><br><br></td></tr>';
+ echo $tablebegin;
+ doforeach('devs', 'device list', $sum, array(''=>'','ID'=>'','Name'=>''));
+ echo $tableend;
+ echo '<tr><td><br><br></td></tr>';
+ echo $tablebegin;
+ doforeach('pools', 'pool list', $sum, array(''=>''));
+ echo $tableend;
 }
 #
 display();

+ 277 - 227
ocl.c

@@ -1,6 +1,12 @@
 /*
- * Copyright 2011 Con Kolivas
+ * Copyright 2011-2012 Con Kolivas
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 3 of the License, or (at your option)
+ * any later version.  See COPYING for more details.
  */
+
 #include "config.h"
 #ifdef HAVE_OPENCL
 
@@ -27,8 +33,7 @@
 #include "findnonce.h"
 #include "ocl.h"
 
-extern int opt_vectors;
-extern int opt_worksize;
+int opt_platform_id;
 
 char *file_contents(const char *filename, int *length)
 {
@@ -68,59 +73,59 @@ char *file_contents(const char *filename, int *length)
 	return (char*)buffer;
 }
 
-int clDevicesNum() {
-	cl_int status = 0;
-
+/* Enumerate every OpenCL platform, log its vendor/name/version and GPU
+ * device count, and return the largest GPU count found on any single
+ * platform. Returns -1 if the platforms cannot be queried or none exist. */
+int clDevicesNum(void) {
+	cl_int status;
+	char pbuff[256];
+	cl_uint numDevices;
 	cl_uint numPlatforms;
+	cl_platform_id *platforms;
 	cl_platform_id platform = NULL;
+	unsigned int most_devices = 0, i;
+
 	status = clGetPlatformIDs(0, NULL, &numPlatforms);
 	/* If this fails, assume no GPUs. */
 	if (status != CL_SUCCESS) {
-		applog(LOG_INFO, "clGetPlatformsIDs failed (no GPU?)");
-		return 0;
-	}
-
-	if (numPlatforms > 0) {
-		cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));
-		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
-		if (status != CL_SUCCESS)
-		{
-			applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)");
-			return -1;
-		}
-
-		unsigned int i;
-		for (i = 0; i < numPlatforms; ++i) {
-			char pbuff[100];
-			status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
-			if (status != CL_SUCCESS)
-			{
-				applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)");
-				free(platforms);
-				return -1;
-			}
-			platform = platforms[i];
-			if (!strcmp(pbuff, "Advanced Micro Devices, Inc."))
-			{
-				break;
-			}
-		}
-		free(platforms);
+		applog(LOG_ERR, "Error %d: clGetPlatformsIDs failed (no OpenCL SDK installed?)", status);
+		return -1;
 	}
 
-	if (platform == NULL) {
-		perror("NULL platform found!\n");
+	if (numPlatforms == 0) {
+		applog(LOG_ERR, "clGetPlatformsIDs returned no platforms (no OpenCL SDK installed?)");
 		return -1;
 	}
 
-	cl_uint numDevices;
-	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
+	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Getting Device IDs (num)");
+		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
 		return -1;
 	}
 
-	return numDevices;
+	for (i = 0; i < numPlatforms; i++) {
+		status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
+		if (status != CL_SUCCESS) {
+			applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
+			return -1;
+		}
+		platform = platforms[i];
+		applog(LOG_INFO, "CL Platform %d vendor: %s", i, pbuff);
+		/* Name and version are informational only; failures are ignored. */
+		status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
+		if (status == CL_SUCCESS)
+			applog(LOG_INFO, "CL Platform %d name: %s", i, pbuff);
+		status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(pbuff), pbuff, NULL);
+		if (status == CL_SUCCESS)
+			applog(LOG_INFO, "CL Platform %d version: %s", i, pbuff);
+		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
+		if (status == CL_DEVICE_NOT_FOUND) {
+			/* A platform without GPUs (e.g. a CPU-only runtime) is not
+			 * an error; another platform may still provide them. */
+			numDevices = 0;
+		} else if (status != CL_SUCCESS) {
+			applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
+			return -1;
+		}
+		applog(LOG_INFO, "Platform %d devices: %d", i, numDevices);
+		if (numDevices > most_devices)
+			most_devices = numDevices;
+	}
+
+	return most_devices;
 }
 
 static int advance(char **area, unsigned *remaining, const char *marker)
@@ -128,8 +133,7 @@ static int advance(char **area, unsigned *remaining, const char *marker)
 	char *find = memmem(*area, *remaining, marker, strlen(marker));
 
 	if (!find) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "Marker \"%s\" not found", marker);
+		applog(LOG_DEBUG, "Marker \"%s\" not found", marker);
 		return 0;
 	}
 	*remaining -= find - *area;
@@ -175,70 +179,69 @@ void patch_opcodes(char *w, unsigned remaining)
 		opcode++;
 		remaining -= 8;
 	}
-	if (opt_debug) {
-		applog(LOG_DEBUG, "Potential OP3 instructions identified: "
-			"%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN",
-			count_bfe_int, count_bfe_uint, count_byte_align);
-		applog(LOG_DEBUG, "Patched a total of %i BFI_INT instructions", patched);
-	}
+	applog(LOG_DEBUG, "Potential OP3 instructions identified: "
+		"%i BFE_INT, %i BFE_UINT, %i BYTE_ALIGN",
+		count_bfe_int, count_bfe_uint, count_byte_align);
+	applog(LOG_DEBUG, "Patched a total of %i BFI_INT instructions", patched);
 }
 
 _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 {
-	bool patchbfi = false;
-	cl_int status = 0;
-	unsigned int i;
-
 	_clState *clState = calloc(1, sizeof(_clState));
-
-	cl_uint numPlatforms;
+	bool patchbfi = false, prog_built = false;
 	cl_platform_id platform = NULL;
+	char pbuff[256], vbuff[255];
+	cl_platform_id* platforms;
+	cl_uint preferred_vwidth;
+	cl_device_id *devices;
+	cl_uint numPlatforms;
+	cl_uint numDevices;
+	cl_int status;
+
 	status = clGetPlatformIDs(0, NULL, &numPlatforms);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Getting Platforms. (clGetPlatformsIDs)");
+		applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status);
 		return NULL;
 	}
 
-	if (numPlatforms > 0) {
-		cl_platform_id* platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id));
-		status = clGetPlatformIDs(numPlatforms, platforms, NULL);
-		if (status != CL_SUCCESS)
-		{
-			applog(LOG_ERR, "Error: Getting Platform Ids. (clGetPlatformsIDs)");
-			return NULL;
-		}
+	platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id));
+	status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+	if (status != CL_SUCCESS) {
+		applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status);
+		return NULL;
+	}
 
-		for(i = 0; i < numPlatforms; ++i) {
-			char pbuff[100];
-			status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
-			if (status != CL_SUCCESS)
-			{
-				applog(LOG_ERR, "Error: Getting Platform Info. (clGetPlatformInfo)");
-				free(platforms);
-				return NULL;
-			}
-			platform = platforms[i];
-			if (!strcmp(pbuff, "Advanced Micro Devices, Inc."))
-			{
-				break;
-			}
-		}
-		free(platforms);
+	if (opt_platform_id >= (int)numPlatforms) {
+		applog(LOG_ERR, "Specified platform that does not exist");
+		return NULL;
 	}
 
+	status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL);
+	if (status != CL_SUCCESS) {
+		applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status);
+		return NULL;
+	}
+	platform = platforms[opt_platform_id];
+
 	if (platform == NULL) {
 		perror("NULL platform found!\n");
 		return NULL;
 	}
 
-	cl_uint numDevices;
+	applog(LOG_INFO, "CL Platform vendor: %s", pbuff);
+	status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL);
+	if (status == CL_SUCCESS)
+		applog(LOG_INFO, "CL Platform name: %s", pbuff);
+	status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL);
+	if (status == CL_SUCCESS)
+		applog(LOG_INFO, "CL Platform version: %s", vbuff);
+
 	status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Getting Device IDs (num)");
+		applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status);
 		return NULL;
 	}
 
-	cl_device_id *devices;
 	if (numDevices > 0 ) {
 		devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id));
 
@@ -246,7 +249,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
 		status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL);
 		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error: Getting Device IDs (list)");
+			applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status);
 			return NULL;
 		}
 
@@ -254,10 +257,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
 		unsigned int i;
 		for (i = 0; i < numDevices; i++) {
-			char pbuff[100];
 			status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
 			if (status != CL_SUCCESS) {
-				applog(LOG_ERR, "Error: Getting Device Info");
+				applog(LOG_ERR, "Error %d: Getting Device Info", status);
 				return NULL;
 			}
 
@@ -265,10 +267,9 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 		}
 
 		if (gpu < numDevices) {
-			char pbuff[100];
 			status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL);
 			if (status != CL_SUCCESS) {
-				applog(LOG_ERR, "Error: Getting Device Info");
+				applog(LOG_ERR, "Error %d: Getting Device Info", status);
 				return NULL;
 			}
 
@@ -285,7 +286,7 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
 	clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Creating Context. (clCreateContextFromType)");
+		applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status);
 		return NULL;
 	}
 
@@ -297,154 +298,196 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 
 	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS");
+		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status);
 		return NULL;
 	}
 	find = strstr(extensions, camo);
 	if (find)
 		clState->hasBitAlign = true;
+		
+	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
+	char * devoclver = malloc(1024);
+	const char * ocl10 = "OpenCL 1.0";
 
-	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL);
+	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT");
+		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status);
 		return NULL;
 	}
-	if (opt_debug)
-		applog(LOG_DEBUG, "Preferred vector width reported %d", clState->preferred_vwidth);
+	find = strstr(devoclver, ocl10);
+	if (!find)
+		clState->hasOpenCL11plus = true;
 
-	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
+	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE");
+		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
 		return NULL;
 	}
-	if (opt_debug)
-		applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);
+	applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth);
 
-	/* For some reason 2 vectors is still better even if the card says
-	 * otherwise, and many cards lie about their max so use 256 as max
-	 * unless explicitly set on the command line */
-	if (clState->preferred_vwidth > 1)
-		clState->preferred_vwidth = 2;
-	if (opt_vectors)
-		clState->preferred_vwidth = opt_vectors;
-	if (opt_worksize && opt_worksize <= clState->max_work_size)
-		clState->work_size = opt_worksize;
-	else
-		clState->work_size = (clState->max_work_size <= 256 ? clState->max_work_size : 256) /
-				clState->preferred_vwidth;
+	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL);
+	if (status != CL_SUCCESS) {
+		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status);
+		return NULL;
+	}
+	applog(LOG_DEBUG, "Max work group size reported %d", clState->max_work_size);
 
 	/* Create binary filename based on parameters passed to opencl
 	 * compiler to ensure we only load a binary that matches what would
 	 * have otherwise created. The filename is:
-	 * name + kernelname +/i bitalign + v + vectors + w + work_size + sizeof(long) + .bin
+	 * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin
 	 */
 	char binaryfilename[255];
+	char filename[255];
 	char numbuf[10];
-	char filename[16];
 
-	if (chosen_kernel == KL_NONE) {
-		if (clState->hasBitAlign)
-			chosen_kernel = KL_PHATK;
-		else
-			chosen_kernel = KL_POCLBM;
-	}
+	if (gpus[gpu].kernel == KL_NONE) {
+		/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
+		if (!strstr(name, "Tahiti") &&
+			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
+			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
+			 strstr(vbuff, "831.4") ||
+			 strstr(vbuff, "898.1"))) { // 12.2 driver SDK
+				applog(LOG_INFO, "Selecting diablo kernel");
+				clState->chosen_kernel = KL_DIABLO;
+		/* Detect all 7970s, older ATI and NVIDIA and use poclbm */
+		} else if (strstr(name, "Tahiti") || !clState->hasBitAlign) {
+			applog(LOG_INFO, "Selecting poclbm kernel");
+			clState->chosen_kernel = KL_POCLBM;
+		/* Use phatk for the rest R5xxx R6xxx */
+		} else {
+			applog(LOG_INFO, "Selecting phatk kernel");
+			clState->chosen_kernel = KL_PHATK;
+		}
+		gpus[gpu].kernel = clState->chosen_kernel;
+	} else
+		clState->chosen_kernel = gpus[gpu].kernel;
 
-	switch (chosen_kernel) {
+	/* For some reason 2 vectors is still better even if the card says
+	 * otherwise, and many cards lie about their max so use 256 as max
+	 * unless explicitly set on the command line. Tahiti prefers 1 */
+	if (strstr(name, "Tahiti"))
+		preferred_vwidth = 1;
+	else if (preferred_vwidth > 2)
+		preferred_vwidth = 2;
+
+	switch (clState->chosen_kernel) {
 		case KL_POCLBM:
-			strcpy(filename, "poclbm110817.cl");
-			strcpy(binaryfilename, "poclbm110817");
+			strcpy(filename, POCLBM_KERNNAME".cl");
+			strcpy(binaryfilename, POCLBM_KERNNAME);
 			break;
-		case KL_NONE: /* Shouldn't happen */
 		case KL_PHATK:
-			strcpy(filename, "phatk110817.cl");
-			strcpy(binaryfilename, "phatk110817");
+			strcpy(filename, PHATK_KERNNAME".cl");
+			strcpy(binaryfilename, PHATK_KERNNAME);
+			break;
+		case KL_DIAKGCN:
+			strcpy(filename, DIAKGCN_KERNNAME".cl");
+			strcpy(binaryfilename, DIAKGCN_KERNNAME);
 			break;
+		case KL_NONE: /* Shouldn't happen */
+		case KL_DIABLO:
+			strcpy(filename, DIABLO_KERNNAME".cl");
+			strcpy(binaryfilename, DIABLO_KERNNAME);
+			break;
+	}
+
+	if (gpus[gpu].vwidth)
+		clState->vwidth = gpus[gpu].vwidth;
+	else {
+		clState->vwidth = preferred_vwidth;
+		gpus[gpu].vwidth = preferred_vwidth;
 	}
 
+	if ((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO) &&
+		clState->vwidth == 1 && clState->hasOpenCL11plus)
+			clState->goffset = true;
+
+	if (gpus[gpu].work_size && gpus[gpu].work_size <= clState->max_work_size)
+		clState->wsize = gpus[gpu].work_size;
+	else if (strstr(name, "Tahiti"))
+		clState->wsize = 64;
+	else
+		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
+	gpus[gpu].work_size = clState->wsize;
+
 	FILE *binaryfile;
 	size_t *binary_sizes;
 	char **binaries;
 	int pl;
 	char *source = file_contents(filename, &pl);
 	size_t sourceSize[] = {(size_t)pl};
+	cl_uint slot, cpnd;
+
+	slot = cpnd = 0;
 
 	if (!source)
 		return NULL;
 
-	binary_sizes = (size_t *)malloc(sizeof(size_t)*numDevices);
+	binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
 	if (unlikely(!binary_sizes)) {
-		applog(LOG_ERR, "Unable to malloc binary_sizes");
+		applog(LOG_ERR, "Unable to calloc binary_sizes");
 		return NULL;
 	}
-	binaries = (char **)malloc(sizeof(char *)*numDevices);
+	binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
 	if (unlikely(!binaries)) {
-		applog(LOG_ERR, "Unable to malloc binaries");
+		applog(LOG_ERR, "Unable to calloc binaries");
 		return NULL;
 	}
 
 	strcat(binaryfilename, name);
-	if (clState->hasBitAlign)
-		strcat(binaryfilename, "bitalign");
-
+	if (clState->goffset)
+		strcat(binaryfilename, "g");
 	strcat(binaryfilename, "v");
-	sprintf(numbuf, "%d", clState->preferred_vwidth);
+	sprintf(numbuf, "%d", clState->vwidth);
 	strcat(binaryfilename, numbuf);
 	strcat(binaryfilename, "w");
-	sprintf(numbuf, "%d", (int)clState->work_size);
+	sprintf(numbuf, "%d", (int)clState->wsize);
 	strcat(binaryfilename, numbuf);
-	strcat(binaryfilename, "long");
+	strcat(binaryfilename, "l");
 	sprintf(numbuf, "%d", (int)sizeof(long));
 	strcat(binaryfilename, numbuf);
 	strcat(binaryfilename, ".bin");
 
 	binaryfile = fopen(binaryfilename, "rb");
 	if (!binaryfile) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "No binary found, generating from source");
+		applog(LOG_DEBUG, "No binary found, generating from source");
 	} else {
 		struct stat binary_stat;
 
 		if (unlikely(stat(binaryfilename, &binary_stat))) {
-			if (opt_debug)
-				applog(LOG_DEBUG, "Unable to stat binary, generating from source");
+			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
 			fclose(binaryfile);
 			goto build;
 		}
 		if (!binary_stat.st_size)
 			goto build;
 
-		binary_sizes[gpu] = binary_stat.st_size;
-		binaries[gpu] = (char *)malloc(binary_sizes[gpu]);
-		if (unlikely(!binaries[gpu])) {
-			applog(LOG_ERR, "Unable to malloc binaries");
+		binary_sizes[slot] = binary_stat.st_size;
+		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
+		if (unlikely(!binaries[slot])) {
+			applog(LOG_ERR, "Unable to calloc binaries");
 			fclose(binaryfile);
 			return NULL;
 		}
 
-		if (fread(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu]) {
-			applog(LOG_ERR, "Unable to fread binaries[gpu]");
+		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
+			applog(LOG_ERR, "Unable to fread binaries");
 			fclose(binaryfile);
-			free(binaries[gpu]);
+			free(binaries[slot]);
 			goto build;
 		}
 
-		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
+		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
 		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
+			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
 			fclose(binaryfile);
-			free(binaries[gpu]);
+			free(binaries[slot]);
 			goto build;
 		}
-		fclose(binaryfile);
-		if (opt_debug)
-			applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);
 
-		/* We don't need to patch this already loaded image, but need to
-		 * set the flag for status later */
-		if (clState->hasBitAlign)
-			patchbfi = true;
+		fclose(binaryfile);
+		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);
 
-		free(binaries[gpu]);
 		goto built;
 	}
 
@@ -455,30 +498,22 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize)
 build:
 	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithSource)");
-		return NULL;
-	}
-
-	clRetainProgram(clState->program);
-	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
+		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
 		return NULL;
 	}
 
 	/* create a cl program executable for all the devices specified */
-	char CompilerOptions[256];
+	char *CompilerOptions = calloc(1, 256);
 
-	sprintf(CompilerOptions, "-DWORKSIZE=%d -DVECTORS%d",
-		(int)clState->work_size, clState->preferred_vwidth);
-	if (opt_debug)
-		applog(LOG_DEBUG, "Setting worksize to %d", clState->work_size);
-	if (clState->preferred_vwidth > 1 && opt_debug)
-		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->preferred_vwidth);
+	sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
+		(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
+	applog(LOG_DEBUG, "Setting worksize to %d", clState->wsize);
+	if (clState->vwidth > 1)
+		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
 
 	if (clState->hasBitAlign) {
-		strcat(CompilerOptions, " -DBITALIGN");
-		if (opt_debug)
-			applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
+		strcat(CompilerOptions, " -D BITALIGN");
+		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
 		if (strstr(name, "Cedar") ||
 		    strstr(name, "Redwood") ||
 		    strstr(name, "Juniper") ||
@@ -494,54 +529,72 @@ build:
 		    strstr(name, "WinterPark" ) ||
 		    strstr(name, "BeaverCreek" ))
 			patchbfi = true;
-	} else if (opt_debug)
+	} else
 		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");
 
 	if (patchbfi) {
-		strcat(CompilerOptions, " -DBFI_INT");
-		if (opt_debug)
-			applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
-	} else if (opt_debug)
+		strcat(CompilerOptions, " -D BFI_INT");
+		applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
+	} else
 		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");
 
+	if (clState->goffset)
+		strcat(CompilerOptions, " -D GOFFSET");
+
+	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
 	status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL);
+	free(CompilerOptions);
 
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
+		applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
 		size_t logSize;
 		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
 
 		char *log = malloc(logSize);
 		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
-		applog(LOG_INFO, "%s", log);
+		applog(LOG_ERR, "%s", log);
 		return NULL;
 	}
 
-	status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*numDevices, binary_sizes, NULL );
+	prog_built = true;
+
+	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
 	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetPlatformInfo)");
+		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
 		return NULL;
 	}
 
+	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
+	if (unlikely(status != CL_SUCCESS)) {
+		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
+		return NULL;
+	}
+
+	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
+	 * to iterate over all the binary slots and find where the real program
+	 * is. What the heck is this!? */
+	for (slot = 0; slot < cpnd; slot++)
+		if (binary_sizes[slot])
+			break;
+
 	/* copy over all of the generated binaries. */
-	if (opt_debug)
-		applog(LOG_DEBUG, "binary size %d : %d", gpu, binary_sizes[gpu]);
-	if (!binary_sizes[gpu]) {
-		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, may need to reboot!");
+	applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, binary_sizes[slot]);
+	if (!binary_sizes[slot]) {
+		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
 		return NULL;
 	}
-	binaries[gpu] = (char *)malloc( sizeof(char)*binary_sizes[gpu]);
-	status = clGetProgramInfo( clState->program, CL_PROGRAM_BINARIES, sizeof(char *)*numDevices, binaries, NULL );
+	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
+	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
 	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error: Getting program info. (clGetPlatformInfo)");
+		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
 		return NULL;
 	}
 
 	/* Patch the kernel if the hardware supports BFI_INT but it needs to
 	 * be hacked in */
 	if (patchbfi) {
-		unsigned remaining = binary_sizes[gpu];
-		char *w = binaries[gpu];
+		unsigned remaining = binary_sizes[slot];
+		char *w = binaries[slot];
 		unsigned int start, length;
 
 		/* Find 2nd incidence of .text, and copy the program's
@@ -549,7 +602,7 @@ build:
 		* back and find the 2nd incidence of \x7ELF (rewind by one
 		* from ELF) and then patch the opcocdes */
 		if (!advance(&w, &remaining, ".text"))
-			{patchbfi = 0; goto build;}
+			goto build;
 		w++; remaining--;
 		if (!advance(&w, &remaining, ".text")) {
 			/* 32 bit builds only one ELF */
@@ -557,9 +610,9 @@ build:
 		}
 		memcpy(&start, w + 285, 4);
 		memcpy(&length, w + 289, 4);
-		w = binaries[gpu]; remaining = binary_sizes[gpu];
+		w = binaries[slot]; remaining = binary_sizes[slot];
 		if (!advance(&w, &remaining, "ELF"))
-			{patchbfi = 0; goto build;}
+			goto build;
 		w++; remaining--;
 		if (!advance(&w, &remaining, "ELF")) {
 			/* 32 bit builds only one ELF */
@@ -567,28 +620,24 @@ build:
 		}
 		w--; remaining++;
 		w += start; remaining -= start;
-		if (opt_debug)
-			applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
-				w, remaining);
+		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
+			w, remaining);
 		patch_opcodes(w, length);
 
 		status = clReleaseProgram(clState->program);
 		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error: Releasing program. (clReleaseProgram)");
+			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
 			return NULL;
 		}
 
-		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[gpu], (const unsigned char **)&binaries[gpu], &status, NULL);
+		clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
 		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error: Loading Binary into cl_program (clCreateProgramWithBinary)");
+			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
 			return NULL;
 		}
 
-		clRetainProgram(clState->program);
-		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error: Retaining Program (clRetainProgram)");
-			return NULL;
-		}
+		/* Program needs to be rebuilt */
+		prog_built = false;
 	}
 
 	free(source);
@@ -597,41 +646,42 @@ build:
 	binaryfile = fopen(binaryfilename, "wb");
 	if (!binaryfile) {
 		/* Not a fatal problem, just means we build it again next time */
-		if (opt_debug)
-			applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
+		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
 	} else {
-		if (unlikely(fwrite(binaries[gpu], 1, binary_sizes[gpu], binaryfile) != binary_sizes[gpu])) {
+		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
 			applog(LOG_ERR, "Unable to fwrite to binaryfile");
 			return NULL;
 		}
 		fclose(binaryfile);
 	}
-	if (binaries[gpu])
-		free(binaries[gpu]);
 built:
+	if (binaries[slot])
+		free(binaries[slot]);
 	free(binaries);
 	free(binary_sizes);
 
-	applog(LOG_INFO, "Initialising kernel %s with%s BFI_INT, %d vectors and worksize %d",
-	       filename, patchbfi ? "" : "out", clState->preferred_vwidth, clState->work_size);
+	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d",
+	       filename, clState->hasBitAlign ? "" : "out", clState->vwidth, clState->wsize);
 
-	/* create a cl program executable for all the devices specified */
-	status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
-	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Building Program (clBuildProgram)");
-		size_t logSize;
-		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+	if (!prog_built) {
+		/* create a cl program executable for all the devices specified */
+		status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL);
+		if (status != CL_SUCCESS) {
+			applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
+			size_t logSize;
+			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
 
-		char *log = malloc(logSize);
-		status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
-		applog(LOG_INFO, "%s", log);
-		return NULL;
+			char *log = malloc(logSize);
+			status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
+			applog(LOG_ERR, "%s", log);
+			return NULL;
+		}
 	}
 
 	/* get a kernel object handle for a kernel with the given name */
 	clState->kernel = clCreateKernel(clState->program, "search", &status);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: Creating Kernel from program. (clCreateKernel)");
+		applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
 		return NULL;
 	}
 
@@ -643,13 +693,13 @@ built:
 	if (status != CL_SUCCESS) /* Try again without OOE enable */
 		clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Creating Command Queue. (clCreateCommandQueue)");
+		applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status);
 		return NULL;
 	}
 
-	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, BUFFERSIZE, NULL, &status);
+	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);
 	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error: clCreateBuffer (outputBuffer)");
+		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
 		return NULL;
 	}
 

+ 8 - 3
ocl.h

@@ -11,6 +11,8 @@
 #include <CL/cl.h>
 #endif
 
+#include "miner.h"
+
 typedef struct {
 	cl_context context;
 	cl_kernel kernel;
@@ -18,13 +20,16 @@ typedef struct {
 	cl_program program;
 	cl_mem outputBuffer;
 	bool hasBitAlign;
-	cl_uint preferred_vwidth;
+	bool hasOpenCL11plus;
+	bool goffset;
+	cl_uint vwidth;
 	size_t max_work_size;
-	size_t work_size;
+	size_t wsize;
+	enum cl_kernels chosen_kernel;
 } _clState;
 
 extern char *file_contents(const char *filename, int *length);
-extern int clDevicesNum();
+extern int clDevicesNum(void);
 extern _clState *initCl(unsigned int gpu, char *name, size_t nameSize);
 #endif /* HAVE_OPENCL */
 #endif /* __OCL_H__ */

+ 45 - 44
phatk110817.cl → phatk120223.cl

@@ -1,15 +1,13 @@
 // This file is taken and modified from the public-domain poclbm project, and
 // I have therefore decided to keep it public-domain.
-
+// Modified version copyright 2011-2012 Con Kolivas
 
 #ifdef VECTORS4
 	typedef uint4 u;
-#else 
-	#ifdef VECTORS2
-		typedef uint2 u;
-	#else
-		typedef uint u;
-	#endif
+#elif defined VECTORS2
+	typedef uint2 u;
+#else
+	typedef uint u;
 #endif
 
 __constant uint K[64] = { 
@@ -51,9 +49,6 @@ __constant uint H[8] = {
 #ifdef BITALIGN
 	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
 	#define rot(x, y) amd_bitalign(x, x, (uint)(32 - y))
-#else
-	#define rot(x, y) rotate(x, (uint)y)
-#endif
 
 // This part is not from the stock poclbm kernel. It's part of an optimization
 // added in the Phoenix Miner.
@@ -63,7 +58,7 @@ __constant uint H[8] = {
 // detected, use it for Ch. Otherwise, construct Ch out of simpler logical
 // primitives.
 
-#ifdef BFI_INT
+ #ifdef BFI_INT
 	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
 	// OpenCL (or CAL IL for that matter) in any way. However, there is 
 	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
@@ -75,13 +70,23 @@ __constant uint H[8] = {
 	#define Ch(x, y, z) amd_bytealign(x,y,z)
 	// Ma can also be implemented in terms of BFI_INT...
 	#define Ma(z, x, y) amd_bytealign(z^x,y,x)
-#else
-	#define Ch(x, y, z) bitselect(x,y,z)
-	// Ma can also be implemented in terms of bitselect
-	#define Ma(z, x, y) bitselect(z^x,y,x)
+ #else // BFI_INT
+	// Later SDKs optimise this to BFI_INT without patching, and GCN
+	// actually fails if manually patched with BFI_INT
+
+	#define Ch(x, y, z) bitselect((u)z, (u)y, (u)x)
+	#define Ma(x, y, z) bitselect((u)x, (u)y, (u)z ^ (u)x)
+	#define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y)
+ #endif
+#else // BITALIGN
+	#define Ch(x, y, z) (z ^ (x & (y ^ z)))
+	#define Ma(x, y, z) ((x & z) | (y & (x | z)))
+	#define rot(x, y) rotate((u)x, (u)y)
+	#define rotr(x, y) rotate((u)x, (u)(32-y))
 #endif
 
 
+
 //Various intermediate calculations for each SHA round
 #define s0(n) (S0(Vals[(0 + 128 - (n)) % 8]))
 #define S0(n) (rot(n, 30u)^rot(n, 19u)^rot(n,10u))
@@ -168,7 +173,7 @@ void search(	const uint state0, const uint state1, const uint state2, const uint
 
 //Dummy Variable to prevent compiler from reordering between rounds
 	u t1;
-	
+
 	//Vals[0]=state0;
 	Vals[1]=B1;
 	Vals[2]=C1;
@@ -187,16 +192,14 @@ void search(	const uint state0, const uint state1, const uint state2, const uint
 	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
 	//Since only the 2 LSB is opposite between the nonces, we can save an instruction by flipping the 4 bits in W18 rather than the 1 bit in W3
 	W[18] = PreW18 + (u){r, r ^ 0x2004000U, r ^ 0x4008000U, r ^ 0x600C000U};
+#elif defined VECTORS2
+	W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
+	uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
+	W[18] = PreW18 + (u){r, r ^ 0x2004000U};
 #else
-	#ifdef VECTORS2
-		W[3] = base + (uint)(get_local_id(0)) * 2u + (uint)(get_group_id(0)) * (WORKSIZE * 2u);
-		uint r = rot(W[3].x,25u)^rot(W[3].x,14u)^((W[3].x)>>3U);
-		W[18] = PreW18 + (u){r, r ^ 0x2004000U};
-	#else
-		W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
-		u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
-		W[18] = PreW18 + r;
-	#endif
+	W[3] = base + get_local_id(0) + get_group_id(0) * (WORKSIZE);
+	u r = rot(W[3],25u)^rot(W[3],14u)^((W[3])>>3U);
+	W[18] = PreW18 + r;
 #endif
 	//the order of the W calcs and Rounds is like this because the compiler needs help finding how to order the instructions
 
@@ -381,36 +384,34 @@ void search(	const uint state0, const uint state1, const uint state2, const uint
 	sharoundW(64 + 57);
 	sharoundW(64 + 58);
 
-	u v = W[117] + W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) ^
-		-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64));
+	W[117] += W[108] + Vals[3] + Vals[7] + P2(124) + P1(124) + Ch((Vals[0] + Vals[4]) + (K[59] + W(59+64)) + s1(64+59)+ ch(59+64),Vals[1],Vals[2]) -
+		(-(K[60] + H[7]) - S1((Vals[0] + Vals[4]) + (K[59] + W(59+64))  + s1(64+59)+ ch(59+64)));
 
 #define FOUND (0x80)
 #define NFLAG (0x7F)
 
 #ifdef VECTORS4
-	bool result = v.x & v.y & v.z & v.w;
+	bool result = W[117].x & W[117].y & W[117].z & W[117].w;
 	if (!result) {
-		if (!v.x)
+		if (!W[117].x)
 			output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
-		if (!v.y)
+		if (!W[117].y)
 			output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
-		if (!v.z)
+		if (!W[117].z)
 			output[FOUND] = output[NFLAG & W[3].z] = W[3].z;
-		if (!v.w)
+		if (!W[117].w)
 			output[FOUND] = output[NFLAG & W[3].w] = W[3].w;
 	}
+#elif defined VECTORS2
+	bool result = W[117].x & W[117].y;
+	if (!result) {
+		if (!W[117].x)
+			output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
+		if (!W[117].y)
+			output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
+	}
 #else
-	#ifdef VECTORS2
-		bool result = v.x & v.y;
-		if (!result) {
-			if (!v.x)
-				output[FOUND] = output[NFLAG & W[3].x] = W[3].x;
-			if (!v.y)
-				output[FOUND] = output[NFLAG & W[3].y] = W[3].y;
-		}
-	#else
-		if (!v)
-			output[FOUND] = output[NFLAG & W[3]] = W[3];
-	#endif
+	if (!W[117])
+		output[FOUND] = output[NFLAG & W[3]] = W[3];
 #endif
 }

+ 0 - 650
poclbm110817.cl

@@ -1,650 +0,0 @@
-// -ck modified kernel taken from Phoenix taken from poclbm, with aspects of
-// phatk and others.
-// Modified version copyright 2011 Con Kolivas
-
-// This file is taken and modified from the public-domain poclbm project, and
-// we have therefore decided to keep it public-domain in Phoenix.
-
-#ifdef VECTORS4
-	typedef uint4 u;
-#elif defined VECTORS2
-	typedef uint2 u;
-#else
-	typedef uint u;
-#endif
-
-__constant uint K[64] = { 
-    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
-    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
-    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
-    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
-    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
-    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
-    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
-    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
-};
-
-
-// This part is not from the stock poclbm kernel. It's part of an optimization
-// added in the Phoenix Miner.
-
-// Some AMD devices have a BFI_INT opcode, which behaves exactly like the
-// SHA-256 ch function, but provides it in exactly one instruction. If
-// detected, use it for ch. Otherwise, construct ch out of simpler logical
-// primitives.
-
-#ifdef BFI_INT
-	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
-	// OpenCL (or CAL IL for that matter) in any way. However, there is 
-	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
-	// amd_bytealign, takes the same inputs, and provides the same output. 
-	// We can use that as a placeholder for BFI_INT and have the application 
-	// patch it after compilation.
-	
-	// This is the BFI_INT function
-	#define ch(x, y, z) amd_bytealign(x, y, z)
-	
-	// Ma can also be implemented in terms of BFI_INT...
-	#define Ma(x, y, z) amd_bytealign( (z^x), (y), (x) )
-#else
-	#define ch(x, y, z) (z ^ (x & (y ^ z)))
-	#define Ma(x, y, z) ((x & z) | (y & (x | z)))
-#endif
-
-#ifdef BITALIGN
-	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
-	#define rotr(x, y) amd_bitalign((u)x, (u)x, (u)y)
-#else
-	#define rotr(x, y) rotate((u)x, (u)(32-y))
-#endif
-
-// AMD's KernelAnalyzer throws errors compiling the kernel if we use 
-// amd_bytealign on constants with vectors enabled, so we use this to avoid 
-// problems. (this is used 4 times, and likely optimized out by the compiler.)
-#define Ma2(x, y, z) ((y & z) | (x & (y | z)))
-
-__kernel void search(	const uint state0, const uint state1, const uint state2, const uint state3,
-						const uint state4, const uint state5, const uint state6, const uint state7,
-						const uint b1, const uint c1, const uint d1,
-						const uint f1, const uint g1, const uint h1,
-						const uint base,
-						const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r, const uint fcty_e, const uint fcty_e2,
-						__global uint * output)
-{
-	u W[24];
-	u Vals[8];
-	u nonce;
-
-#ifdef VECTORS4
-	nonce = base + (get_global_id(0)<<2) + (uint4)(0, 1, 2, 3);
-#elif defined VECTORS2
-	nonce = base + (get_global_id(0)<<1) + (uint2)(0, 1);
-#else
-	nonce = base + get_global_id(0);
-#endif
-
-	W[3] = nonce + fw3;
-	Vals[4] = fcty_e +  nonce;
-	Vals[0] = state0 + Vals[4];
-	Vals[4] = Vals[4] + fcty_e2;
-	Vals[3] = d1 + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], b1, c1) + K[ 4] +  0x80000000;
-	Vals[7] = h1 + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma2(g1, Vals[4], f1);
-	Vals[2] = c1 + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], b1) + K[ 5];
-	Vals[6] = g1 + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma2(f1, Vals[3], Vals[4]);
-	Vals[1] = b1 + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[ 6];
-	Vals[5] = f1 + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[ 7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[ 8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[ 9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[15] + 0x00000280U;
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[16] + fw0;
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[17] + fw1;
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[2] = (rotr(nonce, 7) ^ rotr(nonce, 18) ^ (nonce >> 3U)) + fw2;
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[18] +  W[2];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[19] +  W[3];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[4] = (rotr(W[2], 17) ^ rotr(W[2], 19) ^ (W[2] >> 10U)) + 0x80000000;
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[20] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[5] = (rotr(W[3], 17) ^ rotr(W[3], 19) ^ (W[3] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[21] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[6] = (rotr(W[4], 17) ^ rotr(W[4], 19) ^ (W[4] >> 10U)) + 0x00000280U;
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[22] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[7] = (rotr(W[5], 17) ^ rotr(W[5], 19) ^ (W[5] >> 10U)) + fw0;
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[23] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[8] = (rotr(W[6], 17) ^ rotr(W[6], 19) ^ (W[6] >> 10U)) + fw1;
-	
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[24] +  W[8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[9] = W[2] + (rotr(W[7], 17) ^ rotr(W[7], 19) ^ (W[7] >> 10U));
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[25] +  W[9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[10] = W[3] + (rotr(W[8], 17) ^ rotr(W[8], 19) ^ (W[8] >> 10U));
-	
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[26] + W[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[11] = W[4] + (rotr(W[9], 17) ^ rotr(W[9], 19) ^ (W[9] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[27] + W[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[12] = W[5] + (rotr(W[10], 17) ^ rotr(W[10], 19) ^ (W[10] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[28] + W[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[13] = W[6] + (rotr(W[11], 17) ^ rotr(W[11], 19) ^ (W[11] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[29] + W[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[14] = 0x00a00055U + W[7] + (rotr(W[12], 17) ^ rotr(W[12], 19) ^ (W[12] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[30] + W[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[15] = fw15 + W[8] + (rotr(W[13], 17) ^ rotr(W[13], 19) ^ (W[13] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[31] + W[15];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[0] = fw01r + W[9] + (rotr(W[14], 17) ^ rotr(W[14], 19) ^ (W[14] >> 10U));
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[32] +  W[0];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[1] = fw1 + (rotr(W[2], 7) ^ rotr(W[2], 18) ^ (W[2] >> 3U)) + W[10] + (rotr(W[15], 17) ^ rotr(W[15], 19) ^ (W[15] >> 10U));
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[33] +  W[1];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[2] = W[2] + (rotr(W[3], 7) ^ rotr(W[3], 18) ^ (W[3] >> 3U)) + W[11] + (rotr(W[0], 17) ^ rotr(W[0], 19) ^ (W[0] >> 10U));
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[34] +  W[2];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[3] = W[3] + (rotr(W[4], 7) ^ rotr(W[4], 18) ^ (W[4] >> 3U)) + W[12] + (rotr(W[1], 17) ^ rotr(W[1], 19) ^ (W[1] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[35] +  W[3];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[4] = W[4] + (rotr(W[5], 7) ^ rotr(W[5], 18) ^ (W[5] >> 3U)) + W[13] + (rotr(W[2], 17) ^ rotr(W[2], 19) ^ (W[2] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[36] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[5] = W[5] + (rotr(W[6], 7) ^ rotr(W[6], 18) ^ (W[6] >> 3U)) + W[14] + (rotr(W[3], 17) ^ rotr(W[3], 19) ^ (W[3] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[37] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[6] = W[6] + (rotr(W[7], 7) ^ rotr(W[7], 18) ^ (W[7] >> 3U)) + W[15] + (rotr(W[4], 17) ^ rotr(W[4], 19) ^ (W[4] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[38] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[7] = W[7] + (rotr(W[8], 7) ^ rotr(W[8], 18) ^ (W[8] >> 3U)) + W[0] + (rotr(W[5], 17) ^ rotr(W[5], 19) ^ (W[5] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[39] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[8] = W[8] + (rotr(W[9], 7) ^ rotr(W[9], 18) ^ (W[9] >> 3U)) + W[1] + (rotr(W[6], 17) ^ rotr(W[6], 19) ^ (W[6] >> 10U));
-	
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[40] +  W[8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[9] = W[9] + (rotr(W[10], 7) ^ rotr(W[10], 18) ^ (W[10] >> 3U)) + W[2] + (rotr(W[7], 17) ^ rotr(W[7], 19) ^ (W[7] >> 10U));
-	
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[41] +  W[9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[10] = W[10] + (rotr(W[11], 7) ^ rotr(W[11], 18) ^ (W[11] >> 3U)) + W[3] + (rotr(W[8], 17) ^ rotr(W[8], 19) ^ (W[8] >> 10U));
-	
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[42] + W[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[11] = W[11] + (rotr(W[12], 7) ^ rotr(W[12], 18) ^ (W[12] >> 3U)) + W[4] + (rotr(W[9], 17) ^ rotr(W[9], 19) ^ (W[9] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[43] + W[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[12] = W[12] + (rotr(W[13], 7) ^ rotr(W[13], 18) ^ (W[13] >> 3U)) + W[5] + (rotr(W[10], 17) ^ rotr(W[10], 19) ^ (W[10] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[44] + W[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[13] = W[13] + (rotr(W[14], 7) ^ rotr(W[14], 18) ^ (W[14] >> 3U)) + W[6] + (rotr(W[11], 17) ^ rotr(W[11], 19) ^ (W[11] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[45] + W[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[14] = W[14] + (rotr(W[15], 7) ^ rotr(W[15], 18) ^ (W[15] >> 3U)) + W[7] + (rotr(W[12], 17) ^ rotr(W[12], 19) ^ (W[12] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[46] + W[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[15] = W[15] + (rotr(W[0], 7) ^ rotr(W[0], 18) ^ (W[0] >> 3U)) + W[8] + (rotr(W[13], 17) ^ rotr(W[13], 19) ^ (W[13] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[47] + W[15];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[0] = W[0] + (rotr(W[1], 7) ^ rotr(W[1], 18) ^ (W[1] >> 3U)) + W[9] + (rotr(W[14], 17) ^ rotr(W[14], 19) ^ (W[14] >> 10U));
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[48] +  W[0];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[1] = W[1] + (rotr(W[2], 7) ^ rotr(W[2], 18) ^ (W[2] >> 3U)) + W[10] + (rotr(W[15], 17) ^ rotr(W[15], 19) ^ (W[15] >> 10U));
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[49] +  W[1];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[2] = W[2] + (rotr(W[3], 7) ^ rotr(W[3], 18) ^ (W[3] >> 3U)) + W[11] + (rotr(W[0], 17) ^ rotr(W[0], 19) ^ (W[0] >> 10U));
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[50] +  W[2];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[3] = W[3] + (rotr(W[4], 7) ^ rotr(W[4], 18) ^ (W[4] >> 3U)) + W[12] + (rotr(W[1], 17) ^ rotr(W[1], 19) ^ (W[1] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[51] +  W[3];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[4] = W[4] + (rotr(W[5], 7) ^ rotr(W[5], 18) ^ (W[5] >> 3U)) + W[13] + (rotr(W[2], 17) ^ rotr(W[2], 19) ^ (W[2] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[52] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[5] = W[5] + (rotr(W[6], 7) ^ rotr(W[6], 18) ^ (W[6] >> 3U)) + W[14] + (rotr(W[3], 17) ^ rotr(W[3], 19) ^ (W[3] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[53] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[6] = W[6] + (rotr(W[7], 7) ^ rotr(W[7], 18) ^ (W[7] >> 3U)) + W[15] + (rotr(W[4], 17) ^ rotr(W[4], 19) ^ (W[4] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[54] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[7] = W[7] + (rotr(W[8], 7) ^ rotr(W[8], 18) ^ (W[8] >> 3U)) + W[0] + (rotr(W[5], 17) ^ rotr(W[5], 19) ^ (W[5] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[55] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[8] = W[8] + (rotr(W[9], 7) ^ rotr(W[9], 18) ^ (W[9] >> 3U)) + W[1] + (rotr(W[6], 17) ^ rotr(W[6], 19) ^ (W[6] >> 10U));
-	
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[56] +  W[8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[9] = W[9] + (rotr(W[10], 7) ^ rotr(W[10], 18) ^ (W[10] >> 3U)) + W[2] + (rotr(W[7], 17) ^ rotr(W[7], 19) ^ (W[7] >> 10U));
-	
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[57] +  W[9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[10] = W[10] + (rotr(W[11], 7) ^ rotr(W[11], 18) ^ (W[11] >> 3U)) + W[3] + (rotr(W[8], 17) ^ rotr(W[8], 19) ^ (W[8] >> 10U));
-	
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[58] + W[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[11] = W[11] + (rotr(W[12], 7) ^ rotr(W[12], 18) ^ (W[12] >> 3U)) + W[4] + (rotr(W[9], 17) ^ rotr(W[9], 19) ^ (W[9] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[59] + W[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[12] = W[12] + (rotr(W[13], 7) ^ rotr(W[13], 18) ^ (W[13] >> 3U)) + W[5] + (rotr(W[10], 17) ^ rotr(W[10], 19) ^ (W[10] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[60] + W[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[13] = W[13] + (rotr(W[14], 7) ^ rotr(W[14], 18) ^ (W[14] >> 3U)) + W[6] + (rotr(W[11], 17) ^ rotr(W[11], 19) ^ (W[11] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[61] + W[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[14] = W[14] + (rotr(W[15], 7) ^ rotr(W[15], 18) ^ (W[15] >> 3U)) + W[7] + (rotr(W[12], 17) ^ rotr(W[12], 19) ^ (W[12] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[62] + W[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[15] = W[15] + (rotr(W[0], 7) ^ rotr(W[0], 18) ^ (W[0] >> 3U)) + W[8] + (rotr(W[13], 17) ^ rotr(W[13], 19) ^ (W[13] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[63] + W[15];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-
-	W[0] = Vals[0] + state0;
-	W[1] = Vals[1] + state1;
-	W[2] = Vals[2] + state2;
-	W[3] = Vals[3] + state3;
-	W[4] = Vals[4] + state4;
-	W[5] = Vals[5] + state5;
-	W[6] = Vals[6] + state6;
-	W[7] = Vals[7] + state7;
-
-	Vals[7] = 0xb0edbdd0 + K[ 0] +  W[0];
-	Vals[3] = 0xa54ff53a + Vals[7];
-	Vals[7] = Vals[7] + 0x08909ae5U;
-	Vals[6] = 0x1f83d9abU + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + (0x9b05688cU ^ (Vals[3] & 0xca0b3af3U)) + K[ 1] +  W[1];
-	Vals[2] = 0x3c6ef372U + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) +  Ma2(0xbb67ae85U, Vals[7], 0x6a09e667U);
-	Vals[5] = 0x9b05688cU + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], 0x510e527fU) + K[ 2] +  W[2];
-	Vals[1] = 0xbb67ae85U + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma2(0x6a09e667U, Vals[6], Vals[7]);
-	Vals[4] = 0x510e527fU + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[ 3] +  W[3];
-	Vals[0] = 0x6a09e667U + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[ 4] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[ 5] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[ 6] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[ 7] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[ 8] +  0x80000000;
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[ 9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[15] + 0x00000100U;
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[0] = W[0] + (rotr(W[1], 7) ^ rotr(W[1], 18) ^ (W[1] >> 3U));
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[16] +  W[0];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[1] = W[1] + (rotr(W[2], 7) ^ rotr(W[2], 18) ^ (W[2] >> 3U)) + 0x00a00000U;
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[17] +  W[1];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[2] = W[2] + (rotr(W[3], 7) ^ rotr(W[3], 18) ^ (W[3] >> 3U)) + (rotr(W[0], 17) ^ rotr(W[0], 19) ^ (W[0] >> 10U));
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[18] +  W[2];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[3] = W[3] + (rotr(W[4], 7) ^ rotr(W[4], 18) ^ (W[4] >> 3U)) + (rotr(W[1], 17) ^ rotr(W[1], 19) ^ (W[1] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[19] +  W[3];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[4] = W[4] + (rotr(W[5], 7) ^ rotr(W[5], 18) ^ (W[5] >> 3U)) + (rotr(W[2], 17) ^ rotr(W[2], 19) ^ (W[2] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[20] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[5] = W[5] + (rotr(W[6], 7) ^ rotr(W[6], 18) ^ (W[6] >> 3U)) + (rotr(W[3], 17) ^ rotr(W[3], 19) ^ (W[3] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[21] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[6] = W[6] + (rotr(W[7], 7) ^ rotr(W[7], 18) ^ (W[7] >> 3U)) + 0x00000100U + (rotr(W[4], 17) ^ rotr(W[4], 19) ^ (W[4] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[22] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[7] = W[7] + 0x11002000U + W[0] + (rotr(W[5], 17) ^ rotr(W[5], 19) ^ (W[5] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[23] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[8] = 0x80000000 + W[1] + (rotr(W[6], 17) ^ rotr(W[6], 19) ^ (W[6] >> 10U));
-	
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[24] +  W[8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[9] = W[2] + (rotr(W[7], 17) ^ rotr(W[7], 19) ^ (W[7] >> 10U));
-	
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[25] +  W[9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[10] = W[3] + (rotr(W[8], 17) ^ rotr(W[8], 19) ^ (W[8] >> 10U));
-	
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[26] + W[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[11] = W[4] + (rotr(W[9], 17) ^ rotr(W[9], 19) ^ (W[9] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[27] + W[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[12] = W[5] + (rotr(W[10], 17) ^ rotr(W[10], 19) ^ (W[10] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[28] + W[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[13] = W[6] + (rotr(W[11], 17) ^ rotr(W[11], 19) ^ (W[11] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[29] + W[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[14] = 0x00400022U + W[7] + (rotr(W[12], 17) ^ rotr(W[12], 19) ^ (W[12] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[30] + W[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[15] = 0x00000100U + (rotr(W[0], 7) ^ rotr(W[0], 18) ^ (W[0] >> 3U)) + W[8] + (rotr(W[13], 17) ^ rotr(W[13], 19) ^ (W[13] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[31] + W[15];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[0] = W[0] + (rotr(W[1], 7) ^ rotr(W[1], 18) ^ (W[1] >> 3U)) + W[9] + (rotr(W[14], 17) ^ rotr(W[14], 19) ^ (W[14] >> 10U));
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[32] +  W[0];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[1] = W[1] + (rotr(W[2], 7) ^ rotr(W[2], 18) ^ (W[2] >> 3U)) + W[10] + (rotr(W[15], 17) ^ rotr(W[15], 19) ^ (W[15] >> 10U));
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[33] +  W[1];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[2] = W[2] + (rotr(W[3], 7) ^ rotr(W[3], 18) ^ (W[3] >> 3U)) + W[11] + (rotr(W[0], 17) ^ rotr(W[0], 19) ^ (W[0] >> 10U));
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[34] +  W[2];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[3] = W[3] + (rotr(W[4], 7) ^ rotr(W[4], 18) ^ (W[4] >> 3U)) + W[12] + (rotr(W[1], 17) ^ rotr(W[1], 19) ^ (W[1] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[35] +  W[3];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[4] = W[4] + (rotr(W[5], 7) ^ rotr(W[5], 18) ^ (W[5] >> 3U)) + W[13] + (rotr(W[2], 17) ^ rotr(W[2], 19) ^ (W[2] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[36] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[5] = W[5] + (rotr(W[6], 7) ^ rotr(W[6], 18) ^ (W[6] >> 3U)) + W[14] + (rotr(W[3], 17) ^ rotr(W[3], 19) ^ (W[3] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[37] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[6] = W[6] + (rotr(W[7], 7) ^ rotr(W[7], 18) ^ (W[7] >> 3U)) + W[15] + (rotr(W[4], 17) ^ rotr(W[4], 19) ^ (W[4] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[38] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[7] = W[7] + (rotr(W[8], 7) ^ rotr(W[8], 18) ^ (W[8] >> 3U)) + W[0] + (rotr(W[5], 17) ^ rotr(W[5], 19) ^ (W[5] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[39] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[8] = W[8] + (rotr(W[9], 7) ^ rotr(W[9], 18) ^ (W[9] >> 3U)) + W[1] + (rotr(W[6], 17) ^ rotr(W[6], 19) ^ (W[6] >> 10U));
-	
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[40] +  W[8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[9] = W[9] + (rotr(W[10], 7) ^ rotr(W[10], 18) ^ (W[10] >> 3U)) + W[2] + (rotr(W[7], 17) ^ rotr(W[7], 19) ^ (W[7] >> 10U));
-	
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[41] +  W[9];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[10] = W[10] + (rotr(W[11], 7) ^ rotr(W[11], 18) ^ (W[11] >> 3U)) + W[3] + (rotr(W[8], 17) ^ rotr(W[8], 19) ^ (W[8] >> 10U));
-	
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[42] + W[10];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[11] = W[11] + (rotr(W[12], 7) ^ rotr(W[12], 18) ^ (W[12] >> 3U)) + W[4] + (rotr(W[9], 17) ^ rotr(W[9], 19) ^ (W[9] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[43] + W[11];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[12] = W[12] + (rotr(W[13], 7) ^ rotr(W[13], 18) ^ (W[13] >> 3U)) + W[5] + (rotr(W[10], 17) ^ rotr(W[10], 19) ^ (W[10] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[44] + W[12];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[13] = W[13] + (rotr(W[14], 7) ^ rotr(W[14], 18) ^ (W[14] >> 3U)) + W[6] + (rotr(W[11], 17) ^ rotr(W[11], 19) ^ (W[11] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[45] + W[13];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[14] = W[14] + (rotr(W[15], 7) ^ rotr(W[15], 18) ^ (W[15] >> 3U)) + W[7] + (rotr(W[12], 17) ^ rotr(W[12], 19) ^ (W[12] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[46] + W[14];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[15] = W[15] + (rotr(W[0], 7) ^ rotr(W[0], 18) ^ (W[0] >> 3U)) + W[8] + (rotr(W[13], 17) ^ rotr(W[13], 19) ^ (W[13] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[47] + W[15];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[0] = W[0] + (rotr(W[1], 7) ^ rotr(W[1], 18) ^ (W[1] >> 3U)) + W[9] + (rotr(W[14], 17) ^ rotr(W[14], 19) ^ (W[14] >> 10U));
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[48] +  W[0];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[1] = W[1] + (rotr(W[2], 7) ^ rotr(W[2], 18) ^ (W[2] >> 3U)) + W[10] + (rotr(W[15], 17) ^ rotr(W[15], 19) ^ (W[15] >> 10U));
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[49] +  W[1];
-	Vals[2] = Vals[2] + Vals[6];
-	Vals[6] = Vals[6] + (rotr(Vals[7], 2) ^ rotr(Vals[7], 13) ^ rotr(Vals[7], 22)) + Ma(Vals[1], Vals[7], Vals[0]);
-	W[2] = W[2] + (rotr(W[3], 7) ^ rotr(W[3], 18) ^ (W[3] >> 3U)) + W[11] + (rotr(W[0], 17) ^ rotr(W[0], 19) ^ (W[0] >> 10U));
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[50] +  W[2];
-	Vals[1] = Vals[1] + Vals[5];
-	Vals[5] = Vals[5] + (rotr(Vals[6], 2) ^ rotr(Vals[6], 13) ^ rotr(Vals[6], 22)) + Ma(Vals[0], Vals[6], Vals[7]);
-	W[3] = W[3] + (rotr(W[4], 7) ^ rotr(W[4], 18) ^ (W[4] >> 3U)) + W[12] + (rotr(W[1], 17) ^ rotr(W[1], 19) ^ (W[1] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[51] +  W[3];
-	Vals[0] = Vals[0] + Vals[4];
-	Vals[4] = Vals[4] + (rotr(Vals[5], 2) ^ rotr(Vals[5], 13) ^ rotr(Vals[5], 22)) + Ma(Vals[7], Vals[5], Vals[6]);
-	W[4] = W[4] + (rotr(W[5], 7) ^ rotr(W[5], 18) ^ (W[5] >> 3U)) + W[13] + (rotr(W[2], 17) ^ rotr(W[2], 19) ^ (W[2] >> 10U));
-	
-	Vals[3] = Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[52] +  W[4];
-	Vals[7] = Vals[7] + Vals[3];
-	Vals[3] = Vals[3] + (rotr(Vals[4], 2) ^ rotr(Vals[4], 13) ^ rotr(Vals[4], 22)) + Ma(Vals[6], Vals[4], Vals[5]);
-	W[5] = W[5] + (rotr(W[6], 7) ^ rotr(W[6], 18) ^ (W[6] >> 3U)) + W[14] + (rotr(W[3], 17) ^ rotr(W[3], 19) ^ (W[3] >> 10U));
-	
-	Vals[2] = Vals[2] + (rotr(Vals[7], 6) ^ rotr(Vals[7], 11) ^ rotr(Vals[7], 25)) + ch(Vals[7], Vals[0], Vals[1]) + K[53] +  W[5];
-	Vals[6] = Vals[6] + Vals[2];
-	Vals[2] = Vals[2] + (rotr(Vals[3], 2) ^ rotr(Vals[3], 13) ^ rotr(Vals[3], 22)) + Ma(Vals[5], Vals[3], Vals[4]);
-	W[6] = W[6] + (rotr(W[7], 7) ^ rotr(W[7], 18) ^ (W[7] >> 3U)) + W[15] + (rotr(W[4], 17) ^ rotr(W[4], 19) ^ (W[4] >> 10U));
-	
-	Vals[1] = Vals[1] + (rotr(Vals[6], 6) ^ rotr(Vals[6], 11) ^ rotr(Vals[6], 25)) + ch(Vals[6], Vals[7], Vals[0]) + K[54] +  W[6];
-	Vals[5] = Vals[5] + Vals[1];
-	Vals[1] = Vals[1] + (rotr(Vals[2], 2) ^ rotr(Vals[2], 13) ^ rotr(Vals[2], 22)) + Ma(Vals[4], Vals[2], Vals[3]);
-	W[7] = W[7] + (rotr(W[8], 7) ^ rotr(W[8], 18) ^ (W[8] >> 3U)) + W[0] + (rotr(W[5], 17) ^ rotr(W[5], 19) ^ (W[5] >> 10U));
-	
-	Vals[0] = Vals[0] + (rotr(Vals[5], 6) ^ rotr(Vals[5], 11) ^ rotr(Vals[5], 25)) + ch(Vals[5], Vals[6], Vals[7]) + K[55] +  W[7];
-	Vals[4] = Vals[4] + Vals[0];
-	Vals[0] = Vals[0] + (rotr(Vals[1], 2) ^ rotr(Vals[1], 13) ^ rotr(Vals[1], 22)) + Ma(Vals[3], Vals[1], Vals[2]);
-	W[8] = W[8] + (rotr(W[9], 7) ^ rotr(W[9], 18) ^ (W[9] >> 3U)) + W[1] + (rotr(W[6], 17) ^ rotr(W[6], 19) ^ (W[6] >> 10U));
-	
-	Vals[7] = Vals[7] + (rotr(Vals[4], 6) ^ rotr(Vals[4], 11) ^ rotr(Vals[4], 25)) + ch(Vals[4], Vals[5], Vals[6]) + K[56] +  W[8];
-	Vals[3] = Vals[3] + Vals[7];
-	Vals[7] = Vals[7] + (rotr(Vals[0], 2) ^ rotr(Vals[0], 13) ^ rotr(Vals[0], 22)) + Ma(Vals[2], Vals[0], Vals[1]);
-	W[9] = W[9] + (rotr(W[10], 7) ^ rotr(W[10], 18) ^ (W[10] >> 3U)) + W[2] + (rotr(W[7], 17) ^ rotr(W[7], 19) ^ (W[7] >> 10U));
-	
-	Vals[6] = Vals[6] + (rotr(Vals[3], 6) ^ rotr(Vals[3], 11) ^ rotr(Vals[3], 25)) + ch(Vals[3], Vals[4], Vals[5]) + K[57] +  W[9];
-	Vals[2] = Vals[2] + Vals[6];
-	W[10] = W[10] + (rotr(W[11], 7) ^ rotr(W[11], 18) ^ (W[11] >> 3U)) + W[3] + (rotr(W[8], 17) ^ rotr(W[8], 19) ^ (W[8] >> 10U));
-	
-	Vals[5] = Vals[5] + (rotr(Vals[2], 6) ^ rotr(Vals[2], 11) ^ rotr(Vals[2], 25)) + ch(Vals[2], Vals[3], Vals[4]) + K[58] + W[10];
-	Vals[1] = Vals[1] + Vals[5];
-	W[11] = W[11] + (rotr(W[12], 7) ^ rotr(W[12], 18) ^ (W[12] >> 3U)) + W[4] + (rotr(W[9], 17) ^ rotr(W[9], 19) ^ (W[9] >> 10U));
-	
-	Vals[4] = Vals[4] + (rotr(Vals[1], 6) ^ rotr(Vals[1], 11) ^ rotr(Vals[1], 25)) + ch(Vals[1], Vals[2], Vals[3]) + K[59] + W[11];
-	Vals[0] = Vals[0] + Vals[4];
-	W[12] = W[12] + (rotr(W[13], 7) ^ rotr(W[13], 18) ^ (W[13] >> 3U)) + W[5] + (rotr(W[10], 17) ^ rotr(W[10], 19) ^ (W[10] >> 10U));
-	
-	Vals[7] = Vals[7] + Vals[3] + (rotr(Vals[0], 6) ^ rotr(Vals[0], 11) ^ rotr(Vals[0], 25)) + ch(Vals[0], Vals[1], Vals[2]) + K[60] + W[12];
-	Vals[7] ^= -0x5be0cd19U;
-
-#define FOUND (0x80)
-#define NFLAG (0x7F)
-
-#if defined(VECTORS4)
-	bool result = Vals[7].x & Vals[7].y & Vals[7].z & Vals[7].w;
-	if (!result) {
-		if (!Vals[7].x)
-			output[FOUND] = output[NFLAG & nonce.x] =  nonce.x;
-		if (!Vals[7].y)
-			output[FOUND] = output[NFLAG & nonce.y] =  nonce.y;
-		if (!Vals[7].z)
-			output[FOUND] = output[NFLAG & nonce.z] =  nonce.z;
-		if (!Vals[7].w)
-			output[FOUND] = output[NFLAG & nonce.w] =  nonce.w;
-	}
-#elif defined(VECTORS2)
-	bool result = Vals[7].x & Vals[7].y;
-	if (!result) {
-		if (!Vals[7].x)
-			output[FOUND] = output[NFLAG & nonce.x] =  nonce.x;
-		if (!Vals[7].y)
-			output[FOUND] = output[NFLAG & nonce.y] =  nonce.y;
-	}
-#else
-	if (!Vals[7])
-		output[FOUND] = output[NFLAG & nonce] =  nonce;
-#endif
-}

+ 1353 - 0
poclbm120327.cl

@@ -0,0 +1,1353 @@
+// -ck modified kernel taken from Phoenix taken from poclbm, with aspects of
+// phatk and others.
+// Modified version copyright 2011-2012 Con Kolivas
+
+// This file is taken and modified from the public-domain poclbm project, and
+// we have therefore decided to keep it public-domain in Phoenix.
+
+#if defined(VECTORS4)
+	typedef uint4 u;	// working word type: 4-wide vector
+#elif defined(VECTORS2)
+	typedef uint2 u;	// working word type: 2-wide vector
+#else
+	typedef uint u;		// working word type: scalar
+#endif
+
+__constant uint K[64] = { // SHA-256 round constants, indexed by round number as K[n]
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+
+// This part is not from the stock poclbm kernel. It's part of an optimization
+// added in the Phoenix Miner.
+
+// Some AMD devices have a BFI_INT opcode, which behaves exactly like the
+// SHA-256 ch function, but provides it in exactly one instruction. If
+// detected, use it for ch. Otherwise, construct ch out of simpler logical
+// primitives.
+
+#ifdef BITALIGN
+	#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+	// rotr(x, y): rotate x right by y bits, by bit-aligning x with itself.
+	#define rotr(x, y) amd_bitalign((u)(x), (u)(x), (u)(y))
+#else
+	// rotate() rotates left, so rotate by (32 - y) to rotate right by y.
+	#define rotr(x, y) rotate((u)(x), (u)(32 - (y)))
+#endif
+#ifdef BFI_INT
+	// Well, slight problem... It turns out BFI_INT isn't actually exposed to
+	// OpenCL (or CAL IL for that matter) in any way. However, there is
+	// a similar instruction, BYTE_ALIGN_INT, which is exposed to OpenCL via
+	// amd_bytealign, takes the same inputs, and provides the same output.
+	// We can use that as a placeholder for BFI_INT and have the application
+	// patch it after compilation.
+	
+	// This is the BFI_INT function (a placeholder until the binary is patched)
+	#define ch(x, y, z) amd_bytealign(x, y, z)
+	
+	// Ma can also be implemented in terms of BFI_INT...
+	#define Ma(x, y, z) amd_bytealign(((z) ^ (x)), (y), (x))
+	// AMD's KernelAnalyzer throws errors compiling the kernel if we use
+	// amd_bytealign on constants with vectors enabled, so we use this to avoid
+	// problems. (this is used 4 times, and likely optimized out by the compiler.)
+	#define Ma2(x, y, z) bitselect((u)(x), (u)(y), (u)(z) ^ (u)(x))
+#else // BFI_INT
+	//GCN actually fails if manually patched with BFI_INT
+	#define ch(x, y, z) bitselect((u)(z), (u)(y), (u)(x))
+	#define Ma(x, y, z) bitselect((u)(x), (u)(y), (u)(z) ^ (u)(x))
+	#define Ma2(x, y, z) Ma(x, y, z)
+#endif
+
+
+__kernel
+__attribute__((vec_type_hint(u)))
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+void search(const uint state0, const uint state1, const uint state2, const uint state3,
+	const uint state4, const uint state5, const uint state6, const uint state7,
+	const uint b1, const uint c1,
+	const uint f1, const uint g1, const uint h1,
+#ifndef GOFFSET
+	const u base,
+#endif
+	const uint fw0, const uint fw1, const uint fw2, const uint fw3, const uint fw15, const uint fw01r,
+	const uint D1A, const uint C1addK5, const uint B1addK6,
+	const uint W16addK16, const uint W17addK17,
+	const uint PreVal4addT1, const uint Preval0,
+	__global uint * output)
+{
+	u Vals[24];
+	u *W = &Vals[8];
+
+#ifdef GOFFSET
+	const u nonce = (uint)(get_global_id(0));
+#else
+	const u nonce = base + (uint)(get_global_id(0));
+#endif
+
+Vals[5]=Preval0;
+Vals[5]+=nonce;
+
+Vals[0]=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],b1,c1);
+Vals[0]+=D1A;
+
+Vals[2]=Vals[0];
+Vals[2]+=h1;
+
+Vals[1]=PreVal4addT1;
+Vals[1]+=nonce;
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+
+Vals[6]=C1addK5;
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],b1);
+
+Vals[3]=Vals[6];
+Vals[3]+=g1;
+Vals[0]+=Ma2(g1,Vals[1],f1);
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma2(f1,Vals[0],Vals[1]);
+
+Vals[7]=B1addK6;
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+
+Vals[4]=Vals[7];
+Vals[4]+=f1;
+
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[7];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[8];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[9];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[10];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[11];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[12];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[13];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[14];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=0xC19BF3F4U;
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=W16addK16;
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=W17addK17;
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]=(rotr(nonce,7)^rotr(nonce,18)^(nonce>>3U));
+W[2]+=fw2;
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[18];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]=nonce;
+W[3]+=fw3;
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[19];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+W[4]+=0x80000000U;
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[20];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[21];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+W[6]+=0x00000280U;
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[22];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+W[7]+=fw0;
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[23];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+W[8]+=fw1;
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[24];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[25];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[26];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[27];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[28];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[29];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]=0x00a00055U;
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[30];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]=fw15;
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[31];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]=fw01r;
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[32];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]=fw1;
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[33];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[34];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[35];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[36];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[37];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[38];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[39];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[40];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[41];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[42];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+W[11]+=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[43];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+W[12]+=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[44];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U));
+W[13]+=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[45];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U));
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[46];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[47];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[48];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[49];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[50];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[51];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[52];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[53];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[54];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[55];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[56];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[57];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[58];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+W[11]+=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[59];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+W[12]+=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[60];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U));
+W[13]+=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[61];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=W[14];
+Vals[7]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U));
+Vals[7]+=W[7];
+Vals[7]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[62];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=W[15];
+Vals[5]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+Vals[5]+=W[8];
+Vals[5]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[63];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[5]+=state0;
+
+W[7]=state7;
+W[7]+=Vals[2];
+
+Vals[2]=0xF377ED68U;
+Vals[2]+=Vals[5];
+
+W[3]=state3;
+W[3]+=Vals[0];
+
+Vals[0]=0xa54ff53aU;
+Vals[0]+=Vals[2];
+Vals[2]+=0x08909ae5U;
+
+W[6]=state6;
+W[6]+=Vals[3];
+
+Vals[3]=0x90BB1E3CU;
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=(0x9b05688cU^(Vals[0]&0xca0b3af3U));
+
+Vals[7]+=state1;
+Vals[3]+=Vals[7];
+
+W[2]=state2;
+W[2]+=Vals[6];
+
+Vals[6]=0x3c6ef372U;
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma2(0xbb67ae85U,Vals[2],0x6a09e667U);
+
+W[5]=state5;
+W[5]+=Vals[4];
+
+Vals[4]=0x50C6645BU;
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],0x510e527fU);
+Vals[4]+=W[2];
+
+W[1]=Vals[7];
+Vals[7]=0xbb67ae85U;
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma2(0x6a09e667U,Vals[3],Vals[2]);
+
+W[4]=state4;
+W[4]+=Vals[1];
+
+Vals[1]=0x3AC42E24U;
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=W[3];
+
+W[0]=Vals[5];
+
+Vals[5]=Vals[1];
+Vals[5]+=0x6a09e667U;
+
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[4];
+Vals[0]+=W[4];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[5];
+Vals[6]+=W[5];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[6];
+Vals[7]+=W[6];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[7];
+Vals[5]+=W[7];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=0x5807AA98U;
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[9];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[10];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[11];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[12];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[13];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[14];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=0xC19BF274U;
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[16];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=0x00a00000U;
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[17];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[18];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[19];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[20];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[21];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=0x00000100U;
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[22];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=0x11002000U;
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[23];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]=0x80000000U;
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[24];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[25];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[26];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[27];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[28];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[29];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]=0x00400022U;
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[30];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]=0x00000100U;
+W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[31];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[32];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[33];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[34];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[35];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[36];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[37];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[38];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[39];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[40];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[41];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[42];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[11]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+W[11]+=W[4];
+W[11]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=W[11];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[43];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[12]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+W[12]+=W[5];
+W[12]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+Vals[0]+=W[12];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[44];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[13]+=(rotr(W[14],7)^rotr(W[14],18)^(W[14]>>3U));
+W[13]+=W[6];
+W[13]+=(rotr(W[11],17)^rotr(W[11],19)^(W[11]>>10U));
+Vals[6]+=W[13];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[45];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[14]+=(rotr(W[15],7)^rotr(W[15],18)^(W[15]>>3U));
+W[14]+=W[7];
+W[14]+=(rotr(W[12],17)^rotr(W[12],19)^(W[12]>>10U));
+Vals[7]+=W[14];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[46];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[15]+=(rotr(W[0],7)^rotr(W[0],18)^(W[0]>>3U));
+W[15]+=W[8];
+W[15]+=(rotr(W[13],17)^rotr(W[13],19)^(W[13]>>10U));
+Vals[5]+=W[15];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[47];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[0]+=(rotr(W[1],7)^rotr(W[1],18)^(W[1]>>3U));
+W[0]+=W[9];
+W[0]+=(rotr(W[14],17)^rotr(W[14],19)^(W[14]>>10U));
+Vals[2]+=W[0];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[48];
+Vals[0]+=Vals[2];
+Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+
+W[1]+=(rotr(W[2],7)^rotr(W[2],18)^(W[2]>>3U));
+W[1]+=W[10];
+W[1]+=(rotr(W[15],17)^rotr(W[15],19)^(W[15]>>10U));
+Vals[3]+=W[1];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[49];
+Vals[6]+=Vals[3];
+Vals[3]+=(rotr(Vals[2],2)^rotr(Vals[2],13)^rotr(Vals[2],22));
+Vals[3]+=Ma(Vals[7],Vals[2],Vals[5]);
+
+W[2]+=(rotr(W[3],7)^rotr(W[3],18)^(W[3]>>3U));
+W[2]+=W[11];
+W[2]+=(rotr(W[0],17)^rotr(W[0],19)^(W[0]>>10U));
+Vals[4]+=W[2];
+Vals[4]+=(rotr(Vals[6],6)^rotr(Vals[6],11)^rotr(Vals[6],25));
+Vals[4]+=ch(Vals[6],Vals[0],Vals[1]);
+Vals[4]+=K[50];
+Vals[7]+=Vals[4];
+Vals[4]+=(rotr(Vals[3],2)^rotr(Vals[3],13)^rotr(Vals[3],22));
+Vals[4]+=Ma(Vals[5],Vals[3],Vals[2]);
+
+W[3]+=(rotr(W[4],7)^rotr(W[4],18)^(W[4]>>3U));
+W[3]+=W[12];
+W[3]+=(rotr(W[1],17)^rotr(W[1],19)^(W[1]>>10U));
+Vals[1]+=W[3];
+Vals[1]+=(rotr(Vals[7],6)^rotr(Vals[7],11)^rotr(Vals[7],25));
+Vals[1]+=ch(Vals[7],Vals[6],Vals[0]);
+Vals[1]+=K[51];
+Vals[5]+=Vals[1];
+Vals[1]+=(rotr(Vals[4],2)^rotr(Vals[4],13)^rotr(Vals[4],22));
+Vals[1]+=Ma(Vals[2],Vals[4],Vals[3]);
+
+W[4]+=(rotr(W[5],7)^rotr(W[5],18)^(W[5]>>3U));
+W[4]+=W[13];
+W[4]+=(rotr(W[2],17)^rotr(W[2],19)^(W[2]>>10U));
+Vals[0]+=W[4];
+Vals[0]+=(rotr(Vals[5],6)^rotr(Vals[5],11)^rotr(Vals[5],25));
+Vals[0]+=ch(Vals[5],Vals[7],Vals[6]);
+Vals[0]+=K[52];
+Vals[2]+=Vals[0];
+Vals[0]+=(rotr(Vals[1],2)^rotr(Vals[1],13)^rotr(Vals[1],22));
+Vals[0]+=Ma(Vals[3],Vals[1],Vals[4]);
+
+W[5]+=(rotr(W[6],7)^rotr(W[6],18)^(W[6]>>3U));
+W[5]+=W[14];
+W[5]+=(rotr(W[3],17)^rotr(W[3],19)^(W[3]>>10U));
+Vals[6]+=W[5];
+Vals[6]+=(rotr(Vals[2],6)^rotr(Vals[2],11)^rotr(Vals[2],25));
+Vals[6]+=ch(Vals[2],Vals[5],Vals[7]);
+Vals[6]+=K[53];
+Vals[3]+=Vals[6];
+Vals[6]+=(rotr(Vals[0],2)^rotr(Vals[0],13)^rotr(Vals[0],22));
+Vals[6]+=Ma(Vals[4],Vals[0],Vals[1]);
+
+W[6]+=(rotr(W[7],7)^rotr(W[7],18)^(W[7]>>3U));
+W[6]+=W[15];
+W[6]+=(rotr(W[4],17)^rotr(W[4],19)^(W[4]>>10U));
+Vals[7]+=W[6];
+Vals[7]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[7]+=ch(Vals[3],Vals[2],Vals[5]);
+Vals[7]+=K[54];
+Vals[4]+=Vals[7];
+Vals[7]+=(rotr(Vals[6],2)^rotr(Vals[6],13)^rotr(Vals[6],22));
+Vals[7]+=Ma(Vals[1],Vals[6],Vals[0]);
+
+W[7]+=(rotr(W[8],7)^rotr(W[8],18)^(W[8]>>3U));
+W[7]+=W[0];
+W[7]+=(rotr(W[5],17)^rotr(W[5],19)^(W[5]>>10U));
+Vals[5]+=W[7];
+Vals[5]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[5]+=ch(Vals[4],Vals[3],Vals[2]);
+Vals[5]+=K[55];
+Vals[1]+=Vals[5];
+Vals[5]+=(rotr(Vals[7],2)^rotr(Vals[7],13)^rotr(Vals[7],22));
+Vals[5]+=Ma(Vals[0],Vals[7],Vals[6]);
+
+W[8]+=(rotr(W[9],7)^rotr(W[9],18)^(W[9]>>3U));
+W[8]+=W[1];
+W[8]+=(rotr(W[6],17)^rotr(W[6],19)^(W[6]>>10U));
+Vals[2]+=W[8];
+Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+Vals[2]+=K[56];
+Vals[0]+=Vals[2];
+
+W[9]+=(rotr(W[10],7)^rotr(W[10],18)^(W[10]>>3U));
+W[9]+=W[2];
+W[9]+=(rotr(W[7],17)^rotr(W[7],19)^(W[7]>>10U));
+Vals[3]+=W[9];
+Vals[3]+=(rotr(Vals[0],6)^rotr(Vals[0],11)^rotr(Vals[0],25));
+Vals[3]+=ch(Vals[0],Vals[1],Vals[4]);
+Vals[3]+=K[57];
+Vals[3]+=Vals[6];
+
+W[10]+=(rotr(W[11],7)^rotr(W[11],18)^(W[11]>>3U));
+W[10]+=W[3];
+W[10]+=(rotr(W[8],17)^rotr(W[8],19)^(W[8]>>10U));
+Vals[4]+=W[10];
+Vals[4]+=(rotr(Vals[3],6)^rotr(Vals[3],11)^rotr(Vals[3],25));
+Vals[4]+=ch(Vals[3],Vals[0],Vals[1]);
+Vals[4]+=K[58];
+Vals[4]+=Vals[7];
+Vals[1]+=(rotr(Vals[4],6)^rotr(Vals[4],11)^rotr(Vals[4],25));
+Vals[1]+=ch(Vals[4],Vals[3],Vals[0]);
+Vals[1]+=W[11];
+Vals[1]+=(rotr(W[12],7)^rotr(W[12],18)^(W[12]>>3U));
+Vals[1]+=W[4];
+Vals[1]+=(rotr(W[9],17)^rotr(W[9],19)^(W[9]>>10U));
+Vals[1]+=K[59];
+Vals[1]+=Vals[5];
+
+#define FOUND (0x80)
+#define NFLAG (0x7F)
+
+#if defined(VECTORS2) || defined(VECTORS4)
+	Vals[2]+=Ma(Vals[6],Vals[5],Vals[7]);
+	Vals[2]+=(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22));
+	Vals[2]+=W[12];
+	Vals[2]+=(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U));
+	Vals[2]+=W[5];
+	Vals[2]+=(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U));
+	Vals[2]+=Vals[0];
+	Vals[2]+=(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25));
+	Vals[2]+=ch(Vals[1],Vals[4],Vals[3]);
+
+	if (any(Vals[2] == 0x136032edU)) {
+		if (Vals[2].x == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.x] = nonce.x;
+		if (Vals[2].y == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.y] = nonce.y;
+#if defined(VECTORS4)
+		if (Vals[2].z == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.z] = nonce.z;
+		if (Vals[2].w == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce.w] = nonce.w;
+#endif
+	}
+#else
+	if ((Vals[2]+
+		Ma(Vals[6],Vals[5],Vals[7])+
+		(rotr(Vals[5],2)^rotr(Vals[5],13)^rotr(Vals[5],22))+
+		W[12]+
+		(rotr(W[13],7)^rotr(W[13],18)^(W[13]>>3U))+
+		W[5]+
+		(rotr(W[10],17)^rotr(W[10],19)^(W[10]>>10U))+
+		Vals[0]+
+		(rotr(Vals[1],6)^rotr(Vals[1],11)^rotr(Vals[1],25))+
+		ch(Vals[1],Vals[4],Vals[3])) == 0x136032edU)
+			output[FOUND] = output[NFLAG & nonce] =  nonce;
+#endif
+}

+ 11 - 11
sha2.c

@@ -11,7 +11,7 @@
  *
  *  This program is free software; you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
- *  the Free Software Foundation; either version 2 of the License, or
+ *  the Free Software Foundation; either version 3 of the License, or
  *  (at your option) any later version.
  *
  *  This program is distributed in the hope that it will be useful,
@@ -40,10 +40,10 @@
 #ifndef GET_ULONG_BE
 #define GET_ULONG_BE(n,b,i)                             \
 {                                                       \
-    (n) = ( (unsigned long) (b)[(i)    ] << 24 )        \
-        | ( (unsigned long) (b)[(i) + 1] << 16 )        \
-        | ( (unsigned long) (b)[(i) + 2] <<  8 )        \
-        | ( (unsigned long) (b)[(i) + 3]       );       \
+    (n) = ( (uint32_t) (b)[(i)    ] << 24 )        \
+        | ( (uint32_t) (b)[(i) + 1] << 16 )        \
+        | ( (uint32_t) (b)[(i) + 2] <<  8 )        \
+        | ( (uint32_t) (b)[(i) + 3]       );       \
 }
 #endif
 
@@ -95,8 +95,8 @@ void sha2_starts( sha2_context *ctx, int is224 )
 
 static void sha2_process( sha2_context *ctx, const unsigned char data[64] )
 {
-    unsigned long temp1, temp2, W[64];
-    unsigned long A, B, C, D, E, F, G, H;
+    uint32_t temp1, temp2, W[64];
+    uint32_t A, B, C, D, E, F, G, H;
 
     GET_ULONG_BE( W[ 0], data,  0 );
     GET_ULONG_BE( W[ 1], data,  4 );
@@ -230,7 +230,7 @@ static void sha2_process( sha2_context *ctx, const unsigned char data[64] )
 void sha2_update( sha2_context *ctx, const unsigned char *input, int ilen )
 {
     int fill;
-    unsigned long left;
+    uint32_t left;
 
     if( ilen <= 0 )
         return;
@@ -241,7 +241,7 @@ void sha2_update( sha2_context *ctx, const unsigned char *input, int ilen )
     ctx->total[0] += ilen;
     ctx->total[0] &= 0xFFFFFFFF;
 
-    if( ctx->total[0] < (unsigned long) ilen )
+    if( ctx->total[0] < (uint32_t) ilen )
         ctx->total[1]++;
 
     if( left && ilen >= fill )
@@ -281,8 +281,8 @@ static const unsigned char sha2_padding[64] =
  */
 void sha2_finish( sha2_context *ctx, unsigned char output[32] )
 {
-    unsigned long last, padn;
-    unsigned long high, low;
+    uint32_t last, padn;
+    uint32_t high, low;
     unsigned char msglen[8];
 
     high = ( ctx->total[0] >> 29 )

+ 4 - 2
sha2.h

@@ -26,13 +26,15 @@
 #ifndef POLARSSL_SHA2_H
 #define POLARSSL_SHA2_H
 
+#include <stdint.h>
+
 /**
  * \brief          SHA-256 context structure
  */
 typedef struct
 {
-    unsigned long total[2];     /*!< number of bytes processed  */
-    unsigned long state[8];     /*!< intermediate digest state  */
+    uint32_t total[2];     /*!< number of bytes processed  */
+    uint32_t state[8];     /*!< intermediate digest state  */
     unsigned char buffer[64];   /*!< data block being processed */
 
     unsigned char ipad[64];     /*!< HMAC: inner padding        */

+ 1 - 2
sha256_4way.c

@@ -4,8 +4,7 @@
 
 // tcatm's 4-way 128-bit SSE2 SHA-256
 
-#include "config.h"
-#include "miner.h"
+#include "driver-cpu.h"
 
 #ifdef WANT_SSE2_4WAY
 

+ 1 - 2
sha256_altivec_4way.c

@@ -9,8 +9,7 @@
 //
 
 
-//#include "config.h"
-#include "miner.h"
+#include "driver-cpu.h"
 
 #ifdef WANT_ALTIVEC_4WAY
 

+ 1 - 3
sha256_sse2_amd64.c

@@ -9,9 +9,7 @@
  *
  */
 
-#include "config.h"
-
-#include "miner.h"
+#include "driver-cpu.h"
 
 #ifdef WANT_X8664_SSE2
 

+ 1 - 3
sha256_sse2_i386.c

@@ -9,9 +9,7 @@
  *
  */
 
-#include "config.h"
-
-#include "miner.h"
+#include "driver-cpu.h"
 
 #ifdef WANT_X8632_SSE2
 

+ 1 - 3
sha256_sse4_amd64.c

@@ -9,9 +9,7 @@
  *
  */
 
-#include "config.h"
-
-#include "miner.h"
+#include "driver-cpu.h"
 
 #ifdef WANT_X8664_SSE4
 

+ 1 - 1
sha256_via.c

@@ -1,5 +1,5 @@
 
-#include "config.h"
+#include "driver-cpu.h"
 
 #include <stdint.h>
 #include <stdlib.h>

+ 85 - 155
util.c

@@ -1,11 +1,10 @@
-
 /*
- * Copyright 2011 Con Kolivas
+ * Copyright 2011-2012 Con Kolivas
  * Copyright 2010 Jeff Garzik
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the Free
- * Software Foundation; either version 2 of the License, or (at your option)
+ * Software Foundation; either version 3 of the License, or (at your option)
  * any later version.  See COPYING for more details.
  */
 
@@ -20,7 +19,6 @@
 #include <jansson.h>
 #include <curl/curl.h>
 #include <time.h>
-#include <curses.h>
 #include <errno.h>
 #include <unistd.h>
 #include <sys/types.h>
@@ -32,8 +30,10 @@
 # include <winsock2.h>
 # include <mstcpip.h>
 #endif
+
 #include "miner.h"
 #include "elist.h"
+#include "compat.h"
 
 #if JANSSON_MAJOR_VERSION >= 2
 #define JSON_LOADS(str, err_ptr) json_loads((str), 0, (err_ptr))
@@ -57,6 +57,7 @@ struct upload_buffer {
 struct header_info {
 	char		*lp_path;
 	bool		has_rolltime;
+	char		*reason;
 };
 
 struct tq_ent {
@@ -64,67 +65,6 @@ struct tq_ent {
 	struct list_head	q_node;
 };
 
-void vapplog(int prio, const char *fmt, va_list ap)
-{
-	extern bool use_curses;
-
-#ifdef HAVE_SYSLOG_H
-	if (use_syslog) {
-		vsyslog(prio, fmt, ap);
-	}
-#else
-	if (0) {}
-#endif
-	else if (opt_log_output || prio <= LOG_NOTICE) {
-		char *f;
-		int len;
-		struct timeval tv = { };
-		struct tm tm;
-
-		gettimeofday(&tv, NULL);
-
-		localtime_r(&tv.tv_sec, &tm);
-
-		len = 40 + strlen(fmt) + 22;
-		f = alloca(len);
-		sprintf(f, "[%d-%02d-%02d %02d:%02d:%02d] %s\n",
-			tm.tm_year + 1900,
-			tm.tm_mon + 1,
-			tm.tm_mday,
-			tm.tm_hour,
-			tm.tm_min,
-			tm.tm_sec,
-			fmt);
-		/* Only output to stderr if it's not going to the screen as well */
-		if (!isatty(fileno((FILE *)stderr))) {
-			va_list apc;
-
-			va_copy(apc, ap);
-			vfprintf(stderr, f, apc);	/* atomic write to stderr */
-			fflush(stderr);
-		}
-
-		if (use_curses)
-			log_curses(prio, f, ap);
-		else {
-			int len = strlen(f);
-
-			strcpy(f + len - 1, "                    \n");
-
-			log_curses(prio, f, ap);
-		}
-	}
-}
-
-void applog(int prio, const char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	vapplog(prio, fmt, ap);
-	va_end(ap);
-}
-
 static void databuf_free(struct data_buffer *db)
 {
 	if (!db)
@@ -163,7 +103,7 @@ static size_t upload_data_cb(void *ptr, size_t size, size_t nmemb,
 			     void *user_data)
 {
 	struct upload_buffer *ub = user_data;
-	int len = size * nmemb;
+	unsigned int len = size * nmemb;
 
 	if (len > ub->len)
 		len = ub->len;
@@ -218,11 +158,9 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
 
 	if (!strcasecmp("X-Roll-Ntime", key)) {
 		if (!strncasecmp("N", val, 1)) {
-			if (opt_debug)
-				applog(LOG_DEBUG, "X-Roll-Ntime: N found");
+			applog(LOG_DEBUG, "X-Roll-Ntime: N found");
 		} else {
-			if (opt_debug)
-				applog(LOG_DEBUG, "X-Roll-Ntime found");
+			applog(LOG_DEBUG, "X-Roll-Ntime found");
 			hi->has_rolltime = true;
 		}
 	}
@@ -232,6 +170,11 @@ static size_t resp_hdr_cb(void *ptr, size_t size, size_t nmemb, void *user_data)
 		val = NULL;
 	}
 
+	if (!strcasecmp("X-Reject-Reason", key)) {
+		hi->reason = val;	/* steal memory reference */
+		val = NULL;
+	}
+
 out:
 	free(key);
 	free(val);
@@ -239,14 +182,15 @@ out:
 }
 
 #ifdef CURL_HAS_SOCKOPT
-int json_rpc_call_sockopt_cb(void *userdata, curl_socket_t fd, curlsocktype purpose)
+int json_rpc_call_sockopt_cb(void __maybe_unused *userdata, curl_socket_t fd,
+			     curlsocktype __maybe_unused purpose)
 {
-	int keepalive = 1;
-	int tcp_keepcnt = 5;
 	int tcp_keepidle = 120;
 	int tcp_keepintvl = 120;
 
 #ifndef WIN32
+	int keepalive = 1;
+	int tcp_keepcnt = 5;
 
 	if (unlikely(setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &keepalive, sizeof(keepalive))))
 		return 1;
@@ -305,27 +249,26 @@ static void set_nettime(void)
 json_t *json_rpc_call(CURL *curl, const char *url,
 		      const char *userpass, const char *rpc_req,
 		      bool probe, bool longpoll, bool *rolltime,
-		      struct pool *pool)
+		      struct pool *pool, bool share)
 {
 	json_t *val, *err_val, *res_val;
 	int rc;
-	struct data_buffer all_data = { };
+	struct data_buffer all_data = {NULL, 0};
 	struct upload_buffer upload_data;
-	json_error_t err = { };
+	json_error_t err;
 	struct curl_slist *headers = NULL;
 	char len_hdr[64], user_agent_hdr[128];
 	char curl_err_str[CURL_ERROR_SIZE];
 	long timeout = longpoll ? (60 * 60) : 60;
-	struct header_info hi = { };
+	struct header_info hi = {NULL, false, NULL};
 	bool probing = false;
 
+	memset(&err, 0, sizeof(err));
+
 	/* it is assumed that 'curl' is freshly [re]initialized at this pt */
 
-	if (probe) {
+	if (probe)
 		probing = !pool->probed;
-		/* Probe for only 15 seconds */
-		timeout = 15;
-	}
 	curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeout);
 
 #if 0 /* Disable curl debugging since it spews to stderr */
@@ -336,7 +279,10 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	curl_easy_setopt(curl, CURLOPT_URL, url);
 	curl_easy_setopt(curl, CURLOPT_ENCODING, "");
 	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
-	if (!opt_delaynet)
+
+	/* Shares are staggered already and delays in submission can be costly
+	 * so do not delay them */
+	if (!opt_delaynet || share)
 		curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
 	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
 	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
@@ -373,7 +319,7 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	headers = curl_slist_append(headers,
 		"Content-type: application/json");
 	headers = curl_slist_append(headers,
-		"X-Mining-Extensions: midstate rollntime");
+		"X-Mining-Extensions: longpoll midstate rollntime submitold");
 	headers = curl_slist_append(headers, len_hdr);
 	headers = curl_slist_append(headers, user_agent_hdr);
 	headers = curl_slist_append(headers, "Expect:"); /* disable Expect hdr*/
@@ -381,43 +327,54 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 	curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
 
 	if (opt_delaynet) {
-		long long now_msecs, last_msecs;
-		struct timeval now, last;
-
-		gettimeofday(&now, NULL);
-		last_nettime(&last);
-		now_msecs = (long long)now.tv_sec * 1000;
-		now_msecs += now.tv_usec / 1000;
-		last_msecs = (long long)last.tv_sec * 1000;
-		last_msecs += last.tv_usec / 1000;
-		if (now_msecs > last_msecs && now_msecs - last_msecs < 250) {
-			struct timespec rgtp;
-
-			rgtp.tv_sec = 0;
-			rgtp.tv_nsec = (250 - (now_msecs - last_msecs)) * 1000000;
-			nanosleep(&rgtp, NULL);
+		/* Don't delay share submission, but still track the nettime */
+		if (!share) {
+			long long now_msecs, last_msecs;
+			struct timeval now, last;
+
+			gettimeofday(&now, NULL);
+			last_nettime(&last);
+			now_msecs = (long long)now.tv_sec * 1000;
+			now_msecs += now.tv_usec / 1000;
+			last_msecs = (long long)last.tv_sec * 1000;
+			last_msecs += last.tv_usec / 1000;
+			if (now_msecs > last_msecs && now_msecs - last_msecs < 250) {
+				struct timespec rgtp;
+
+				rgtp.tv_sec = 0;
+				rgtp.tv_nsec = (250 - (now_msecs - last_msecs)) * 1000000;
+				nanosleep(&rgtp, NULL);
+			}
 		}
 		set_nettime();
 	}
+
 	rc = curl_easy_perform(curl);
+	if (longpoll)
+		pool_tclear(pool, &pool->lp_sent);
 	if (rc) {
 		applog(LOG_INFO, "HTTP request failed: %s", curl_err_str);
 		goto err_out;
 	}
 
 	if (!all_data.buf) {
-		if (opt_debug)
-			applog(LOG_DEBUG, "Empty data received in json_rpc_call.");
+		applog(LOG_DEBUG, "Empty data received in json_rpc_call.");
 		goto err_out;
 	}
 
 	if (probing) {
 		pool->probed = true;
 		/* If X-Long-Polling was found, activate long polling */
-		if (hi.lp_path)
+		if (hi.lp_path) {
+			if (pool->hdr_path != NULL)
+				free(pool->hdr_path);
 			pool->hdr_path = hi.lp_path;
-		else
+		} else {
 			pool->hdr_path = NULL;
+		}
+	} else if (hi.lp_path) {
+		free(hi.lp_path);
+		hi.lp_path = NULL;
 	}
 
 	*rolltime = hi.has_rolltime;
@@ -460,6 +417,11 @@ json_t *json_rpc_call(CURL *curl, const char *url,
 		goto err_out;
 	}
 
+	if (hi.reason) {
+		json_object_set_new(val, "reject-reason", json_string(hi.reason));
+		free(hi.reason);
+		hi.reason = NULL;
+	}
 	successful_connect = true;
 	databuf_free(&all_data);
 	curl_slist_free_all(headers);
@@ -478,8 +440,9 @@ err_out:
 
 char *bin2hex(const unsigned char *p, size_t len)
 {
-	int i;
+	unsigned int i;
 	char *s = malloc((len * 2) + 1);
+
 	if (!s)
 		return NULL;
 
@@ -714,67 +677,34 @@ int thr_info_create(struct thr_info *thr, pthread_attr_t *attr, void *(*start) (
 	return ret;
 }
 
-void thr_info_cancel(struct thr_info *thr)
+void thr_info_freeze(struct thr_info *thr)
 {
+	struct tq_ent *ent, *iter;
+	struct thread_q *tq;
+
 	if (!thr)
 		return;
 
-	if (thr->q)
-		tq_freeze(thr->q);
+	tq = thr->q;
+	if (!tq)
+		return;
 
-	if (PTH(thr) != 0L) {
-		pthread_cancel(thr->pth);
-		PTH(thr) = 0L;
+	mutex_lock(&tq->mutex);
+	tq->frozen = true;
+	list_for_each_entry_safe(ent, iter, &tq->q, q_node) {
+		list_del(&ent->q_node);
+		free(ent);
 	}
+	mutex_unlock(&tq->mutex);
 }
 
-bool get_dondata(char **url, char **userpass)
+void thr_info_cancel(struct thr_info *thr)
 {
-	struct data_buffer all_data = { };
-	char curl_err_str[CURL_ERROR_SIZE];
-	CURL *curl = curl_easy_init();
-	int rc;
-
-	curl_easy_setopt(curl, CURLOPT_NOSIGNAL, 1);
-	curl_easy_setopt(curl, CURLOPT_ENCODING, "");
-	curl_easy_setopt(curl, CURLOPT_FAILONERROR, 1);
-	if (!opt_delaynet)
-		curl_easy_setopt(curl, CURLOPT_TCP_NODELAY, 1);
-	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, all_data_cb);
-	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &all_data);
-	curl_easy_setopt(curl, CURLOPT_ERRORBUFFER, curl_err_str);
-	curl_easy_setopt(curl, CURLOPT_URL, "http://vds.kolivas.org/url");
-	rc = curl_easy_perform(curl);
-	if (rc) {
-		applog(LOG_INFO, "HTTP request failed: %s", curl_err_str);
-		goto err_out;
-	}
-	if (!all_data.buf)
-		goto err_out;
-	*url = strtok(all_data.buf, "\n");
-	all_data.buf = NULL;
-	databuf_free(&all_data);
+	if (!thr)
+		return;
 
-	curl_easy_setopt(curl, CURLOPT_URL, "http://vds.kolivas.org/userpass");
-	rc = curl_easy_perform(curl);
-	if (rc) {
-		applog(LOG_INFO, "HTTP request failed: %s", curl_err_str);
-		goto err_out;
+	if (PTH(thr) != 0L) {
+		pthread_cancel(thr->pth);
+		PTH(thr) = 0L;
 	}
-	if (!all_data.buf)
-		goto err_out;
-	*userpass = strtok(all_data.buf, "\n");
-	all_data.buf = NULL;
-	databuf_free(&all_data);
-
-	applog(LOG_INFO, "Donation URL: %s Userpass: %s", *url, *userpass);
-	curl_easy_cleanup(curl);
-	return true;
-
-err_out:
-	databuf_free(&all_data);
-	*url = NULL;
-	*userpass = NULL;
-	curl_easy_cleanup(curl);
-	return false;
 }

+ 224 - 0
windows-build.txt

@@ -0,0 +1,224 @@
+######################################################################################
+#                                                                                    #
+#          Native WIN32 setup and build instructions (on mingw32/Windows):           #
+#                                                                                    #
+######################################################################################
+
+**************************************************************************************
+* Introduction                                                                       *
+**************************************************************************************
+The following instructions have been tested on both Windows 7 and Windows XP.
+Most of what is described below (copying files, downloading files, etc.) can be done
+directly in the MinGW MSYS shell; these instructions do not do so because package
+versions and links change over time. The best way is to use your browser, go to the
+links directly, and see for yourself which versions you want to install.
+
+If you think that this documentation was helpful and you wish to donate, you can 
+do so at the following address: 12KaKtrK52iQjPdtsJq7fJ7smC32tXWbWr
+
+**************************************************************************************
+* A tip that might help you along the way                                            *
+**************************************************************************************
+Enable "QuickEdit Mode" in your Command Prompt Window or MinGW Command Prompt
+Window (No need to go into the context menu to choose edit-mark/copy/paste):
+Right-click on the title bar and click Properties. Under the Options tab, check
+the box for "QuickEdit Mode". Alternately, if you want this change to be
+permanent on all of your Command Prompt Windows, you can click Defaults instead
+of Properties as described above. Now you can drag and select text you want to
+copy, right-click to copy the text to the clipboard and right-click once again to
+paste it at the desired location. You could for example, copy some text from this
+document to the clipboard and right click in your Command Prompt Window to paste
+what you copied.
+
+**************************************************************************************
+* Install mingw32                                                                    *
+**************************************************************************************
+Go to this url ==> http://www.mingw.org/wiki/Getting_Started
+Click the link that says "Download and run the latest mingw-get-inst version."
+Download and run the latest file. Install MinGW in the default directory.
+(I downloaded the one labeled "mingw-get-inst-20111118" - note that this could 
+be a different version later.)
+Make sure to check the option for "Download latest repository catalogs".
+I just selected all the check boxes (excluding "Fortran Compiler") so that everything
+was installed.
+
+**************************************************************************************
+* Create mstcpip.h                                                                   *
+**************************************************************************************
+Open notepad and copy the following into it. Save it as "\MinGW\include\mstcpip.h".
+Make sure it does not have the ".txt" extension (If it does then rename it).
+
+struct tcp_keepalive
+{
+    u_long onoff;
+    u_long keepalivetime;
+    u_long keepaliveinterval;
+};
+
+#ifndef USE_WS_PREFIX
+
+#define SIO_KEEPALIVE_VALS    _WSAIOW(IOC_VENDOR, 4)
+
+#else
+
+#define WS_SIO_KEEPALIVE_VALS    _WSAIOW(WS_IOC_VENDOR, 4)
+
+#endif
+
+**************************************************************************************
+* Run the MSYS shell for the first time to create your user directory                *
+************************************************************************************** 
+(Start Icon/keyboard key ==> All Programs ==> MinGW ==> MinGW Shell).
+This will create your user directory for you.
+
+**************************************************************************************
+* Install libpdcurses                                                                *
+**************************************************************************************
+Type the lines below to install libpdcurses.
+mingw-get install mingw32-libpdcurses
+mingw-get install mingw32-pdcurses
+Ctrl-D or typing "logout" and pressing the enter key should get you out of the
+window.
+
+**************************************************************************************
+* Copy CGMiner source to your MSYS working directory                                 *
+**************************************************************************************
+Copy CGMiner source code directory into: 
+\MinGW\msys\1.0\home\(folder with your user name)
+
+**************************************************************************************
+* Install AMD APP SDK, latest version (only if you want GPU mining)                  *
+**************************************************************************************
+Note: You do not need to install the AMD APP SDK if you are only using Nvidia GPU's
+Go to this url for the latest AMD APP SDK: 
+ http://developer.amd.com/sdks/AMDAPPSDK/downloads/Pages/default.aspx
+Go to this url for legacy AMD APP SDK's:
+ http://developer.amd.com/sdks/AMDAPPSDK/downloads/pages/AMDAPPSDKDownloadArchive.aspx
+Download and install whichever version you like best.
+Copy the folders in \Program Files (x86)\AMD APP\include to \MinGW\include 
+Copy \Program Files (x86)\AMD APP\lib\x86\libOpenCL.a to \MinGW\lib
+Note: If you are on a 32 bit version of windows "Program Files (x86)" will be 
+"Program Files".
+Note2: If you update your APP SDK later you might want to recopy the above files.
+
+**************************************************************************************
+* Install AMD ADL SDK, latest version (only if you want GPU monitoring)              *
+**************************************************************************************
+Note: You do not need to install the AMD ADL SDK if you are only using Nvidia GPU's	
+Go to this url ==> http://developer.amd.com/sdks/ADLSDK/Pages/default.aspx
+Download and unzip the file you downloaded.
+Pull adl_defines.h, adl_sdk.h, and adl_structures.h out of the include folder 
+Put those files into the ADL_SDK folder in your source tree as shown below.
+\MinGW\msys\1.0\home\(folder with your user name)\cgminer-x.x.x\ADL_SDK
+
+**************************************************************************************
+* Install GTK-WIN, required for Pkg-config in the next step                          *
+**************************************************************************************
+Go to this url ==> http://sourceforge.net/projects/gtk-win/ 
+Download the file.
+After you have downloaded the file, double-click/run it; this will install GTK+.
+I chose all the selection boxes when I installed.
+Copy libglib-2.0-0.dll and intl.dll from \Program Files (x86)\gtk2-runtime\bin to 
+\MinGW\bin
+Note: If you are on a 32 bit version of windows "Program Files (x86)" will be 
+"Program Files".
+
+**************************************************************************************
+* Install pkg-config                                                                 *
+**************************************************************************************
+Go to this url ==> http://www.gtk.org/download/win32.php
+Scroll down to where it shows pkg-cfg.
+Download the file from the tool link. Extract "pkg-config.exe" from bin and place in
+your  \MinGW\bin directory.
+Download the file from the "Dev" link. Extract "pkg.m4" from share\aclocal and place
+in your \MinGW\share\aclocal directory.
+		
+**************************************************************************************
+* Install libcurl                                                                    *
+**************************************************************************************
+Go to this url ==> http://curl.haxx.se/download.html#Win32
+At the section where it says "Win32 - Generic", Click on the link that indicates
+Win32 2000.XP 7.24.0 libcurl SSL and download it.
+The one I downloaded may not be current for you. Choose the latest.
+Extract the files that are in the zip (bin, include, and lib) to their respective
+locations in MinGW (\MinGW\bin, \MinGW\include, and \MinGW\lib).
+Edit the file \MinGW\lib\pkgconfig\libcurl.pc and change "-lcurl" to 
+"-lcurl -lcurldll".
+Ref. http://old.nabble.com/gcc-working-with-libcurl-td20506927.html
+
+**************************************************************************************
+* Build cgminer.exe                                                                  *
+**************************************************************************************
+Run the MinGW MSYS shell 
+(Start Icon/keyboard key ==> All Programs ==> MinGW ==> MinGW Shell).	
+Change the working directory to your CGMiner project folder.
+Example: cd cgminer-2.1.2 [Enter Key]. If you are unsure of the name, type "ls -la".
+Another way is to type "cd cg" and then press the tab key; it will auto-fill.
+Type the lines below one at a time. Look for problems after each one before going on
+to the next.
+
+      adl.sh (optional - see below)
+      autoreconf -fvi
+      CFLAGS="-O2 -msse2" ./configure (additional config options, see below)
+      make
+
+**************************************************************************************
+* Copy files to a build directory/folder                                             *
+**************************************************************************************
+Make a directory and copy the following files into it. This will be your CGMiner
+Folder that you use for mining. Remember the .cl filenames could change on later
+releases. If you installed a different version of libcurl then some of those dll's
+may be different as well.
+  cgminer.exe     from \MinGW\msys\1.0\home\(username)\cgminer-x.x.x 
+  *.cl            from \MinGW\msys\1.0\home\(username)\cgminer-x.x.x
+  README          from \MinGW\msys\1.0\home\(username)\cgminer-x.x.x
+  libcurl.dll     from \MinGW\bin
+  libeay32.dll    from \MinGW\bin
+  libidn-11.dll   from \MinGW\bin
+  libssl32.dll    from \MinGW\bin
+  libpdcurses.dll from \MinGW\bin
+  pthreadGC2.dll  from \MinGW\bin
+  
+**************************************************************************************
+* Optional - Install Git into MinGW/MSYS                                             *
+**************************************************************************************
+Go to this url ==> http://code.google.com/p/msysgit/
+Click on the Downloads tab.
+Download the latest "Portable" git archive.
+Extract the git*.exe files from the bin folder and put them into \MinGW\bin.
+Extract the share\git-core folder and place it into \MinGW\share.
+To test if it is working, open a MinGW shell and type the following:
+  git config --global core.autocrlf false (note: one time run only)
+  git clone git://github.com/ckolivas/cgminer.git
+  
+If you just want to update the source after you have already cloned, type:
+  git pull git://github.com/ckolivas/cgminer.git
+
+Now you can get the latest source directly from github.
+
+**************************************************************************************
+* Optional - Make a .sh file to automate copying over ADL files                      *
+**************************************************************************************
+Make a folder/directory in your home folder and name it ADL_SDK.
+ (ref:  \MinGW\msys\1.0\home\(folder with your user name)\ADL_SDK)
+Copy the ADL .h files into that folder/directory.
+Open your favorite text editor and type the following into it.
+ cp -av ../ADL_SDK/*.h ADL_SDK
+Save the file as "adl.sh" and then place the file into "\MinGW\msys\1.0\bin".
+From now on, when your current working directory is the cgminer source directory,
+you can simply type "adl.sh" and it will place the ADL header files into place
+for you. Make sure you never remove the ADL_SDK folder from your home folder.
+
+**************************************************************************************
+* Some ./configure options                                                           *
+**************************************************************************************
+--disable-opencl        Override detection and disable building with opencl
+--disable-adl           Override detection and disable building with adl
+--enable-bitforce       Compile support for BitForce FPGAs(default disabled)
+--enable-icarus         Compile support for Icarus Board(default disabled)
+
+######################################################################################
+#                                                                                    #
+#       Native WIN32 setup and build instructions (on mingw32/Windows) complete      #
+#                                                                                    #
+######################################################################################

+ 7 - 0
x86_32/sha256_xmm.asm

@@ -250,3 +250,10 @@ LAB_RET:
 	pop	edi
 	pop	esi
 	retn	4
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif

+ 7 - 0
x86_64/sha256_sse4_amd64.asm

@@ -256,3 +256,10 @@ LAB_LOOP:
 LAB_RET:
 	pop	rbx
 	ret
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif

+ 7 - 0
x86_64/sha256_xmm_amd64.asm

@@ -320,3 +320,10 @@ sha256_sse2_64_new:
 LAB_RET:
     pop       rbx
     ret
+
+%ifidn __OUTPUT_FORMAT__,elf
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif

Some files were not shown because too many files changed in this diff