Browse Source

Merge branch 'ocl_opt_goffset' into bfgminer

Luke Dashjr 11 years ago
parent
commit
c7ab4f95a6
8 changed files with 129 additions and 54 deletions
  1. 22 0
      driver-opencl.c
  2. 1 0
      driver-opencl.h
  3. 66 38
      ocl.c
  4. 0 1
      ocl.h
  5. 13 5
      opencl/psw.cl
  6. 13 5
      opencl/scrypt.cl
  7. 13 5
      opencl/zuikkis.cl
  8. 1 0
      util.c

+ 22 - 0
driver-opencl.c

@@ -304,6 +304,7 @@ void opencl_early_init()
 		struct opencl_device_data * const data = &dataarray[i];
 		struct opencl_device_data * const data = &dataarray[i];
 		*data = (struct opencl_device_data){
 		*data = (struct opencl_device_data){
 			.dynamic = true,
 			.dynamic = true,
+			.use_goffset = BTS_UNKNOWN,
 			.intensity = intensity_not_set,
 			.intensity = intensity_not_set,
 #ifdef USE_SCRYPT
 #ifdef USE_SCRYPT
 			.lookup_gap = 2,
 			.lookup_gap = 2,
@@ -451,6 +452,19 @@ const char *opencl_init_binary(struct cgpu_info * const proc, const char * const
 	return NULL;
 	return NULL;
 }
 }
 
 
+static
+const char *opencl_init_goffset(struct cgpu_info * const proc, const char * const optname, const char * const newvalue, char * const replybuf, enum bfg_set_device_replytype * const out_success)
+{
+	struct opencl_device_data * const data = proc->device_data;
+	char *end;
+	bool nv = bfg_strtobool(newvalue, &end, 0);
+	if (newvalue[0] && !end[0])
+		data->use_goffset = nv;
+	else
+		return "Invalid boolean value";
+	return NULL;
+}
+
 #ifdef HAVE_ADL
 #ifdef HAVE_ADL
 /* This function allows us to map an adl device to an opencl device for when
 /* This function allows us to map an adl device to an opencl device for when
  * simple enumeration has failed to match them. */
  * simple enumeration has failed to match them. */
@@ -1234,6 +1248,12 @@ cl_int queue_scrypt_kernel(const struct opencl_kernel_info * const kinfo, _clSta
 	unsigned int num = 0;
 	unsigned int num = 0;
 	cl_uint le_target;
 	cl_uint le_target;
 	cl_int status = 0;
 	cl_int status = 0;
+	
+	if (!kinfo->goffset)
+	{
+		cl_uint nonce_base = work->blk.nonce;
+		CL_SET_ARG(nonce_base);
+	}
 
 
 	le_target = *(cl_uint *)(work->target + 28);
 	le_target = *(cl_uint *)(work->target + 28);
 	clState->cldata = work->data;
 	clState->cldata = work->data;
@@ -1888,6 +1908,7 @@ static const struct bfg_set_device_definition opencl_set_device_funcs_probe[] =
 	{"vector", opencl_init_vector},
 	{"vector", opencl_init_vector},
 	{"work_size", opencl_init_worksize},
 	{"work_size", opencl_init_worksize},
 	{"binary", opencl_init_binary},
 	{"binary", opencl_init_binary},
+	{"goffset", opencl_init_goffset},
 #ifdef HAVE_ADL
 #ifdef HAVE_ADL
 	{"adl_mapping", opencl_init_gpu_map},
 	{"adl_mapping", opencl_init_gpu_map},
 	{"clock", opencl_init_gpu_engine},
 	{"clock", opencl_init_gpu_engine},
@@ -1913,6 +1934,7 @@ static const struct bfg_set_device_definition opencl_set_device_funcs[] = {
 	{"vector", opencl_cannot_set, ""},
 	{"vector", opencl_cannot_set, ""},
 	{"work_size", opencl_cannot_set, ""},
 	{"work_size", opencl_cannot_set, ""},
 	{"binary", opencl_cannot_set, ""},
 	{"binary", opencl_cannot_set, ""},
+	{"goffset", opencl_cannot_set, ""},
 #ifdef HAVE_ADL
 #ifdef HAVE_ADL
 	{"adl_mapping", opencl_cannot_set, "Map to ADL device"},
 	{"adl_mapping", opencl_cannot_set, "Map to ADL device"},
 	{"clock", opencl_set_gpu_engine, "GPU engine clock"},
 	{"clock", opencl_set_gpu_engine, "GPU engine clock"},

+ 1 - 0
driver-opencl.h

@@ -46,6 +46,7 @@ struct opencl_device_data {
 	char *_init_intensity;
 	char *_init_intensity;
 	bool dynamic;
 	bool dynamic;
 	
 	
+	enum bfg_tristate use_goffset;
 	cl_uint vwidth;
 	cl_uint vwidth;
 	size_t work_size;
 	size_t work_size;
 	cl_ulong max_alloc;
 	cl_ulong max_alloc;

+ 66 - 38
ocl.c

@@ -595,20 +595,6 @@ _clState *opencl_create_clState(unsigned int gpu, char *name, size_t nameSize)
 		clState->hasBitAlign = true;
 		clState->hasBitAlign = true;
 	free(extensions);
 	free(extensions);
 
 
-	/* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */
-	char * devoclver = malloc(1024);
-	const char * ocl10 = "OpenCL 1.0";
-
-	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL);
-	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status);
-		return NULL;
-	}
-	find = strstr(devoclver, ocl10);
-	if (!find)
-		clState->hasOpenCL11plus = true;
-	free(devoclver);
-
 	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL);
 	status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&clState->preferred_vwidth, NULL);
 	if (status != CL_SUCCESS) {
 	if (status != CL_SUCCESS) {
 		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
 		applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status);
@@ -710,6 +696,52 @@ _clState *opencl_create_clState(unsigned int gpu, char *name, size_t nameSize)
 	return clState;
 	return clState;
 }
 }
 
 
+static
+bool opencl_test_goffset(_clState * const clState)
+{
+	if (sizeof(size_t) < sizeof(uint32_t))
+		return false;
+	
+	const char *source = "__kernel __attribute__((reqd_work_group_size(64, 1, 1))) void runtest(volatile __global uint *out) { *out = get_global_id(0); }";
+	const size_t source_len = strlen(source);
+	cl_int status;
+	cl_program program = clCreateProgramWithSource(clState->context, 1, &source, &source_len, &status);
+	if (status != CL_SUCCESS)
+		applogr(false, LOG_ERR, "Error %d: Loading %s code into cl_program (clCreateProgramWithSource)", status, "goffset test");
+	status = bfg_clBuildProgram(&program, clState->devid, "");
+	if (status != CL_SUCCESS)
+	{
+fail:
+		clReleaseProgram(program);
+		return false;
+	}
+	cl_kernel kernel = clCreateKernel(program, "runtest", &status);
+	if (status != CL_SUCCESS)
+		return_via_applog(fail, , LOG_ERR, "Error %d: Creating kernel from %s program (clCreateKernel)", status, "goffset test");
+	static const uint32_t cleardata = 0;
+	status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, sizeof(cleardata), &cleardata, 0, NULL, NULL);
+	if (status != CL_SUCCESS)
+	{
+		applog(LOG_ERR, "Error %d: Clearing output buffer for %s kernel (clEnqueueWriteBuffer)", status, "goffset test");
+fail2:
+		clReleaseKernel(kernel);
+		goto fail;
+	}
+	status = clSetKernelArg(kernel, 0, sizeof(clState->outputBuffer), &clState->outputBuffer);
+	if (status != CL_SUCCESS)
+		return_via_applog(fail2, , LOG_ERR, "Error %d: Setting kernel argument for %s kernel (clSetKernelArg)", status, "goffset test");
+	const size_t size_t_one = 1, test_goffset = 0xfabd0bf9;
+	status = clEnqueueNDRangeKernel(clState->commandQueue, kernel, 1, &test_goffset, &size_t_one, &size_t_one, 0,  NULL, NULL);
+	if (status != CL_SUCCESS)
+		return_via_applog(fail2, , LOG_DEBUG, "Error %d: Running %s kernel (clEnqueueNDRangeKernel)", status, "goffset test");
+	uint32_t resultdata;
+	status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, sizeof(resultdata), &resultdata, 0, NULL, NULL);
+	if (status != CL_SUCCESS)
+		return_via_applog(fail2, , LOG_DEBUG, "Error %d: Reading result from %s kernel (clEnqueueReadBuffer)", status, "goffset test");
+	applog(LOG_DEBUG, "%s kernel returned 0x%08lx for goffset 0x%08lx", "goffset test", (unsigned long)resultdata, (unsigned long)test_goffset);
+	return (resultdata == test_goffset);
+}
+
 bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState, const char * const name, struct opencl_kernel_info * const kernelinfo, const char * const kernel_file, __maybe_unused const struct mining_algorithm * const malgo)
 bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState, const char * const name, struct opencl_kernel_info * const kernelinfo, const char * const kernel_file, __maybe_unused const struct mining_algorithm * const malgo)
 {
 {
 	const int gpu = cgpu->device_id;
 	const int gpu = cgpu->device_id;
@@ -774,31 +806,30 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 
 
 	{
 	{
 		int kernel_goffset_support = 0;  // 0 = none; 1 = optional; 2 = required
 		int kernel_goffset_support = 0;  // 0 = none; 1 = optional; 2 = required
-		switch (kernelinfo->interface)
+		if (strstr(source, "def GOFFSET"))
+			kernel_goffset_support = 1;
+		else
+		if (strstr(source, " base,"))
+			kernel_goffset_support = 2;
+		else
+			kernel_goffset_support = 0;
+		bool device_goffset_support = false;
+		switch (data->use_goffset)
 		{
 		{
-#ifdef USE_SHA256D
-			case KL_DIABLO:
-			case KL_DIAKGCN:
-			case KL_POCLBM:
-				kernel_goffset_support = 1;
+			case BTS_TRUE:
+				device_goffset_support = true;
 				break;
 				break;
-			case KL_PHATK:
-				kernel_goffset_support = 0;
-				break;
-#endif
-#ifdef USE_OPENCL_FULLHEADER
-			case KL_FULLHEADER:
-				kernel_goffset_support = 1;
-				break;
-#endif
-			case KL_NONE: case OPENCL_KERNEL_INTERFACE_COUNT:
-#ifdef USE_SCRYPT
-			case KL_SCRYPT:
-#endif
-				kernel_goffset_support = 2;
+			case BTS_FALSE:
+				// if the kernel doesn't require goffset, allow the user to disable it
+				if (kernel_goffset_support != 2)
+					break;
+				// fallthru
+			case BTS_UNKNOWN:
+				if (opencl_test_goffset(clState))
+					device_goffset_support = true;
 				break;
 				break;
 		}
 		}
-		const bool device_goffset_support = (clState->hasOpenCL11plus && !clState->is_mesa);
+		applog(LOG_DEBUG, "%s: goffset support: device=%s kernel=%s", cgpu->dev_repr, device_goffset_support ? "yes" : "no", (kernel_goffset_support == 2) ? "required" : ((kernel_goffset_support == 1) ? "optional" : "none"));
 		if (device_goffset_support)
 		if (device_goffset_support)
 		{
 		{
 			if (kernel_goffset_support)
 			if (kernel_goffset_support)
@@ -1021,9 +1052,6 @@ build:
 	if (kernelinfo->goffset)
 	if (kernelinfo->goffset)
 		strcat(CompilerOptions, " -D GOFFSET");
 		strcat(CompilerOptions, " -D GOFFSET");
 
 
-	if (!clState->hasOpenCL11plus)
-		strcat(CompilerOptions, " -D OCL1");
-
 	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
 	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
 	status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, CompilerOptions);
 	status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, CompilerOptions);
 	free(CompilerOptions);
 	free(CompilerOptions);

+ 0 - 1
ocl.h

@@ -38,7 +38,6 @@ struct _clState {
 	void * cldata;
 	void * cldata;
 #endif
 #endif
 	bool hasBitAlign;
 	bool hasBitAlign;
-	bool hasOpenCL11plus;
 	cl_uint preferred_vwidth;
 	cl_uint preferred_vwidth;
 	cl_uint vwidth;
 	cl_uint vwidth;
 	size_t max_work_size;
 	size_t max_work_size;

+ 13 - 5
opencl/psw.cl

@@ -697,13 +697,13 @@ void salsa(uint4 B[8])
 #define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
 #define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
 #define CO Coord(z,x,y)
 #define CO Coord(z,x,y)
 
 
-void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
+void scrypt_core(const uint gid, uint4 X[8], __global uint4 * restrict lookup)
 {
 {
 	shittify(X);
 	shittify(X);
 	const uint zSIZE = 8;
 	const uint zSIZE = 8;
 	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
 	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
 	const uint xSIZE = CONCURRENT_THREADS;
 	const uint xSIZE = CONCURRENT_THREADS;
-	uint x = get_global_id(0)%xSIZE;
+	const uint x = gid % xSIZE;
 
 
 	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
 	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
 	{
 	{
@@ -754,11 +754,19 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
 #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
 #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
 
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search(__global const uint4 * restrict input,
+__kernel void search(
+#ifndef GOFFSET
+	const uint base,
+#endif
+	__global const uint4 * restrict input,
 volatile __global uint*restrict output, __global uint4*restrict padcache,
 volatile __global uint*restrict output, __global uint4*restrict padcache,
 const uint4 midstate0, const uint4 midstate16, const uint target)
 const uint4 midstate0, const uint4 midstate16, const uint target)
 {
 {
-	uint gid = get_global_id(0);
+	const uint gid = get_global_id(0)
+#ifndef GOFFSET
+		+ base
+#endif
+	;
 	uint4 X[8];
 	uint4 X[8];
 	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
 	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
 	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
 	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
@@ -783,7 +791,7 @@ const uint4 midstate0, const uint4 midstate16, const uint target)
 		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
 		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
 		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
 		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
 	}
 	}
-	scrypt_core(X,padcache);
+	scrypt_core(gid, X, padcache);
 	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
 	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
 	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
 	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
 	SHA256_fixed(&tmp0,&tmp1);
 	SHA256_fixed(&tmp0,&tmp1);

+ 13 - 5
opencl/scrypt.cl

@@ -760,13 +760,13 @@ void salsa(uint4 B[8])
 #define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
 #define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
 #define CO Coord(z,x,y)
 #define CO Coord(z,x,y)
 
 
-void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
+void scrypt_core(const uint gid, uint4 X[8], __global uint4 * restrict lookup)
 {
 {
 	shittify(X);
 	shittify(X);
 	const uint zSIZE = 8;
 	const uint zSIZE = 8;
 	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
 	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
 	const uint xSIZE = CONCURRENT_THREADS;
 	const uint xSIZE = CONCURRENT_THREADS;
-	uint x = get_global_id(0)%xSIZE;
+	const uint x = gid % xSIZE;
 
 
 	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
 	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
 	{
 	{
@@ -817,11 +817,19 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
 #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
 #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
 
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search(__global const uint4 * restrict input,
+__kernel void search(
+#ifndef GOFFSET
+	const uint base,
+#endif
+	__global const uint4 * restrict input,
 volatile __global uint*restrict output, __global uint4*restrict padcache,
 volatile __global uint*restrict output, __global uint4*restrict padcache,
 const uint4 midstate0, const uint4 midstate16, const uint target)
 const uint4 midstate0, const uint4 midstate16, const uint target)
 {
 {
-	uint gid = get_global_id(0);
+	const uint gid = get_global_id(0)
+#ifndef GOFFSET
+		+ base
+#endif
+	;
 	uint4 X[8];
 	uint4 X[8];
 	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
 	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
 	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
 	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
@@ -846,7 +854,7 @@ const uint4 midstate0, const uint4 midstate16, const uint target)
 		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
 		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
 		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
 		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
 	}
 	}
-	scrypt_core(X,padcache);
+	scrypt_core(gid, X, padcache);
 	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
 	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
 	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
 	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
 	SHA256_fixed(&tmp0,&tmp1);
 	SHA256_fixed(&tmp0,&tmp1);

+ 13 - 5
opencl/zuikkis.cl

@@ -748,13 +748,13 @@ void salsa(uint4 B[8])
 #define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
 #define Coord(x,y,z) x+y*(x ## SIZE)+z*(y ## SIZE)*(x ## SIZE)
 #define CO Coord(z,x,y)
 #define CO Coord(z,x,y)
 
 
-void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
+void scrypt_core(const uint gid, uint4 X[8], __global uint4 * restrict lookup)
 {
 {
 	shittify(X);
 	shittify(X);
 	const uint zSIZE = 8;
 	const uint zSIZE = 8;
 	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
 	const uint ySIZE = (1024/LOOKUP_GAP+(1024%LOOKUP_GAP>0));
 	const uint xSIZE = CONCURRENT_THREADS;
 	const uint xSIZE = CONCURRENT_THREADS;
-	uint x = get_global_id(0)%xSIZE;
+	const uint x = gid % xSIZE;
 
 
 	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
 	for(uint y=0; y<1024/LOOKUP_GAP; ++y)
 	{
 	{
@@ -792,11 +792,19 @@ void scrypt_core(uint4 X[8], __global uint4*restrict lookup)
 #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
 #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
 
 
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
 __attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
-__kernel void search(__global const uint4 * restrict input,
+__kernel void search(
+#ifndef GOFFSET
+	const uint base,
+#endif
+	__global const uint4 * restrict input,
 volatile __global uint*restrict output, __global uint4*restrict padcache,
 volatile __global uint*restrict output, __global uint4*restrict padcache,
 const uint4 midstate0, const uint4 midstate16, const uint target)
 const uint4 midstate0, const uint4 midstate16, const uint target)
 {
 {
-	uint gid = get_global_id(0);
+	const uint gid = get_global_id(0)
+#ifndef GOFFSET
+		+ base
+#endif
+	;
 	uint4 X[8];
 	uint4 X[8];
 	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
 	uint4 tstate0, tstate1, ostate0, ostate1, tmp0, tmp1;
 	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
 	uint4 data = (uint4)(input[4].x,input[4].y,input[4].z,gid);
@@ -820,7 +828,7 @@ const uint4 midstate0, const uint4 midstate16, const uint target)
 		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
 		SHA256(&pad0,&pad1, data, (uint4)(i+1,K[84],0,0), (uint4)(0,0,0,0), (uint4)(0,0,0, K[87]));
 		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
 		SHA256(X+i*2,X+i*2+1, pad0, pad1, (uint4)(K[84], 0U, 0U, 0U), (uint4)(0U, 0U, 0U, K[88]));
 	}
 	}
-	scrypt_core(X,padcache);
+	scrypt_core(gid, X, padcache);
 	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
 	SHA256(&tmp0,&tmp1, X[0], X[1], X[2], X[3]);
 	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
 	SHA256(&tmp0,&tmp1, X[4], X[5], X[6], X[7]);
 	SHA256_fixed(&tmp0,&tmp1);
 	SHA256_fixed(&tmp0,&tmp1);

+ 1 - 0
util.c

@@ -2244,6 +2244,7 @@ bool bfg_strtobool(const char * const s, char ** const endptr, __maybe_unused co
 		
 		
 		{true , "enable"},
 		{true , "enable"},
 		{true , "always"},
 		{true , "always"},
+		{true , "force"},
 		{true , "true"},
 		{true , "true"},
 		{true , "yes"},
 		{true , "yes"},
 		{true , "on"},
 		{true , "on"},