Browse Source

opencl: Support for per-work mining algorithms

Luke Dashjr 11 years ago
parent
commit
c7b7a4a9d5
5 changed files with 259 additions and 187 deletions
  1. 133 52
      driver-opencl.c
  2. 4 3
      driver-opencl.h
  3. 2 2
      findnonce.c
  4. 97 121
      ocl.c
  5. 23 9
      ocl.h

+ 133 - 52
driver-opencl.c

@@ -260,8 +260,6 @@ load_opencl_symbols() {
 #endif
 
 
-typedef cl_int (*queue_kernel_parameters_func_t)(_clState *, struct work *, cl_uint);
-
 struct opencl_kernel_interface {
 	const char *kiname;
 	queue_kernel_parameters_func_t queue_kernel_parameters_func;
@@ -309,6 +307,9 @@ void opencl_early_init()
 		*data = (struct opencl_device_data){
 			.dynamic = true,
 			.intensity = intensity_not_set,
+#ifdef USE_SCRYPT
+			.lookup_gap = 2,
+#endif
 		};
 		gpus[i] = (struct cgpu_info){
 			.device_data = data,
@@ -376,8 +377,8 @@ _SET_INT_LIST(worksize, (v >= 1 && v <= 9999)       , work_size)
 
 #ifdef USE_SCRYPT
 _SET_INT_LIST(shaders           , true, shaders)
-_SET_INT_LIST(lookup_gap        , true, opt_lg )
-_SET_INT_LIST(thread_concurrency, true, opt_tc )
+_SET_INT_LIST(lookup_gap        , true, lookup_gap)
+_SET_INT_LIST(thread_concurrency, true, thread_concurrency)
 #endif
 
 enum cl_kernels select_kernel(const char * const arg)
@@ -406,20 +407,26 @@ const char *opencl_get_kernel_interface_name(const enum cl_kernels kern)
 static
 bool _set_kernel(struct cgpu_info * const cgpu, const char *_val)
 {
-	FILE *F;
 	struct opencl_device_data * const data = cgpu->device_data;
 	
 	size_t knamelen = strlen(_val);
 	char filename[knamelen + 3 + 1];
 	sprintf(filename, "%s.cl", _val);
 	
-	F = opencl_open_kernel(filename);
-	if (!F)
+	int dummy_srclen;
+	enum cl_kernels interface;
+	char *src = opencl_kernel_source(filename, &dummy_srclen, &interface);
+	if (!src)
 		return false;
-	fclose(F);
+	free(src);
 	
-	free(data->kernel_file);
-	data->kernel_file = strdup(_val);
+	char **kfp = &data->kernel_file_sha256d;
+#ifdef USE_SCRYPT
+	if (interface == KL_SCRYPT)
+		kfp = &data->kernel_file_scrypt;
+#endif
+	free(*kfp);
+	*kfp = strdup(_val);
 	
 	return true;
 }
@@ -1005,10 +1012,10 @@ struct opencl_work_data *_opencl_work_data(struct work * const work)
 }
 
 static
-cl_int queue_poclbm_kernel(_clState * const clState, struct work * const work, const cl_uint threads)
+cl_int queue_poclbm_kernel(const struct opencl_kernel_info * const kinfo, _clState * const clState, struct work * const work, const cl_uint threads)
 {
 	struct opencl_work_data * const blk = _opencl_work_data(work);
-	cl_kernel *kernel = &clState->kernel;
+	const cl_kernel * const kernel = &kinfo->kernel;
 	unsigned int num = 0;
 	cl_int status = 0;
 
@@ -1029,7 +1036,8 @@ cl_int queue_poclbm_kernel(_clState * const clState, struct work * const work, c
 	CL_SET_BLKARG(cty_g);
 	CL_SET_BLKARG(cty_h);
 
-	if (!clState->goffset) {
+	if (!kinfo->goffset)
+	{
 		cl_uint vwidth = clState->vwidth;
 		uint *nonces = alloca(sizeof(uint) * vwidth);
 		unsigned int i;
@@ -1060,10 +1068,10 @@ cl_int queue_poclbm_kernel(_clState * const clState, struct work * const work, c
 }
 
 static
-cl_int queue_phatk_kernel(_clState * const clState, struct work * const work, __maybe_unused const cl_uint threads)
+cl_int queue_phatk_kernel(const struct opencl_kernel_info * const kinfo, _clState * const clState, struct work * const work, __maybe_unused const cl_uint threads)
 {
 	struct opencl_work_data * const blk = _opencl_work_data(work);
-	cl_kernel *kernel = &clState->kernel;
+	const cl_kernel * const kernel = &kinfo->kernel;
 	cl_uint vwidth = clState->vwidth;
 	unsigned int i, num = 0;
 	cl_int status = 0;
@@ -1105,14 +1113,14 @@ cl_int queue_phatk_kernel(_clState * const clState, struct work * const work, __
 }
 
 static
-cl_int queue_diakgcn_kernel(_clState * const clState, struct work * const work, __maybe_unused const cl_uint threads)
+cl_int queue_diakgcn_kernel(const struct opencl_kernel_info * const kinfo, _clState * const clState, struct work * const work, __maybe_unused const cl_uint threads)
 {
 	struct opencl_work_data * const blk = _opencl_work_data(work);
-	cl_kernel *kernel = &clState->kernel;
+	const cl_kernel * const kernel = &kinfo->kernel;
 	unsigned int num = 0;
 	cl_int status = 0;
 
-	if (!clState->goffset) {
+	if (!kinfo->goffset) {
 		cl_uint vwidth = clState->vwidth;
 		uint *nonces = alloca(sizeof(uint) * vwidth);
 		unsigned int i;
@@ -1167,14 +1175,14 @@ cl_int queue_diakgcn_kernel(_clState * const clState, struct work * const work,
 }
 
 static
-cl_int queue_diablo_kernel(_clState * const clState, struct work * const work, const cl_uint threads)
+cl_int queue_diablo_kernel(const struct opencl_kernel_info * const kinfo, _clState * const clState, struct work * const work, const cl_uint threads)
 {
 	struct opencl_work_data * const blk = _opencl_work_data(work);
-	cl_kernel *kernel = &clState->kernel;
+	const cl_kernel * const kernel = &kinfo->kernel;
 	unsigned int num = 0;
 	cl_int status = 0;
 
-	if (!clState->goffset) {
+	if (!kinfo->goffset) {
 		cl_uint vwidth = clState->vwidth;
 		uint *nonces = alloca(sizeof(uint) * vwidth);
 		unsigned int i;
@@ -1223,10 +1231,10 @@ cl_int queue_diablo_kernel(_clState * const clState, struct work * const work, c
 
 #ifdef USE_SCRYPT
 static
-cl_int queue_scrypt_kernel(_clState * const clState, struct work * const work, __maybe_unused const cl_uint threads)
+cl_int queue_scrypt_kernel(const struct opencl_kernel_info * const kinfo, _clState * const clState, struct work * const work, __maybe_unused const cl_uint threads)
 {
 	unsigned char *midstate = work->midstate;
-	cl_kernel *kernel = &clState->kernel;
+	const cl_kernel * const kernel = &kinfo->kernel;
 	unsigned int num = 0;
 	cl_uint le_target;
 	cl_int status = 0;
@@ -1555,7 +1563,6 @@ get_opencl_api_extra_device_status(struct cgpu_info *gpu)
 }
 
 struct opencl_thread_data {
-	cl_int (*queue_kernel_parameters)(_clState *, struct work *, cl_uint);
 	uint32_t *res;
 };
 
@@ -1570,7 +1577,7 @@ static bool opencl_thread_prepare(struct thr_info *thr)
 	int virtual_gpu = data->virtual_gpu;
 	int i = thr->id;
 	static bool failmessage = false;
-	int buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
+	int buffersize = SCRYPT_BUFFERSIZE;
 
 	if (!blank_res)
 		blank_res = calloc(buffersize, 1);
@@ -1659,9 +1666,8 @@ static bool opencl_thread_init(struct thr_info *thr)
 
 static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work *work)
 {
-#ifdef USE_SCRYPT
-	if (!opt_scrypt)
-#endif
+	const struct mining_algorithm * const malgo = work_mining_algorithm(work);
+	if (malgo->algo == POW_SHA256D)
 	{
 		struct opencl_work_data * const blk = _opencl_work_data(work);
 		precalc_hash(blk, (uint32_t *)(work->midstate), (uint32_t *)(work->data + 64));
@@ -1671,49 +1677,116 @@ static bool opencl_prepare_work(struct thr_info __maybe_unused *thr, struct work
 
 extern int opt_dynamic_interval;
 
-static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
-				int64_t __maybe_unused max_nonce)
+/* Return the kernel slot of clState that serves malgo on this device.
+ * On first use this picks a default kernel file for the algorithm (unless one
+ * was already configured in the device data), loads/builds it through
+ * opencl_load_kernel(), and records the matching queue_*_kernel function.
+ * Returns NULL after logging an error if the kernel cannot be loaded or the
+ * algorithm has no OpenCL kernel in this build. */
+const struct opencl_kernel_info *opencl_scanhash_get_kernel(struct cgpu_info * const cgpu, _clState * const clState, const struct mining_algorithm * const malgo)
 {
-	const int thr_id = thr->id;
-	struct opencl_thread_data *thrdata = thr->cgpu_data;
-	struct cgpu_info *gpu = thr->cgpu;
-	struct opencl_device_data * const data = gpu->device_data;
-	_clState *clState = clStates[thr_id];
-	if (!clState->kernel_loaded)
+	struct opencl_device_data * const data = cgpu->device_data;
+	struct opencl_kernel_info *kernelinfo;
+	char *kernel_file;
+	switch (malgo->algo)
+	{
+		case POW_SHA256D:
+			kernelinfo = &clState->kernel_sha256d;
+			if (!data->kernel_file_sha256d)
+			{
+				const char * const vbuff = clState->platform_ver_str;
+				if (clState->is_mesa)
+				{
+					applog(LOG_INFO, "Selecting phatk kernel for Mesa");
+					data->kernel_file_sha256d = strdup("phatk");
+				}
+				else  /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
+				if (!strstr(cgpu->name, "Tahiti") &&
+				   (strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
+				    strstr(vbuff, "851.4") ||  // Windows 64 bit ""
+				    strstr(vbuff, "831.4") ||
+				    strstr(vbuff, "898.1") ||  // 12.2 driver SDK 
+				    strstr(vbuff, "923.1") ||  // 12.4
+				    strstr(vbuff, "938.2") ||  // SDK 2.7
+				    strstr(vbuff, "1113.2")))  // SDK 2.8
+				{
+					applog(LOG_INFO, "Selecting diablo kernel");
+					data->kernel_file_sha256d = strdup("diablo");
+				}
+				else  /* Detect all 7970s, older ATI and NVIDIA and use poclbm */
+				if (strstr(cgpu->name, "Tahiti") || !clState->hasBitAlign)
+				{
+					applog(LOG_INFO, "Selecting poclbm kernel");
+					data->kernel_file_sha256d = strdup("poclbm");
+				}
+				else  /* Use phatk for the rest R5xxx R6xxx */
+				{
+					applog(LOG_INFO, "Selecting phatk kernel");
+					data->kernel_file_sha256d = strdup("phatk");
+				}
+			}
+			kernel_file = data->kernel_file_sha256d;
+			break;
+#ifdef USE_SCRYPT
+		case POW_SCRYPT:
+			kernelinfo = &clState->kernel_scrypt;
+			BFGINIT(data->kernel_file_scrypt, strdup("scrypt"));
+			kernel_file = data->kernel_file_scrypt;
+			break;
+#endif
+		default:
+			/* Without this, kernelinfo/kernel_file would be read uninitialised below */
+			applogr(NULL, LOG_ERR, "%s: No OpenCL kernel available for this mining algorithm", cgpu->dev_repr);
+	}
+	if (!kernelinfo->loaded)
 	{
-		if (!opencl_load_kernel(gpu, clState, gpu->name))
-			applogr(-1, LOG_ERR, "%s: Failed to load kernel", gpu->dev_repr);
+		if (!opencl_load_kernel(cgpu, clState, cgpu->name, kernelinfo, kernel_file, malgo))
+			applogr(NULL, LOG_ERR, "%s: Failed to load kernel", cgpu->dev_repr);
 		
-		switch (clState->chosen_kernel) {
+		switch (kernelinfo->interface)
+		{
 			case KL_POCLBM:
-				thrdata->queue_kernel_parameters = &queue_poclbm_kernel;
+				kernelinfo->queue_kernel_parameters = &queue_poclbm_kernel;
 				break;
 			case KL_PHATK:
-				thrdata->queue_kernel_parameters = &queue_phatk_kernel;
+				kernelinfo->queue_kernel_parameters = &queue_phatk_kernel;
 				break;
 			case KL_DIAKGCN:
-				thrdata->queue_kernel_parameters = &queue_diakgcn_kernel;
+				kernelinfo->queue_kernel_parameters = &queue_diakgcn_kernel;
 				break;
 #ifdef USE_SCRYPT
 			case KL_SCRYPT:
-				thrdata->queue_kernel_parameters = &queue_scrypt_kernel;
+				kernelinfo->queue_kernel_parameters = &queue_scrypt_kernel;
 				break;
 #endif
 			default:
 			case KL_DIABLO:
-				thrdata->queue_kernel_parameters = &queue_diablo_kernel;
+				kernelinfo->queue_kernel_parameters = &queue_diablo_kernel;
 				break;
 		}
 	}
-	const cl_kernel *kernel = &clState->kernel;
+	return kernelinfo;
+}
+
+static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
+				int64_t __maybe_unused max_nonce)
+{
+	const int thr_id = thr->id;
+	struct opencl_thread_data *thrdata = thr->cgpu_data;
+	struct cgpu_info *gpu = thr->cgpu;
+	struct opencl_device_data * const data = gpu->device_data;
+	_clState *clState = clStates[thr_id];
+	const struct mining_algorithm * const malgo = work_mining_algorithm(work);
+	const struct opencl_kernel_info *kinfo = opencl_scanhash_get_kernel(gpu, clState, malgo);
+	if (!kinfo)
+		return -1;
+	const cl_kernel * const kernel = &kinfo->kernel;
 	const int dynamic_us = opt_dynamic_interval * 1000;
 
 	cl_int status;
 	size_t globalThreads[1];
-	size_t localThreads[1] = { clState->wsize };
+	size_t localThreads[1] = { kinfo->wsize };
 	int64_t hashes;
-	int found = opt_scrypt ? SCRYPT_FOUND : FOUND;
-	int buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
+	int found = FOUND;
+	int buffersize = BUFFERSIZE;
+#ifdef USE_SCRYPT
+	if (malgo->algo == POW_SCRYPT)
+	{
+		found = SCRYPT_FOUND;
+		buffersize = SCRYPT_BUFFERSIZE;
+	}
+#endif
 
 	/* Windows' timer resolution is only 15ms so oversample 5x */
 	if (data->dynamic && (++data->intervals * dynamic_us) > 70000) {
@@ -1748,13 +1821,14 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 	if (hashes > gpu->max_hashes)
 		gpu->max_hashes = hashes;
 
-	status = thrdata->queue_kernel_parameters(clState, work, globalThreads[0]);
+	status = kinfo->queue_kernel_parameters(kinfo, clState, work, globalThreads[0]);
 	if (unlikely(status != CL_SUCCESS)) {
 		applog(LOG_ERR, "Error: clSetKernelArg of all params failed.");
 		return -1;
 	}
 
-	if (clState->goffset) {
+	if (kinfo->goffset)
+	{
 		size_t global_work_offset[1];
 
 		global_work_offset[0] = work->blk.nonce;
@@ -1802,13 +1876,20 @@ static int64_t opencl_scanhash(struct thr_info *thr, struct work *work,
 	return hashes;
 }
 
+/* Release the OpenCL kernel and program held by kinfo, if any, and mark the
+ * slot as unloaded. Thread shutdown calls this for both the sha256d and the
+ * scrypt slot, but kernels are only loaded on first use, so a slot may never
+ * have been populated; NULL handles are skipped instead of being handed to
+ * the OpenCL runtime.
+ * NOTE(review): assumes never-loaded slots hold NULL handles (zeroed
+ * _clState) - confirm against opencl_create_clState. */
+static
+void opencl_clean_kernel_info(struct opencl_kernel_info * const kinfo)
+{
+	if (kinfo->kernel)
+	{
+		clReleaseKernel(kinfo->kernel);
+		kinfo->kernel = NULL;
+	}
+	if (kinfo->program)
+	{
+		clReleaseProgram(kinfo->program);
+		kinfo->program = NULL;
+	}
+	kinfo->loaded = false;
+}
+
 static void opencl_thread_shutdown(struct thr_info *thr)
 {
 	const int thr_id = thr->id;
 	_clState *clState = clStates[thr_id];
 
-	clReleaseKernel(clState->kernel);
-	clReleaseProgram(clState->program);
+	opencl_clean_kernel_info(&clState->kernel_sha256d);
+	opencl_clean_kernel_info(&clState->kernel_scrypt);
 	clReleaseCommandQueue(clState->commandQueue);
 	clReleaseContext(clState->context);
 }

+ 4 - 3
driver-opencl.h

@@ -46,13 +46,14 @@ struct opencl_device_data {
 	
 	cl_uint vwidth;
 	size_t work_size;
-	char *kernel_file;
+	char *kernel_file_sha256d;
 	cl_ulong max_alloc;
 	
 	enum opencl_binary_usage opt_opencl_binaries;
 #ifdef USE_SCRYPT
-	int opt_lg, lookup_gap;
-	size_t opt_tc, thread_concurrency;
+	char *kernel_file_scrypt;
+	int lookup_gap;
+	size_t thread_concurrency;
 	size_t shaders;
 #endif
 	struct timeval tv_gpustart;

+ 2 - 2
findnonce.c

@@ -147,7 +147,7 @@ static void *postcalc_hash(void *userdata)
 	struct pc_data *pcd = (struct pc_data *)userdata;
 	struct thr_info *thr = pcd->thr;
 	unsigned int entry = 0;
-	int found = opt_scrypt ? SCRYPT_FOUND : FOUND;
+	int found = (work_mining_algorithm(&pcd->work)->algo == POW_SCRYPT) ? SCRYPT_FOUND : FOUND;
 
 	pthread_detach(pthread_self());
 	RenameThread("postcalchsh");
@@ -188,7 +188,7 @@ void postcalc_hash_async(struct thr_info *thr, struct work *work, uint32_t *res)
 		.thr = thr,
 	};
 	__copy_work(&pcd->work, work);
-	buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
+	buffersize = (work_mining_algorithm(work)->algo == POW_SCRYPT) ? SCRYPT_BUFFERSIZE : BUFFERSIZE;
 	memcpy(&pcd->res, res, buffersize);
 
 	if (pthread_create(&pcd->pth, NULL, postcalc_hash, (void *)pcd)) {

+ 97 - 121
ocl.c

@@ -258,6 +258,30 @@ char *file_contents(const char *filename, int *length)
 	return (char*)buffer;
 }
 
+/* Read kernel source `filename` via file_contents() and identify its kernel
+ * interface.
+ * out_sourcelen is passed straight through to file_contents().
+ * If the source contains a "kernel-interface:" tag, the word following it on
+ * the same line is handed to select_kernel() and the result stored in
+ * *out_kinterface; without a tag *out_kinterface is set to KL_NONE.
+ * Returns the source buffer (callers free() it), or NULL if the file could
+ * not be read, in which case *out_kinterface is left untouched. */
+char *opencl_kernel_source(const char * const filename, int * const out_sourcelen, enum cl_kernels * const out_kinterface)
+{
+	char *source = file_contents(filename, out_sourcelen);
+	if (!source)
+		return NULL;
+	char *s = strstr(source, "kernel-interface:"), *q;
+	if (s)
+	{
+		/* Skip blanks after the 17-character tag, but never past the end of its line.
+		 * <ctype.h> functions need a value representable as unsigned char. */
+		for (s = &s[17]; s[0] && isspace((unsigned char)s[0]); ++s)
+			if (s[0] == '\n' || s[0] == '\r')
+				break;
+		for (q = s; q[0] && !isspace((unsigned char)q[0]); ++q)
+		{}  // Find end of string
+		const size_t kinamelen = q - s;
+		char kiname[kinamelen + 1];
+		memcpy(kiname, s, kinamelen);
+		kiname[kinamelen] = '\0';
+		*out_kinterface = select_kernel(kiname);
+	}
+	else
+		*out_kinterface = KL_NONE;
+	return source;
+}
+
 extern int opt_g_threads;
 
 int clDevicesNum(void) {
@@ -339,20 +363,20 @@ int clDevicesNum(void) {
 	return most_devices;
 }
 
-cl_int bfg_clBuildProgram(_clState * const clState, const cl_device_id devid, const char * const CompilerOptions)
+cl_int bfg_clBuildProgram(cl_program * const program, const cl_device_id devid, const char * const CompilerOptions)
 {
 	cl_int status;
 	
-	status = clBuildProgram(clState->program, 1, &devid, CompilerOptions, NULL, NULL);
+	status = clBuildProgram(*program, 1, &devid, CompilerOptions, NULL, NULL);
 	
 	if (status != CL_SUCCESS)
 	{
 		applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status);
 		size_t logSize;
-		status = clGetProgramBuildInfo(clState->program, devid, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
+		status = clGetProgramBuildInfo(*program, devid, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
 		
 		char *log = malloc(logSize ?: 1);
-		status = clGetProgramBuildInfo(clState->program, devid, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
+		status = clGetProgramBuildInfo(*program, devid, CL_PROGRAM_BUILD_LOG, logSize, log, NULL);
 		if (logSize > 0 && log[0])
 			applog(LOG_ERR, "%s", log);
 		free(log);
@@ -640,6 +664,22 @@ _clState *opencl_create_clState(unsigned int gpu, char *name, size_t nameSize)
 	clState->devid = devices[gpu];
 	free(devices);
 	
+	/* For some reason 2 vectors is still better even if the card says
+	 * otherwise, and many cards lie about their max so use 256 as max
+	 * unless explicitly set on the command line. Tahiti prefers 1 */
+	if (strstr(name, "Tahiti"))
+		clState->preferred_vwidth = 1;
+	else
+	if (clState->preferred_vwidth > 2)
+		clState->preferred_vwidth = 2;
+
+	if (data->vwidth)
+		clState->vwidth = data->vwidth;
+	else {
+		clState->vwidth = clState->preferred_vwidth;
+		data->vwidth = clState->preferred_vwidth;
+	}
+
 	clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, SCRYPT_BUFFERSIZE, NULL, &status);
 	if (status != CL_SUCCESS) {
 		applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status);
@@ -649,13 +689,13 @@ _clState *opencl_create_clState(unsigned int gpu, char *name, size_t nameSize)
 	return clState;
 }
 
-bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState, const char * const name)
+bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState, const char * const name, struct opencl_kernel_info * const kernelinfo, const char * const kernel_file, __maybe_unused const struct mining_algorithm * const malgo)
 {
 	const int gpu = cgpu->device_id;
 	bool patchbfi = false, prog_built = false;
 	struct opencl_device_data * const data = cgpu->device_data;
 	const char * const vbuff = clState->platform_ver_str;
-	char *s, *q;
+	char *s;
 	cl_int status;
 	
 	/* Create binary filename based on parameters passed to opencl
@@ -669,43 +709,10 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 	char filename[255];
 	char numbuf[32];
 
-	if (!data->kernel_file)
-	{
-		if (opt_scrypt) {
-			applog(LOG_INFO, "Selecting scrypt kernel");
-			clState->chosen_kernel = KL_SCRYPT;
-		}
-		else if (clState->is_mesa)
-		{
-			applog(LOG_INFO, "Selecting phatk kernel for Mesa");
-			clState->chosen_kernel = KL_PHATK;
-		} else if (!strstr(name, "Tahiti") &&
-			/* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */
-			(strstr(vbuff, "844.4") ||  // Linux 64 bit ATI 2.6 SDK
-			 strstr(vbuff, "851.4") ||  // Windows 64 bit ""
-			 strstr(vbuff, "831.4") ||
-			 strstr(vbuff, "898.1") ||  // 12.2 driver SDK 
-			 strstr(vbuff, "923.1") ||  // 12.4
-			 strstr(vbuff, "938.2") ||  // SDK 2.7
-			 strstr(vbuff, "1113.2"))) {// SDK 2.8
-				applog(LOG_INFO, "Selecting diablo kernel");
-				clState->chosen_kernel = KL_DIABLO;
-		/* Detect all 7970s, older ATI and NVIDIA and use poclbm */
-		} else if (strstr(name, "Tahiti") || !clState->hasBitAlign) {
-			applog(LOG_INFO, "Selecting poclbm kernel");
-			clState->chosen_kernel = KL_POCLBM;
-		/* Use phatk for the rest R5xxx R6xxx */
-		} else {
-			applog(LOG_INFO, "Selecting phatk kernel");
-			clState->chosen_kernel = KL_PHATK;
-		}
-		data->kernel_file = strdup(opencl_get_kernel_interface_name(clState->chosen_kernel));
-	}
-	
-	snprintf(filename, sizeof(filename), "%s.cl", data->kernel_file);
-	snprintf(binaryfilename, sizeof(filename), "%s", data->kernel_file);
+	snprintf(filename, sizeof(filename), "%s.cl", kernel_file);
+	snprintf(binaryfilename, sizeof(filename), "%s", kernel_file);
 	int pl;
-	char *source = file_contents(filename, &pl);
+	char *source = opencl_kernel_source(filename, &pl, &kernelinfo->interface);
 	if (!source)
 		return false;
 	{
@@ -715,27 +722,11 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 		bin2hex(hashhex, hash, 3);
 		tailsprintf(binaryfilename, sizeof(binaryfilename), "-%s", hashhex);
 	}
-	s = strstr(source, "kernel-interface:");
-	if (s)
+	switch (kernelinfo->interface)
 	{
-		for (s = &s[17]; s[0] && isspace(s[0]); ++s)
-			if (s[0] == '\n' || s[0] == '\r')
-				break;
-		for (q = s; q[0] && !isspace(q[0]); ++q)
-		{}  // Find end of string
-		const size_t kinamelen = q - s;
-		char kiname[kinamelen + 1];
-		memcpy(kiname, s, kinamelen);
-		kiname[kinamelen] = '\0';
-		clState->chosen_kernel = select_kernel(kiname);
-	}
-	else
-	if (opt_scrypt)
-		clState->chosen_kernel = KL_SCRYPT;
-	switch (clState->chosen_kernel) {
 		case KL_NONE:
 			applog(LOG_ERR, "%s: Failed to identify kernel interface for %s",
-			       cgpu->dev_repr, data->kernel_file);
+			       cgpu->dev_repr, kernel_file);
 			free(source);
 			return false;
 		case KL_PHATK:
@@ -752,48 +743,30 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 			;
 	}
 	applog(LOG_DEBUG, "%s: Using kernel %s with interface %s",
-	       cgpu->dev_repr, data->kernel_file,
-	       opencl_get_kernel_interface_name(clState->chosen_kernel));
-
-	/* For some reason 2 vectors is still better even if the card says
-	 * otherwise, and many cards lie about their max so use 256 as max
-	 * unless explicitly set on the command line. Tahiti prefers 1 */
-	if (strstr(name, "Tahiti"))
-		clState->preferred_vwidth = 1;
-	else
-	if (clState->preferred_vwidth > 2)
-		clState->preferred_vwidth = 2;
+	       cgpu->dev_repr, kernel_file,
+	       opencl_get_kernel_interface_name(kernelinfo->interface));
 
-	if (data->vwidth)
-		clState->vwidth = data->vwidth;
-	else {
-		clState->vwidth = clState->preferred_vwidth;
-		data->vwidth = clState->preferred_vwidth;
-	}
-
-	if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) &&
-		clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt)
-			clState->goffset = true;
+	/* Same kernel set as the condition this replaces: poclbm/diablo/diakgcn use a
+	 * global work offset only when unvectorised on OpenCL 1.1+; scrypt always does */
+	if (((kernelinfo->interface == KL_POCLBM || kernelinfo->interface == KL_DIABLO || kernelinfo->interface == KL_DIAKGCN) && clState->vwidth == 1 && clState->hasOpenCL11plus) || kernelinfo->interface == KL_SCRYPT)
+		kernelinfo->goffset = true;
 
 	if (data->work_size && data->work_size <= clState->max_work_size)
-		clState->wsize = data->work_size;
-	else if (opt_scrypt)
-		clState->wsize = 256;
-	else if (strstr(name, "Tahiti"))
-		clState->wsize = 64;
+		kernelinfo->wsize = data->work_size;
+	else
+#ifdef USE_SCRYPT
+	if (malgo->algo == POW_SCRYPT)
+		kernelinfo->wsize = 256;
 	else
-		clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
-	data->work_size = clState->wsize;
+#endif
+	if (strstr(name, "Tahiti"))
+		kernelinfo->wsize = 64;
+	else
+		kernelinfo->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth;
 
 #ifdef USE_SCRYPT
-	if (opt_scrypt) {
-		if (!data->opt_lg) {
-			applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu);
-			data->lookup_gap = 2;
-		} else
-			data->lookup_gap = data->opt_lg;
-
-		if (!data->opt_tc) {
+	if (kernelinfo->interface == KL_SCRYPT)
+	{
+		if (!data->thread_concurrency)
+		{
 			unsigned int sixtyfours;
 
 			sixtyfours =  data->max_alloc / 131072 / 64 - 1;
@@ -804,8 +777,7 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 					data->thread_concurrency = data->shaders * 5;
 			}
 			applog(LOG_DEBUG, "GPU %u: selecting thread concurrency of %lu", gpu,  (unsigned long)data->thread_concurrency);
-		} else
-			data->thread_concurrency = data->opt_tc;
+		}
 	}
 #endif
 
@@ -829,18 +801,21 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 	}
 
 	strcat(binaryfilename, name);
-	if (clState->goffset)
+	if (kernelinfo->goffset)
 		strcat(binaryfilename, "g");
-	if (opt_scrypt) {
 #ifdef USE_SCRYPT
+	if (kernelinfo->interface == KL_SCRYPT)
+	{
 		sprintf(numbuf, "lg%utc%u", data->lookup_gap, (unsigned int)data->thread_concurrency);
 		strcat(binaryfilename, numbuf);
+	}
+	else
 #endif
-	} else {
+	{
 		sprintf(numbuf, "v%d", clState->vwidth);
 		strcat(binaryfilename, numbuf);
 	}
-	sprintf(numbuf, "w%d", (int)clState->wsize);
+	sprintf(numbuf, "w%d", (int)kernelinfo->wsize);
 	strcat(binaryfilename, numbuf);
 	sprintf(numbuf, "l%d", (int)sizeof(long));
 	strcat(binaryfilename, numbuf);
@@ -882,7 +857,7 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 			goto build;
 		}
 
-		clState->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
+		kernelinfo->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
 		if (status != CL_SUCCESS) {
 			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
 			fclose(binaryfile);
@@ -901,7 +876,7 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 	/////////////////////////////////////////////////////////////////
 
 build:
-	clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
+	kernelinfo->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
 	if (status != CL_SUCCESS) {
 		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
 		return false;
@@ -911,16 +886,16 @@ build:
 	char *CompilerOptions = calloc(1, 256);
 
 #ifdef USE_SCRYPT
-	if (opt_scrypt)
+	if (kernelinfo->interface == KL_SCRYPT)
 		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
-			data->lookup_gap, (unsigned int)data->thread_concurrency, (int)clState->wsize);
+			data->lookup_gap, (unsigned int)data->thread_concurrency, (int)kernelinfo->wsize);
 	else
 #endif
 	{
 		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
-			(int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth);
+			(int)kernelinfo->wsize, clState->vwidth, (int)kernelinfo->wsize * clState->vwidth);
 	}
-	applog(LOG_DEBUG, "Setting worksize to %"PRId64, (int64_t)clState->wsize);
+	applog(LOG_DEBUG, "Setting worksize to %"PRId64, (int64_t)kernelinfo->wsize);
 	if (clState->vwidth > 1)
 		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
 
@@ -965,14 +940,14 @@ build:
 	} else
 		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");
 
-	if (clState->goffset)
+	if (kernelinfo->goffset)
 		strcat(CompilerOptions, " -D GOFFSET");
 
 	if (!clState->hasOpenCL11plus)
 		strcat(CompilerOptions, " -D OCL1");
 
 	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
-	status = bfg_clBuildProgram(clState, clState->devid, CompilerOptions);
+	status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, CompilerOptions);
 	free(CompilerOptions);
 
 	if (status != CL_SUCCESS)
@@ -983,13 +958,13 @@ build:
 	if (!(data->opt_opencl_binaries & OBU_SAVE))
 		goto built;
 
-	status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
+	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
 	if (unlikely(status != CL_SUCCESS)) {
 		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
 		return false;
 	}
 
-	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
+	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
 	if (unlikely(status != CL_SUCCESS)) {
 		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
 		return false;
@@ -1010,7 +985,7 @@ build:
 		return false;
 	}
 	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
-	status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
+	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
 	if (unlikely(status != CL_SUCCESS)) {
 		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
 		return false;
@@ -1050,13 +1025,13 @@ build:
 			w, remaining);
 		patch_opcodes(w, length);
 
-		status = clReleaseProgram(clState->program);
+		status = clReleaseProgram(kernelinfo->program);
 		if (status != CL_SUCCESS) {
 			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
 			return false;
 		}
 
-		clState->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
+		kernelinfo->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
 		if (status != CL_SUCCESS) {
 			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
 			return false;
@@ -1087,27 +1062,28 @@ built:
 	free(binary_sizes);
 
 	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %"PRId64" vectors and worksize %"PRIu64,
-	       filename, clState->hasBitAlign ? "" : "out", (int64_t)clState->vwidth, (uint64_t)clState->wsize);
+	       filename, clState->hasBitAlign ? "" : "out", (int64_t)clState->vwidth, (uint64_t)kernelinfo->wsize);
 
 	if (!prog_built) {
 		/* create a cl program executable for all the devices specified */
-		status = bfg_clBuildProgram(clState, clState->devid, NULL);
+		status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, NULL);
 		if (status != CL_SUCCESS)
 			return false;
 	}
 
 	/* get a kernel object handle for a kernel with the given name */
-	clState->kernel = clCreateKernel(clState->program, "search", &status);
+	kernelinfo->kernel = clCreateKernel(kernelinfo->program, "search", &status);
 	if (status != CL_SUCCESS) {
 		applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status);
 		return false;
 	}
 	
 	free((void*)cgpu->kname);
-	cgpu->kname = strdup(data->kernel_file);
+	cgpu->kname = strdup(kernel_file);
 
 #ifdef USE_SCRYPT
-	if (opt_scrypt) {
+	if (kernelinfo->interface == KL_SCRYPT && !clState->padbufsize)
+	{
 		size_t ipt = (1024 / data->lookup_gap + (1024 % data->lookup_gap > 0));
 		size_t bufsize = 128 * ipt * data->thread_concurrency;
 
@@ -1138,7 +1114,7 @@ built:
 	}
 #endif
 
-	clState->kernel_loaded = true;
+	kernelinfo->loaded = true;
 	return true;
 }
 

+ 23 - 9
ocl.h

@@ -11,16 +11,32 @@
 
 #include "miner.h"
 
-typedef struct {
+struct opencl_kernel_info;
+typedef struct _clState _clState;
+
+typedef cl_int (*queue_kernel_parameters_func_t)(const struct opencl_kernel_info *, _clState *, struct work *, cl_uint);
+
+/* State of one loaded OpenCL kernel; _clState keeps one slot per mining algorithm */
+struct opencl_kernel_info {
+	bool loaded;  // set at the end of a successful opencl_load_kernel; checked before loading on demand
+	cl_program program;  // program created from a cached binary or from source in opencl_load_kernel
+	cl_kernel kernel;  // the "search" kernel created from program
+	bool goffset;  // when set, the build gets -D GOFFSET and scanhash passes the nonce as global work offset
+	enum cl_kernels interface;  // kernel interface reported by opencl_kernel_source for the kernel file
+	size_t wsize;  // work size; used as the local thread count and as the WORKSIZE define
+	queue_kernel_parameters_func_t queue_kernel_parameters;  // argument-setup function chosen from interface
+};
+
+struct _clState {
 	cl_device_id devid;
 	char *platform_ver_str;
 	bool is_mesa;
 	
 	cl_context context;
-	bool kernel_loaded;
-	cl_kernel kernel;
 	cl_command_queue commandQueue;
-	cl_program program;
+	
+	struct opencl_kernel_info kernel_sha256d;
+	struct opencl_kernel_info kernel_scrypt;
+	
 	cl_mem outputBuffer;
 #ifdef USE_SCRYPT
 	cl_mem CLbuffer0;
@@ -30,19 +46,17 @@ typedef struct {
 #endif
 	bool hasBitAlign;
 	bool hasOpenCL11plus;
-	bool goffset;
 	cl_uint preferred_vwidth;
 	cl_uint vwidth;
 	size_t max_work_size;
-	size_t wsize;
 	cl_uint max_compute_units;
-	enum cl_kernels chosen_kernel;
-} _clState;
+};
 
 extern FILE *opencl_open_kernel(const char *filename);
 extern char *file_contents(const char *filename, int *length);
+extern char *opencl_kernel_source(const char *filename, int *out_sourcelen, enum cl_kernels *out_kinterface);
 extern int clDevicesNum(void);
 extern _clState *opencl_create_clState(unsigned int gpu, char *name, size_t nameSize);
-extern bool opencl_load_kernel(struct cgpu_info *, _clState *clState, const char *name);
+extern bool opencl_load_kernel(struct cgpu_info *, _clState *clState, const char *name, struct opencl_kernel_info *, const char *kernel_file, const struct mining_algorithm *);
 #endif /* HAVE_OPENCL */
 #endif /* __OCL_H__ */