Browse Source

opencl: Major refactor, splitting up opencl_load_kernel into many new functions

Luke Dashjr 11 years ago
parent
commit
74d8924323
1 changed files with 317 additions and 262 deletions
  1. 317 262
      ocl.c

+ 317 - 262
ocl.c

@@ -704,6 +704,286 @@ err2:
 	return clState;
 }
 
+/* Try to load a previously saved kernel binary from binaryfilename into b,
+ * then create and build kernelinfo->program from it.
+ * Returns true only if the whole file was read and the program was created
+ * and built; on failure b may already have been resized/filled. */
+static
+bool opencl_load_kernel_binary(struct cgpu_info * const cgpu, _clState * const clState, struct opencl_kernel_info * const kernelinfo, const char * const binaryfilename, bytes_t * const b)
+{
+	FILE * const fh = fopen(binaryfilename, "rb");
+	if (!fh)
+		return false;
+	
+	// Read the complete file into b; every outcome reaches the single fclose below
+	struct stat st;
+	size_t binsz = 0;
+	bool image_read = false;
+	if (unlikely(stat(binaryfilename, &st)))
+		applog(LOG_DEBUG, "Unable to stat binary, generating from source");
+	else
+	if (st.st_size)
+	{
+		binsz = st.st_size;
+		bytes_resize(b, binsz);
+		image_read = (fread(bytes_buf(b), 1, binsz, fh) == binsz);
+		if (!image_read)
+			applog(LOG_ERR, "Unable to fread binaries");
+	}
+	fclose(fh);
+	if (!image_read)
+		return false;
+	
+	// Hand the image to OpenCL and build it (no compiler options for binaries)
+	cl_int status;
+	kernelinfo->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binsz, (void*)&bytes_buf(b), &status, NULL);
+	if (status != CL_SUCCESS)
+	{
+		applogr(false, LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
+	}
+	
+	if (bfg_clBuildProgram(&kernelinfo->program, clState->devid, NULL) != CL_SUCCESS)
+		return false;
+	
+	applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);
+	return true;
+}
+
+/* Decide whether the kernel for this device should be BFI_INT patched.
+ * All of the following must hold: the device has bitalign support, its name
+ * contains one of the listed GPU names, the platform version string is
+ * ATI-Stream or AMD-APP with a build number below 1085, the kernel interface
+ * is one of the BFI_INT-using ones, and binaries are both loaded and saved.
+ * Always false when built without USE_SHA256D. */
+static
+bool opencl_should_patch_bfi_int(struct cgpu_info * const cgpu, _clState * const clState, struct opencl_kernel_info * const kernelinfo)
+{
+#ifdef USE_SHA256D
+	static const char * const bfi_int_gpus[] = {
+		"Cedar",
+		"Redwood",
+		"Juniper",
+		"Cypress",
+		"Hemlock",
+		"Caicos",
+		"Turks",
+		"Barts",
+		"Cayman",
+		"Antilles",
+		"Wrestler",
+		"Zacate",
+		"WinterPark",
+	};
+	struct opencl_device_data * const data = cgpu->device_data;
+	const char * const platver = clState->platform_ver_str;
+	
+	if (!clState->hasBitAlign)
+		return false;
+	
+	bool known_gpu = false;
+	for (size_t i = 0; i < sizeof(bfi_int_gpus) / sizeof(bfi_int_gpus[0]); ++i)
+		if (strstr(cgpu->name, bfi_int_gpus[i]))
+		{
+			known_gpu = true;
+			break;
+		}
+	if (!known_gpu)
+		return false;
+	
+	// BFI_INT patching only works with AMD-APP up to 1084
+	if (!strstr(platver, "ATI-Stream"))
+	{
+		// The build number is parsed from right after the first '(' following "AMD-APP"
+		const char *p = strstr(platver, "AMD-APP");
+		if (p)
+			p = strchr(p, '(');
+		if (!p || atoi(&p[1]) >= 1085)
+			return false;
+	}
+	
+	// Only these kernels actually use BFI_INT hacking; anything else has never needed it
+	const bool uses_bfi_int =
+		kernelinfo->interface == KL_DIABLO  ||
+		kernelinfo->interface == KL_DIAKGCN ||
+		kernelinfo->interface == KL_PHATK   ||
+		kernelinfo->interface == KL_POCLBM;
+	if (!uses_bfi_int)
+		return false;
+	
+	if (data->opt_opencl_binaries != OBU_LOADSAVE)
+	{
+		applogr(false, LOG_WARNING, "BFI_INT patch requiring device found, but OpenCL binary usage disabled; cannot BFI_INT patch");
+	}
+	
+	applog(LOG_DEBUG, "BFI_INT patch requiring device found, will patch source with BFI_INT");
+	return true;
+#else
+	return false;
+#endif
+}
+
+/* Create kernelinfo->program from `source` (source_len bytes) and build it for
+ * clState->devid with the -D options derived from the kernel/device settings.
+ * patchbfi adds -D BFI_INT (only when built with USE_SHA256D).
+ * Returns true if the program was created and built successfully. */
+static
+bool opencl_build_kernel(struct cgpu_info * const cgpu, _clState * const clState, struct opencl_kernel_info * const kernelinfo, const char *source, const size_t source_len, const bool patchbfi)
+{
+#ifdef USE_SCRYPT
+	// Only the scrypt options read per-device data
+	struct opencl_device_data * const data = cgpu->device_data;
+#endif
+	cl_int status;
+	
+	kernelinfo->program = clCreateProgramWithSource(clState->context, 1, &source, &source_len, &status);
+	if (status != CL_SUCCESS)
+		applogr(false, LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
+
+	/* create a cl program executable for all the devices specified */
+	// Fixed-size stack buffer: no allocation that could fail and nothing to free.
+	// The longest possible option string (three ints plus all optional flags)
+	// stays comfortably below 256 bytes, so the strcat calls below cannot overflow.
+	char CompilerOptions[256];
+
+#ifdef USE_SCRYPT
+	if (kernelinfo->interface == KL_SCRYPT)
+		snprintf(CompilerOptions, sizeof(CompilerOptions), "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
+			data->lookup_gap, (unsigned int)data->thread_concurrency, (int)kernelinfo->wsize);
+	else
+#endif
+	{
+		snprintf(CompilerOptions, sizeof(CompilerOptions), "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
+			(int)kernelinfo->wsize, clState->vwidth, (int)kernelinfo->wsize * clState->vwidth);
+	}
+	applog(LOG_DEBUG, "Setting worksize to %"PRId64, (int64_t)kernelinfo->wsize);
+	if (clState->vwidth > 1)
+		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
+
+	if (clState->hasBitAlign)
+	{
+		strcat(CompilerOptions, " -D BITALIGN");
+		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
+	}
+	else
+		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");
+
+#ifdef USE_SHA256D
+	if (patchbfi)
+		strcat(CompilerOptions, " -D BFI_INT");
+#endif
+
+	if (kernelinfo->goffset)
+		strcat(CompilerOptions, " -D GOFFSET");
+
+	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
+	status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, CompilerOptions);
+
+	return (status == CL_SUCCESS);
+}
+
+/* Copy the compiled device binary of kernelinfo->program into b.
+ * The program reports one binary slot per associated device; the first slot
+ * with a non-zero size is the one copied.
+ * Returns true on success. On failure an error is logged and false is
+ * returned. Temporary buffers are released on every path. */
+static
+bool opencl_get_kernel_binary(struct cgpu_info * const cgpu, _clState * const clState, struct opencl_kernel_info * const kernelinfo, bytes_t * const b)
+{
+	cl_int status;
+	cl_uint slot, cpnd = 0;
+	size_t *binary_sizes = NULL;
+	uint8_t **binaries = NULL;
+	bool rv = false;
+	
+	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
+	if (unlikely(status != CL_SUCCESS))
+		applogr(false, LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
+	if (unlikely(!cpnd))
+		applogr(false, LOG_ERR, "%s: OpenCL program reports no devices, cannot get binary", cgpu->dev_repr);
+
+	// Heap arrays rather than a VLA; calloc zero-fills, so the cleanup code can
+	// unconditionally free() every slot even if we bail out half-way.
+	binary_sizes = calloc(cpnd, sizeof(*binary_sizes));
+	binaries = calloc(cpnd, sizeof(*binaries));
+	if (unlikely(!(binary_sizes && binaries)))
+	{
+		applog(LOG_ERR, "%s: Failed to allocate memory for kernel binaries", cgpu->dev_repr);
+		goto out;
+	}
+	
+	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_BINARY_SIZES, sizeof(*binary_sizes) * cpnd, binary_sizes, NULL);
+	if (unlikely(status != CL_SUCCESS))
+	{
+		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
+		goto out;
+	}
+	
+	for (slot = 0; slot < cpnd; ++slot)
+	{
+		binaries[slot] = malloc(binary_sizes[slot] + 1);
+		if (unlikely(!binaries[slot]))
+		{
+			applog(LOG_ERR, "%s: Failed to allocate memory for kernel binaries", cgpu->dev_repr);
+			goto out;
+		}
+	}
+
+	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
+	 * to iterate over all the binary slots and find where the real program
+	 * is. What the heck is this!? */
+	for (slot = 0; slot < cpnd; slot++)
+		if (binary_sizes[slot])
+			break;
+
+	// slot == cpnd means every slot was empty; do not index past the arrays
+	if (slot >= cpnd)
+	{
+		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
+		goto out;
+	}
+	applog(LOG_DEBUG, "%s: Binary size found in binary slot %u: %"PRId64, cgpu->dev_repr, (unsigned)slot, (int64_t)binary_sizes[slot]);
+
+	/* copy over all of the generated binaries. */
+	// The size passed must cover the whole pointer array (one pointer per device)
+	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_BINARIES, sizeof(*binaries) * cpnd, binaries, NULL);
+	if (unlikely(status != CL_SUCCESS))
+	{
+		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
+		goto out;
+	}
+	
+	bytes_resize(b, binary_sizes[slot]);
+	memcpy(bytes_buf(b), binaries[slot], bytes_len(b));
+	rv = true;
+
+out:
+	if (binaries)
+		for (slot = 0; slot < cpnd; ++slot)
+			free(binaries[slot]);
+	free(binaries);
+	free(binary_sizes);
+	
+	return rv;
+}
+
+#ifdef USE_SHA256D
+	/* Patch the kernel if the hardware supports BFI_INT but it needs to
+	 * be hacked in */
+/* Patch the compiled kernel image held in b, in place, via patch_opcodes().
+ * Returns false (leaving the image unpatched) if the expected ".text"/"ELF"
+ * markers are missing or the image is too short for the fields read here. */
+static
+bool opencl_patch_kernel_binary(bytes_t * const b)
+{
+	// Offsets, relative to the located ".text" marker, of the two 4-byte fields
+	enum {
+		TEXT_START_OFFSET  = 285,
+		TEXT_LENGTH_OFFSET = 289,
+		TEXT_FIELD_SIZE    = 4,
+	};
+	char * const base = (void*)bytes_buf(b);
+	unsigned remaining = bytes_len(b);
+	char *w = base;
+	unsigned int start, length;
+
+	/* Find 2nd incidence of .text, and copy the program's
+	* position and length at a fixed offset from that. Then go
+	* back and find the 2nd incidence of \x7fELF (rewind by one
+	* from ELF) and then patch the opcodes */
+	// NOTE(review): assumes advance() leaves w/remaining untouched when the
+	// marker is not found, as the undo steps below already rely on
+	if (!advance(&w, &remaining, ".text"))
+		return false;
+	w++; remaining--;
+	if (!advance(&w, &remaining, ".text")) {
+		/* 32 bit builds only one ELF */
+		w--; remaining++;
+	}
+	// Refuse truncated images instead of reading past the end of the buffer
+	if (remaining < (unsigned)(TEXT_LENGTH_OFFSET + TEXT_FIELD_SIZE))
+		return false;
+	memcpy(&start, w + TEXT_START_OFFSET, TEXT_FIELD_SIZE);
+	memcpy(&length, w + TEXT_LENGTH_OFFSET, TEXT_FIELD_SIZE);
+	w = base;
+	remaining = bytes_len(b);
+	if (!advance(&w, &remaining, "ELF"))
+		return false;
+	w++; remaining--;
+	if (!advance(&w, &remaining, "ELF")) {
+		/* 32 bit builds only one ELF */
+		w--; remaining++;
+	}
+	// Cannot step back to the byte preceding "ELF" if it sits at the very start
+	if (w == base)
+		return false;
+	w--; remaining++;
+	// A start offset beyond the image would underflow remaining and move w out of the buffer
+	if (start > remaining)
+		return false;
+	w += start; remaining -= start;
+	applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining);
+	// NOTE(review): length is passed through unvalidated; confirm against
+	// patch_opcodes() whether it should be bounded by remaining
+	patch_opcodes(w, length);
+	return true;
+}
+
+/* Release the current kernelinfo->program, then recreate and rebuild it from
+ * the binary image held in b.
+ * Returns true only if release, creation and build all report CL_SUCCESS. */
+static
+bool opencl_replace_binary_kernel(struct cgpu_info * const cgpu, _clState * const clState, struct opencl_kernel_info * const kernelinfo, bytes_t * const b)
+{
+	cl_int status = clReleaseProgram(kernelinfo->program);
+	if (status != CL_SUCCESS)
+	{
+		applogr(false, LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
+	}
+	
+	const size_t image_len = bytes_len(b);
+	kernelinfo->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &image_len, (void*)&bytes_buf(b), &status, NULL);
+	if (status != CL_SUCCESS)
+	{
+		applogr(false, LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
+	}
+	
+	// Binaries are built without compiler options
+	return (bfg_clBuildProgram(&kernelinfo->program, clState->devid, NULL) == CL_SUCCESS);
+}
+#endif
+
+/* Save the binary image in b to binaryfilename so it can be loaded next time.
+ * Returns true only if every byte was written and the file closed cleanly.
+ * On a write or close failure the incomplete file is removed, so a later load
+ * does not pick up a truncated image. */
+static
+bool opencl_save_kernel_binary(const char * const binaryfilename, bytes_t * const b)
+{
+	FILE * const binaryfile = fopen(binaryfilename, "wb");
+	if (!binaryfile)
+		return false;
+	
+	// FIXME: Still not atomic; better to write and move-replace (but unlink before replacing for Windows)
+	const size_t binsz = bytes_len(b);
+	bool ok = (fwrite(bytes_buf(b), 1, binsz, binaryfile) == binsz);
+	
+	// fclose flushes buffered data, so a failure here also means a bad file
+	if (fclose(binaryfile))
+		ok = false;
+	
+	if (!ok)
+		remove(binaryfilename);
+	
+	return ok;
+}
+
 static
 bool opencl_test_goffset(_clState * const clState)
 {
@@ -753,13 +1033,8 @@ fail2:
 bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState, const char * const name, struct opencl_kernel_info * const kernelinfo, const char * const kernel_file, __maybe_unused const struct mining_algorithm * const malgo)
 {
 	const int gpu = cgpu->device_id;
-#ifdef USE_SHA256D
-	bool patchbfi = false;
-#endif
-	bool prog_built = false;
 	struct opencl_device_data * const data = cgpu->device_data;
 	const char * const vbuff = clState->platform_ver_str;
-	char *s;
 	cl_int status;
 	
 	/* Create binary filename based on parameters passed to opencl
@@ -884,25 +1159,6 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 	}
 #endif
 
-	FILE *binaryfile;
-	size_t *binary_sizes;
-	char **binaries;
-	size_t sourceSize[] = {(size_t)pl};
-	cl_uint slot, cpnd;
-
-	slot = cpnd = 0;
-
-	binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1);
-	if (unlikely(!binary_sizes)) {
-		applog(LOG_ERR, "Unable to calloc binary_sizes");
-		return false;
-	}
-	binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1);
-	if (unlikely(!binaries)) {
-		applog(LOG_ERR, "Unable to calloc binaries");
-		return false;
-	}
-
 	strcat(binaryfilename, name);
 	if (kernelinfo->goffset)
 		strcat(binaryfilename, "g");
@@ -928,265 +1184,64 @@ bool opencl_load_kernel(struct cgpu_info * const cgpu, _clState * const clState,
 	applog(LOG_DEBUG, "OCL%2u: Configured OpenCL kernel name: %s", gpu, binaryfilename);
 	strcat(binaryfilename, ".bin");
 	
-	if (!(data->opt_opencl_binaries & OBU_LOAD))
-		goto build;
-
-	binaryfile = fopen(binaryfilename, "rb");
-	if (!binaryfile) {
-		applog(LOG_DEBUG, "No binary found, generating from source");
-	} else {
-		struct stat binary_stat;
-
-		if (unlikely(stat(binaryfilename, &binary_stat))) {
-			applog(LOG_DEBUG, "Unable to stat binary, generating from source");
-			fclose(binaryfile);
-			goto build;
-		}
-		if (!binary_stat.st_size)
-			goto build;
-
-		binary_sizes[slot] = binary_stat.st_size;
-		binaries[slot] = (char *)calloc(binary_sizes[slot], 1);
-		if (unlikely(!binaries[slot])) {
-			applog(LOG_ERR, "Unable to calloc binaries");
-			fclose(binaryfile);
-			return false;
-		}
-
-		if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) {
-			applog(LOG_ERR, "Unable to fread binaries");
-			fclose(binaryfile);
-			free(binaries[slot]);
-			goto build;
-		}
-
-		kernelinfo->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL);
-		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
-			fclose(binaryfile);
-			free(binaries[slot]);
+	bool patchbfi = opencl_should_patch_bfi_int(cgpu, clState, kernelinfo);
+	
+	bytes_t binary_bytes = BYTES_INIT;
+	if (data->opt_opencl_binaries & OBU_LOAD)
+	{
+		if (!opencl_load_kernel_binary(cgpu, clState, kernelinfo, binaryfilename, &binary_bytes))
+		{
+			bytes_free(&binary_bytes);
+			applog(LOG_DEBUG, "No usable binary found, generating from source");
 			goto build;
 		}
-
-		fclose(binaryfile);
-		applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename);
-
-		goto built;
-	}
-
-	/////////////////////////////////////////////////////////////////
-	// Load CL file, build CL program object, create CL kernel object
-	/////////////////////////////////////////////////////////////////
-
-build:
-	kernelinfo->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status);
-	if (status != CL_SUCCESS) {
-		applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status);
-		return false;
 	}
-
-	/* create a cl program executable for all the devices specified */
-	char *CompilerOptions = calloc(1, 256);
-
-#ifdef USE_SCRYPT
-	if (kernelinfo->interface == KL_SCRYPT)
-		sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d",
-			data->lookup_gap, (unsigned int)data->thread_concurrency, (int)kernelinfo->wsize);
 	else
-#endif
 	{
-		sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d",
-			(int)kernelinfo->wsize, clState->vwidth, (int)kernelinfo->wsize * clState->vwidth);
-	}
-	applog(LOG_DEBUG, "Setting worksize to %"PRId64, (int64_t)kernelinfo->wsize);
-	if (clState->vwidth > 1)
-		applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth);
-
-	if (clState->hasBitAlign) {
-		strcat(CompilerOptions, " -D BITALIGN");
-		applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN");
-#ifdef USE_SHA256D
-		if (strstr(name, "Cedar") ||
-		    strstr(name, "Redwood") ||
-		    strstr(name, "Juniper") ||
-		    strstr(name, "Cypress" ) ||
-		    strstr(name, "Hemlock" ) ||
-		    strstr(name, "Caicos" ) ||
-		    strstr(name, "Turks" ) ||
-		    strstr(name, "Barts" ) ||
-		    strstr(name, "Cayman" ) ||
-		    strstr(name, "Antilles" ) ||
-		    strstr(name, "Wrestler" ) ||
-		    strstr(name, "Zacate" ) ||
-		    strstr(name, "WinterPark" ))
+build:
+		if (!opencl_build_kernel(cgpu, clState, kernelinfo, source, pl, patchbfi))
 		{
-			// BFI_INT patching only works with AMD-APP up to 1084
-			if (strstr(vbuff, "ATI-Stream"))
-				patchbfi = true;
-			else
-			if ((s = strstr(vbuff, "AMD-APP")) && (s = strchr(s, '(')) && atoi(&s[1]) < 1085)
-				patchbfi = true;
+			free(source);
+			return false;
 		}
-#endif
-	} else
-		applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN");
-
-#ifdef USE_SHA256D
-	switch (kernelinfo->interface)
-	{
-		case KL_DIABLO: case KL_DIAKGCN: case KL_PHATK: case KL_POCLBM:
-			// Okay, these actually use BFI_INT hacking
-			break;
-		default:
-			// Anything else has never needed it
-			patchbfi = false;
-			break;
 	}
-	if (patchbfi) {
-		if (data->opt_opencl_binaries == OBU_LOADSAVE)
+	
+	if ((patchbfi || (data->opt_opencl_binaries & OBU_SAVE)) && !bytes_len(&binary_bytes))
+	{
+		if (!opencl_get_kernel_binary(cgpu, clState, kernelinfo, &binary_bytes))
 		{
-			strcat(CompilerOptions, " -D BFI_INT");
-			applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT");
+			bytes_free(&binary_bytes);
+			applog(LOG_DEBUG, "%s: Failed to get compiled kernel binary from OpenCL (cannot save it)", cgpu->dev_repr);
+			// NOTE: empty binary_bytes will fail BFI_INT patch on its own
 		}
-		else
-		{
-			patchbfi = false;
-			applog(LOG_WARNING, "BFI_INT patch requiring device found, but OpenCL binary usage disabled; cannot BFI_INT patch");
-		}
-	} else
-		applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch");
-#endif
-
-	if (kernelinfo->goffset)
-		strcat(CompilerOptions, " -D GOFFSET");
-
-	applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions);
-	status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, CompilerOptions);
-	free(CompilerOptions);
-
-	if (status != CL_SUCCESS)
-		return false;
-
-	prog_built = true;
-	
-	if (!(data->opt_opencl_binaries & OBU_SAVE))
-		goto built;
-
-	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL);
-	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status);
-		return false;
-	}
-
-	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL);
-	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status);
-		return false;
 	}
-
-	/* The actual compiled binary ends up in a RANDOM slot! Grr, so we have
-	 * to iterate over all the binary slots and find where the real program
-	 * is. What the heck is this!? */
-	for (slot = 0; slot < cpnd; slot++)
-		if (binary_sizes[slot])
-			break;
-
-	/* copy over all of the generated binaries. */
-	applog(LOG_DEBUG, "Binary size for gpu %u found in binary slot %u: %"PRId64,
-	       gpu, (unsigned)slot, (int64_t)binary_sizes[slot]);
-	if (!binary_sizes[slot]) {
-		applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!");
-		return false;
-	}
-	binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1);
-	status = clGetProgramInfo(kernelinfo->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL );
-	if (unlikely(status != CL_SUCCESS)) {
-		applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status);
-		return false;
-	}
-
+	
 #ifdef USE_SHA256D
-	/* Patch the kernel if the hardware supports BFI_INT but it needs to
-	 * be hacked in */
-	if (patchbfi) {
-		unsigned remaining = binary_sizes[slot];
-		char *w = binaries[slot];
-		unsigned int start, length;
-
-		/* Find 2nd incidence of .text, and copy the program's
-		* position and length at a fixed offset from that. Then go
-		* back and find the 2nd incidence of \x7ELF (rewind by one
-		* from ELF) and then patch the opcocdes */
-		if (!advance(&w, &remaining, ".text"))
-			goto build;
-		w++; remaining--;
-		if (!advance(&w, &remaining, ".text")) {
-			/* 32 bit builds only one ELF */
-			w--; remaining++;
-		}
-		memcpy(&start, w + 285, 4);
-		memcpy(&length, w + 289, 4);
-		w = binaries[slot]; remaining = binary_sizes[slot];
-		if (!advance(&w, &remaining, "ELF"))
+	if (patchbfi)
+	{
+		if (!(opencl_patch_kernel_binary(&binary_bytes)) && opencl_replace_binary_kernel(cgpu, clState, kernelinfo, &binary_bytes))
+		{
+			// Rebuild without BFI_INT
+			patchbfi = false;
+			bytes_free(&binary_bytes);
 			goto build;
-		w++; remaining--;
-		if (!advance(&w, &remaining, "ELF")) {
-			/* 32 bit builds only one ELF */
-			w--; remaining++;
-		}
-		w--; remaining++;
-		w += start; remaining -= start;
-		applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching",
-			w, remaining);
-		patch_opcodes(w, length);
-
-		status = clReleaseProgram(kernelinfo->program);
-		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status);
-			return false;
-		}
-
-		kernelinfo->program = clCreateProgramWithBinary(clState->context, 1, &clState->devid, &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL);
-		if (status != CL_SUCCESS) {
-			applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status);
-			return false;
 		}
-
-		/* Program needs to be rebuilt */
-		prog_built = false;
 	}
 #endif
-
+	
 	free(source);
-
-	/* Save the binary to be loaded next time */
-	binaryfile = fopen(binaryfilename, "wb");
-	if (!binaryfile) {
-		/* Not a fatal problem, just means we build it again next time */
-		applog(LOG_DEBUG, "Unable to create file %s", binaryfilename);
-	} else {
-		if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) {
-			applog(LOG_ERR, "Unable to fwrite to binaryfile");
-			return false;
-		}
-		fclose(binaryfile);
+	
+	if ((data->opt_opencl_binaries & OBU_SAVE) && bytes_len(&binary_bytes))
+	{
+		if (!opencl_save_kernel_binary(binaryfilename, &binary_bytes))
+			applog(LOG_DEBUG, "Unable to save file %s", binaryfilename);
 	}
-built:
-	if (binaries[slot])
-		free(binaries[slot]);
-	free(binaries);
-	free(binary_sizes);
-
+	
+	bytes_free(&binary_bytes);
+	
 	applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %"PRId64" vectors and worksize %"PRIu64,
 	       filename, clState->hasBitAlign ? "" : "out", (int64_t)clState->vwidth, (uint64_t)kernelinfo->wsize);
 
-	if (!prog_built) {
-		/* create a cl program executable for all the devices specified */
-		status = bfg_clBuildProgram(&kernelinfo->program, clState->devid, NULL);
-		if (status != CL_SUCCESS)
-			return false;
-	}
-
 	/* get a kernel object handle for a kernel with the given name */
 	kernelinfo->kernel = clCreateKernel(kernelinfo->program, "search", &status);
 	if (status != CL_SUCCESS) {