Browse Source

Partial restart of threads has proven unsuccessful, so reinstate device re-initialisation to restart GPUs.
Do this with a dedicated reinit thread that is told via a queue which device to restart.
If this thread fails to return from OpenCL code, it should not interrupt the workings of other devices.

Con Kolivas 14 years ago
parent
commit
cfe8534c39
1 changed files with 66 additions and 36 deletions
  1. 66 36
      main.c

+ 66 - 36
main.c

@@ -184,6 +184,8 @@ int longpoll_thr_id;
 static int stage_thr_id;
 static int watchdog_thr_id;
 static int input_thr_id;
+static int gpur_thr_id;
+static int cpur_thr_id;
 static int total_threads;
 
 struct work_restart *work_restart = NULL;
@@ -3340,6 +3342,7 @@ static void restart_longpoll(void)
 
 static void *reinit_cpu(void *userdata)
 {
+	pthread_detach(pthread_self());
 #if 0
 	struct cgpu_info *cgpu = (struct cgpu_info *)userdata;
 	int cpu = cgpu->cpu_gpu;
@@ -3371,35 +3374,52 @@ static void *reinit_cpu(void *userdata)
 }
 
 #ifdef HAVE_OPENCL
+/* We have only one thread that ever re-initialises GPUs. If any GPU init
+ * command fails because the GPU is completely wedged, this thread will never
+ * return, but it is unable to harm other GPUs. If it does return, it means we
+ * only had a soft failure and the reinit_gpu thread is ready to tackle another
+ * GPU. */
 static void *reinit_gpu(void *userdata)
 {
-	struct cgpu_info *cgpu = (struct cgpu_info *)userdata;
-	int gpu = cgpu->cpu_gpu;
+	struct thr_info *mythr = userdata;
+	struct cgpu_info *cgpu;
 	struct thr_info *thr;
 	char name[256];
 	int thr_id;
-	_clState *clState;
+	int gpu;
+
+	pthread_detach(pthread_self());
+
+select_cgpu:
+	cgpu = tq_pop(mythr->q, NULL);
+	if (!cgpu)
+		goto out;
+
+	if (clDevicesNum() != nDevs) {
+		applog(LOG_WARNING, "Hardware not reporting same number of active devices, will not attempt to restart GPU");
+		goto out;
+	}
 
-	/* Send threads message to stop */
+	gpu = cgpu->cpu_gpu;
 	gpu_devices[gpu] = false;
-	sleep(5);
 
 	for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
 		if (dev_from_id(thr_id) != gpu)
 			continue;
 
-		clState = clStates[thr_id];
-		/* Send it a command. If it responds we can restart */
-		applog(LOG_WARNING, "Attempting to send GPU command");
-		clFlush(clState->commandQueue);
-		clFinish(clState->commandQueue);
-
 		thr = &thr_info[thr_id];
 		thr->rolling = thr->cgpu->rolling = 0;
 		if (!pthread_cancel(*thr->pth)) {
-			applog(LOG_WARNING, "Thread still exists, killing it off");
+			applog(LOG_WARNING, "Thread %d still exists, killing it off", thr_id);
 		} else
-			applog(LOG_WARNING, "Thread no longer exists");
+			applog(LOG_WARNING, "Thread %d no longer exists", thr_id);
+	}
+
+	for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
+		if (dev_from_id(thr_id) != gpu)
+			continue;
+
+		thr = &thr_info[thr_id];
 
 		/* Lose this ram cause we may get stuck here! */
 		//tq_freeze(thr->q);
@@ -3410,13 +3430,12 @@ static void *reinit_gpu(void *userdata)
 
 		/* Lose this ram cause we may dereference in the dying thread! */
 		//free(clState);
-		applog(LOG_WARNING, "Command successful, attempting to reinit device");
 
 		applog(LOG_INFO, "Reinit GPU thread %d", thr_id);
-		clState = initCl(gpu, name, sizeof(name));
-		if (!clState) {
+		clStates[thr_id] = initCl(gpu, name, sizeof(name));
+		if (!clStates[thr_id]) {
 			applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id);
-			return NULL;
+			goto out;
 		}
 		applog(LOG_INFO, "initCl() finished. Found %s", name);
 
@@ -3427,14 +3446,17 @@ static void *reinit_gpu(void *userdata)
 		applog(LOG_WARNING, "Thread %d restarted", thr_id);
 	}
 
-	/* Try to re-enable it */
 	gpu_devices[gpu] = true;
 	for (thr_id = 0; thr_id < gpu_threads; thr_id ++) {
+		if (dev_from_id(thr_id) != gpu)
+			continue;
+
 		thr = &thr_info[thr_id];
-		if (dev_from_id(thr_id) == gpu)
-			tq_push(thr->q, &ping);
+		tq_push(thr->q, &ping);
 	}
 
+	goto select_cgpu;
+out:
 	return NULL;
 }
 #else
@@ -3445,16 +3467,10 @@ static void *reinit_gpu(void *userdata)
 
 static void reinit_device(struct cgpu_info *cgpu)
 {
-	pthread_t resus_thread;
-	void *reinit;
-
 	if (cgpu->is_gpu)
-		reinit = reinit_gpu;
+		tq_push(thr_info[gpur_thr_id].q, cgpu);
 	else
-		reinit = reinit_cpu;
-
-	if (unlikely(pthread_create(&resus_thread, NULL, reinit, (void *)cgpu)))
-		applog(LOG_ERR, "Failed to create reinit thread");
+		tq_push(thr_info[cpur_thr_id].q, cgpu);
 }
 
 /* Determine which are the first threads belonging to a device and if they're
@@ -3811,7 +3827,7 @@ static void fork_monitor()
 
 int main (int argc, char *argv[])
 {
-	unsigned int i, j = 0, x, y, pools_active = 0;
+	unsigned int i, x, y, pools_active = 0;
 	struct sigaction handler;
 	struct thr_info *thr;
 	char name[256];
@@ -3969,7 +3985,7 @@ int main (int argc, char *argv[])
 
 	mining_threads = opt_n_threads + gpu_threads;
 
-	total_threads = mining_threads + 5;
+	total_threads = mining_threads + 7;
 	work_restart = calloc(total_threads, sizeof(*work_restart));
 	if (!work_restart)
 		quit(1, "Failed to calloc work_restart");
@@ -4050,11 +4066,9 @@ int main (int argc, char *argv[])
 		quit(0, "No pools active! Exiting.");
 
 #ifdef HAVE_OPENCL
-	i = 0;
-
 	/* start GPU mining threads */
-	for (j = 0; j < nDevs * opt_g_threads; j++) {
-		int gpu = j % nDevs;
+	for (i = 0; i < nDevs * opt_g_threads; i++) {
+		int gpu = i % nDevs;
 
 		gpus[gpu].is_gpu = 1;
 		gpus[gpu].cpu_gpu = gpu;
@@ -4087,8 +4101,6 @@ int main (int argc, char *argv[])
 
 		if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr)))
 			quit(1, "thread %d create failed", i);
-
-		i++;
 	}
 
 	applog(LOG_INFO, "%d gpu miner threads started", gpu_threads);
@@ -4134,6 +4146,24 @@ int main (int argc, char *argv[])
 		quit(1, "input thread create failed");
 	pthread_detach(*thr->pth);
 
+	/* Create reinit cpu thread */
+	cpur_thr_id = mining_threads + 5;
+	thr = &thr_info[cpur_thr_id];
+	thr->q = tq_new();
+	if (!thr->q)
+		quit(1, "tq_new failed for cpur_thr_id");
+	if (thr_info_create(thr, NULL, reinit_cpu, thr))
+		quit(1, "reinit_cpu thread create failed");
+
+	/* Create reinit gpu thread */
+	gpur_thr_id = mining_threads + 6;
+	thr = &thr_info[gpur_thr_id];
+	thr->q = tq_new();
+	if (!thr->q)
+		quit(1, "tq_new failed for gpur_thr_id");
+	if (thr_info_create(thr, NULL, reinit_gpu, thr))
+		quit(1, "reinit_gpu thread create failed");
+
 	/* main loop - simply wait for workio thread to exit */
 	pthread_join(*thr_info[work_thr_id].pth, NULL);
 	applog(LOG_INFO, "workio thread dead, exiting.");