Browse Source

Make the thread restart do a pthread_join after disabling the device, only re-enabling it if we succeed in restarting the thread.
Do this from a separate thread so as to not block any other code.
This will allow cgminer to continue even if one GPU hangs.

Con Kolivas 14 years ago
parent
commit
b3d20b573c
1 changed file with 40 additions and 35 deletions
  1. 40 35
      main.c

+ 40 - 35
main.c

@@ -2928,79 +2928,85 @@ static void restart_longpoll(void)
 		start_longpoll();
 		start_longpoll();
 }
 }
 
 
-static void reinit_cputhread(int thr_id)
+static void *reinit_cputhread(void *userdata)
 {
 {
+	long thr_id = (long)userdata;
 	struct thr_info *thr = &thr_info[thr_id];
 	struct thr_info *thr = &thr_info[thr_id];
 
 
 	tq_freeze(thr->q);
 	tq_freeze(thr->q);
-	if (!(pthread_cancel(*thr->pth)) && pthread_join(*thr->pth, NULL)) {
-		applog(LOG_ERR, "Failed to pthread_join in reinit_cputhread");
-		goto failed_out;
-	}
+	if (!pthread_cancel(*thr->pth))
+		pthread_join(*thr->pth, NULL);
 
 
 	applog(LOG_INFO, "Reinit CPU thread %d", thr_id);
 	applog(LOG_INFO, "Reinit CPU thread %d", thr_id);
-	tq_thaw(thr->q);
-
-	thread_reportin(thr);
 
 
 	if (unlikely(thr_info_create(thr, NULL, miner_thread, thr))) {
 	if (unlikely(thr_info_create(thr, NULL, miner_thread, thr))) {
 		applog(LOG_ERR, "thread %d create failed", thr_id);
 		applog(LOG_ERR, "thread %d create failed", thr_id);
-		goto failed_out;
+		return NULL;
 	}
 	}
-	return;
 
 
-failed_out:
-	kill_work();
+	applog(LOG_WARNING, "Thread %d restarted", thr_id);
+	thread_reportin(thr);
+	tq_thaw(thr->q);
+	return NULL;
 }
 }
 
 
 #ifdef HAVE_OPENCL
 #ifdef HAVE_OPENCL
-static void reinit_gputhread(int thr_id)
+static void *reinit_gputhread(void *userdata)
 {
 {
+	long thr_id = (long)userdata;
 	int gpu = dev_from_id(thr_id);
 	int gpu = dev_from_id(thr_id);
 	struct thr_info *thr = &thr_info[thr_id];
 	struct thr_info *thr = &thr_info[thr_id];
 	char name[256];
 	char name[256];
 
 
 	tq_freeze(thr->q);
 	tq_freeze(thr->q);
-	if (!(pthread_cancel(*thr->pth)) && pthread_join(*thr->pth, NULL)) {
-		applog(LOG_ERR, "Failed to pthread_join in reinit_gputhread");
-		goto failed_out;
-	}
+	/* Disable the GPU device in case the pthread never joins, hung in GPU
+	 * space */
+	gpu_devices[dev_from_id(thr_id)] = false;
+	if (!pthread_cancel(*thr->pth))
+		pthread_join(*thr->pth, NULL);
 	free(clStates[thr_id]);
 	free(clStates[thr_id]);
 
 
 	applog(LOG_INFO, "Reinit GPU thread %d", thr_id);
 	applog(LOG_INFO, "Reinit GPU thread %d", thr_id);
-	tq_thaw(thr->q);
 	clStates[thr_id] = initCl(gpu, name, sizeof(name));
 	clStates[thr_id] = initCl(gpu, name, sizeof(name));
 	if (!clStates[thr_id]) {
 	if (!clStates[thr_id]) {
 		applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id);
 		applog(LOG_ERR, "Failed to reinit GPU thread %d", thr_id);
-		goto failed_out;
+		return NULL;
 	}
 	}
 	applog(LOG_INFO, "initCl() finished. Found %s", name);
 	applog(LOG_INFO, "initCl() finished. Found %s", name);
 
 
-	thread_reportin(thr);
-
 	if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) {
 	if (unlikely(thr_info_create(thr, NULL, gpuminer_thread, thr))) {
 		applog(LOG_ERR, "thread %d create failed", thr_id);
 		applog(LOG_ERR, "thread %d create failed", thr_id);
-		goto failed_out;
+		return NULL;
 	}
 	}
-	return;
 
 
-failed_out:
-	kill_work();
+	/* Re-enable the device only if we succeeded in creating a thread
+	 * for it */
+	applog(LOG_WARNING, "Thread %d restarted", thr_id);
+	thread_reportin(thr);
+	tq_thaw(thr->q);
+	gpu_devices[dev_from_id(thr_id)] = true;
+
+	return NULL;
+}
+#else
+static void *reinit_gputhread(void *userdata)
+{
 }
 }
+#endif
 
 
-static void reinit_thread(int thr_id)
+static void reinit_thread(long thr_id)
 {
 {
+	pthread_t resus_thread;
+	void *reinit;
+
 	if (thr_id < gpu_threads)
 	if (thr_id < gpu_threads)
-		reinit_gputhread(thr_id);
+		reinit = reinit_gputhread;
 	else
 	else
-		reinit_cputhread(thr_id);
-}
-#else /* HAVE_OPENCL */
-static void reinit_thread(int thr_id)
-{
-	reinit_cputhread(thr_id);
+		reinit = reinit_cputhread;
+
+	if (unlikely(pthread_create(&resus_thread, NULL, reinit, (void *)thr_id)))
+		applog(LOG_ERR, "Failed to create reinit thread");
 }
 }
-#endif
 
 
 /* Determine which are the first threads belonging to a device and if they're
 /* Determine which are the first threads belonging to a device and if they're
  * active */
  * active */
@@ -3095,7 +3101,6 @@ static void *watchdog_thread(void *userdata)
 					break;
 					break;
 				}
 				}
 				reinit_thread(i);
 				reinit_thread(i);
-				applog(LOG_WARNING, "Thread %d restarted", i);
 			}
 			}
 		}
 		}
 	}
 	}