Browse Source

Use the ADL activity report to tell whether a sick GPU is still busy, which suggests it is hard hung; if so, do not attempt to restart it.

Con Kolivas 14 years ago
parent
commit
27b05db4a5
1 changed file with 6 additions and 0 deletions
  1. 6 0
      main.c

+ 6 - 0
main.c

@@ -5017,6 +5017,12 @@ static void *watchdog_thread(void *userdata)
 				gpus[gpu].status = LIFE_SICK;
 				gpus[gpu].status = LIFE_SICK;
 				applog(LOG_ERR, "Thread %d idle for more than 60 seconds, GPU %d declared SICK!", i, gpu);
 				applog(LOG_ERR, "Thread %d idle for more than 60 seconds, GPU %d declared SICK!", i, gpu);
 				gettimeofday(&thr->sick, NULL);
 				gettimeofday(&thr->sick, NULL);
+#ifdef HAVE_ADL
+				if (adl_active && gpus[gpu].has_adl && gpu_activity(gpu) > 50) {
+					applog(LOG_ERR, "GPU still showing activity suggesting a hard hang.");
+					applog(LOG_ERR, "Will not attempt to auto-restart it.");
+				} else
+#endif
 				if (opt_restart) {
 				if (opt_restart) {
 					applog(LOG_ERR, "Attempting to restart GPU");
 					applog(LOG_ERR, "Attempting to restart GPU");
 					reinit_device(thr->cgpu);
 					reinit_device(thr->cgpu);