
minerloop_async: Run watchdog code within actual device thread

Luke Dashjr, 12 years ago
commit e689356318
3 changed files with 38 additions and 15 deletions
  1. deviceapi.c (+13, -0)
  2. miner.c (+20, -15)
  3. miner.h (+5, -0)

+ 13 - 0
deviceapi.c

@@ -418,6 +418,12 @@ void minerloop_async(struct thr_info *mythr)
 	
 	if (mythr->work_restart_notifier[1] == -1)
 		notifier_init(mythr->work_restart_notifier);
+	for (proc = cgpu; proc; proc = proc->next_proc)
+	{
+		mythr = proc->thr[0];
+		timer_set_now(&mythr->tv_watchdog);
+		proc->disable_watchdog = true;
+	}
 	
 	while (likely(!cgpu->shutdown)) {
 		tv_timeout.tv_sec = -1;
@@ -468,8 +474,15 @@ defer_events:
 			if (timer_passed(&mythr->tv_poll, &tv_now))
 				api->poll(mythr);
 			
+			if (timer_passed(&mythr->tv_watchdog, &tv_now))
+			{
+				timer_set_delay(&mythr->tv_watchdog, &tv_now, WATCHDOG_INTERVAL * 1000000);
+				bfg_watchdog(proc, &tv_now);
+			}
+			
 			reduce_timeout_to(&tv_timeout, &mythr->tv_morework);
 			reduce_timeout_to(&tv_timeout, &mythr->tv_poll);
+			reduce_timeout_to(&tv_timeout, &mythr->tv_watchdog);
 		}
 		
 		do_notifier_select(thr, &tv_timeout);
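
The six added lines above seed a per-processor watchdog deadline before the event loop starts and set disable_watchdog so the central watchdog thread skips these devices; inside the loop the deadline is checked, re-armed WATCHDOG_INTERVAL seconds ahead, and folded into the select timeout so the thread wakes in time to run it. Below is a minimal, self-contained sketch of that periodic-deadline pattern using plain POSIX calls instead of BFGMiner's timer_* helpers; run_watchdog() and event_loop_sketch() are placeholder names, not part of the tree.

#include <stdbool.h>
#include <stddef.h>
#include <sys/select.h>
#include <sys/time.h>

#define WATCHDOG_INTERVAL 2  /* seconds; mirrors the define now in miner.h */

static void run_watchdog(void)
{
	/* stands in for bfg_watchdog(proc, &tv_now) */
}

static void event_loop_sketch(volatile bool *shutdown)
{
	struct timeval now, deadline, timeout;

	/* arm the first deadline "now" so the check fires on the first pass,
	 * the same effect as timer_set_now(&mythr->tv_watchdog) above */
	gettimeofday(&deadline, NULL);

	while (!*shutdown) {
		gettimeofday(&now, NULL);
		if (!timercmp(&now, &deadline, <)) {
			/* deadline reached: push it WATCHDOG_INTERVAL ahead and run the check */
			deadline = now;
			deadline.tv_sec += WATCHDOG_INTERVAL;
			run_watchdog();
		}

		/* never sleep past the deadline; the real loop also reduces this
		 * timeout for tv_morework and tv_poll and waits on notifier fds */
		timersub(&deadline, &now, &timeout);
		select(0, NULL, NULL, NULL, &timeout);
	}
}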

+ 20 - 15
miner.c

@@ -8714,7 +8714,6 @@ void proc_enable(struct cgpu_info *cgpu)
 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
  * the screen at regular intervals, and restarts threads if they appear to have
  * died. */
-#define WATCHDOG_INTERVAL		2
 #define WATCHDOG_SICK_TIME		60
 #define WATCHDOG_DEAD_TIME		600
 #define WATCHDOG_SICK_COUNT		(WATCHDOG_SICK_TIME/WATCHDOG_INTERVAL)
@@ -8804,6 +8803,16 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
 		for (i = 0; i < total_devices; ++i) {
 			struct cgpu_info *cgpu = get_devices(i);
+			if (!cgpu->disable_watchdog)
+				bfg_watchdog(cgpu, &now);
+		}
+	}
+
+	return NULL;
+}
+
+void bfg_watchdog(struct cgpu_info * const cgpu, struct timeval * const tvp_now)
+{
 			struct thr_info *thr = cgpu->thr[0];
 			enum dev_enable *denable;
 			char *dev_str = cgpu->proc_repr;
@@ -8830,7 +8839,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			
 			/* Thread is disabled */
 			if (*denable == DEV_DISABLED)
-				continue;
+				return;
 			else
 			if (*denable == DEV_RECOVER_ERR) {
 				if (opt_restart && timer_elapsed(&cgpu->tv_device_last_not_well, NULL) > cgpu->reinit_backoff) {
@@ -8840,7 +8849,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 						cgpu->reinit_backoff *= 2;
 					device_recovered(cgpu);
 				}
-				continue;
+				return;
 			}
 			else
 			if (*denable == DEV_RECOVER) {
@@ -8850,7 +8859,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 					device_recovered(cgpu);
 				}
 				dev_error_update(cgpu, REASON_DEV_THERMAL_CUTOFF);
-				continue;
+				return;
 			}
 			else
 			if (cgpu->temp > cgpu->cutofftemp)
@@ -8864,7 +8873,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			}
 
 			if (thr->getwork) {
-				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {
+				if (cgpu->status == LIFE_WELL && thr->getwork < tvp_now->tv_sec - opt_log_interval) {
 					int thrid;
 					bool cgpu_idle = true;
 					thr->rolling = 0;
@@ -8876,21 +8885,21 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 						cgpu->status = LIFE_WAIT;
 					}
 				}
-				continue;
+				return;
 			}
 			else if (cgpu->status == LIFE_WAIT)
 				cgpu->status = LIFE_WELL;
 
 #ifdef WANT_CPUMINE
 			if (!strcmp(cgpu->drv->dname, "cpu"))
-				continue;
+				return;
 #endif
-			if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) {
+			if (cgpu->status != LIFE_WELL && (tvp_now->tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) {
 				if (likely(cgpu->status != LIFE_INIT && cgpu->status != LIFE_INIT2))
 				applog(LOG_ERR, "%s: Recovered, declaring WELL!", dev_str);
 				cgpu->status = LIFE_WELL;
 				cgpu->device_last_well = time(NULL);
-			} else if (cgpu->status == LIFE_WELL && (now.tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME)) {
+			} else if (cgpu->status == LIFE_WELL && (tvp_now->tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME)) {
 				thr->rolling = cgpu->rolling = 0;
 				cgpu->status = LIFE_SICK;
 				applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str);
@@ -8909,14 +8918,14 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 					applog(LOG_ERR, "%s: Attempting to restart", dev_str);
 					reinit_device(cgpu);
 				}
-			} else if (cgpu->status == LIFE_SICK && (now.tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME)) {
+			} else if (cgpu->status == LIFE_SICK && (tvp_now->tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME)) {
 				cgpu->status = LIFE_DEAD;
 				applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str);
 				cgtime(&thr->sick);
 
 				dev_error(cgpu, REASON_DEV_DEAD_IDLE_600);
 				run_cmd(cmd_dead);
-			} else if (now.tv_sec - thr->sick.tv_sec > 60 &&
+			} else if (tvp_now->tv_sec - thr->sick.tv_sec > 60 &&
 				   (cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) {
 				/* Attempt to restart a GPU that's sick or dead once every minute */
 				cgtime(&thr->sick);
@@ -8928,10 +8937,6 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 				if (opt_restart)
 					reinit_device(cgpu);
 			}
-		}
-	}
-
-	return NULL;
 }
 
 static void log_print_status(struct cgpu_info *cgpu)
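
What the miner.c hunks amount to is a routine loop-body extraction: the per-device half of watchdog_thread() moves into bfg_watchdog(), the thread keeps only the iteration over get_devices() (now guarded by disable_watchdog), the local `now` is threaded through as the tvp_now parameter, and every `continue` that used to skip to the next device becomes a `return` from the new function. A tiny generic illustration of that mechanical change, with struct item, skip(), process(), scan_loop(), scan_split() and check_one() as made-up names:

struct item { int broken; int value; };

static int  skip(const struct item *it)    { return it->broken; }
static void process(const struct item *it) { (void)it->value; /* do work */ }

/* Before: early exits inside the loop body use `continue`. */
static void scan_loop(struct item *items, int n)
{
	for (int i = 0; i < n; ++i) {
		if (skip(&items[i]))
			continue;
		process(&items[i]);
	}
}

/* After: the body becomes its own function, callable from elsewhere too,
 * so every early exit turns into `return`; this is the same mechanical
 * change applied to watchdog_thread()/bfg_watchdog() above. */
static void check_one(struct item *it)
{
	if (skip(it))
		return;
	process(it);
}

static void scan_split(struct item *items, int n)
{
	for (int i = 0; i < n; ++i)
		check_one(&items[i]);
}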

+ 5 - 0
miner.h

@@ -572,6 +572,7 @@ struct cgpu_info {
 	struct work *queued_work;
 	unsigned int queued_count;
 
+	bool disable_watchdog;
 	bool shutdown;
 };
 
@@ -630,6 +631,7 @@ struct thr_info {
 	struct timeval tv_results_jobstart;
 	struct timeval tv_jobstart;
 	struct timeval tv_poll;
+	struct timeval tv_watchdog;
 	notifier_t notifier;
 	bool starting_next_work;
 	uint32_t _max_nonce;
@@ -732,6 +734,9 @@ static inline void swab256(void *dest_p, const void *src_p)
 
 #define flip32(dest_p, src_p) swap32yes(dest_p, src_p, 32 / 4)
 
+#define WATCHDOG_INTERVAL  2
+extern void bfg_watchdog(struct cgpu_info *, struct timeval *tvp_now);
+
 extern void _quit(int status);
 
 static inline void mutex_lock(pthread_mutex_t *lock)
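
The header changes tie the pieces together: disable_watchdog lets a device opt out of the central watchdog thread, tv_watchdog carries the per-thread deadline, WATCHDOG_INTERVAL moves out of miner.c so both files see it, and bfg_watchdog() gets an extern prototype. A hedged sketch of how another driver loop could reuse the same hooks follows; minerloop_example() is hypothetical, and it assumes the thr->cgpu back-pointer and that timer_set_now() simply records the current time, as the existing call sites suggest.

/* Hypothetical driver loop reusing the new hooks (sketch only). */
static void minerloop_example(struct thr_info * const thr)
{
	struct cgpu_info * const cgpu = thr->cgpu;
	struct timeval tv_now;

	cgpu->disable_watchdog = true;       /* central watchdog thread will skip us */
	timer_set_now(&thr->tv_watchdog);    /* first check fires on the first pass */

	while (!cgpu->shutdown) {
		timer_set_now(&tv_now);
		if (timer_passed(&thr->tv_watchdog, &tv_now)) {
			timer_set_delay(&thr->tv_watchdog, &tv_now, WATCHDOG_INTERVAL * 1000000);
			bfg_watchdog(cgpu, &tv_now);
		}
		/* ... drive the device, then sleep no longer than the remaining
		 *     time to tv_watchdog (cf. reduce_timeout_to above) ... */
	}
}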