
minerloop_async: Run watchdog code within actual device thread

Luke Dashjr, 12 years ago
commit e689356318
3 changed files with 38 additions and 15 deletions
  1. deviceapi.c (+13, -0)
  2. miner.c (+20, -15)
  3. miner.h (+5, -0)

+ 13 - 0
deviceapi.c

@@ -418,6 +418,12 @@ void minerloop_async(struct thr_info *mythr)
 	
 	if (mythr->work_restart_notifier[1] == -1)
 		notifier_init(mythr->work_restart_notifier);
+	for (proc = cgpu; proc; proc = proc->next_proc)
+	{
+		mythr = proc->thr[0];
+		timer_set_now(&mythr->tv_watchdog);
+		proc->disable_watchdog = true;
+	}
 	
 	while (likely(!cgpu->shutdown)) {
 		tv_timeout.tv_sec = -1;
@@ -468,8 +474,15 @@ defer_events:
 			if (timer_passed(&mythr->tv_poll, &tv_now))
 				api->poll(mythr);
 			
+			if (timer_passed(&mythr->tv_watchdog, &tv_now))
+			{
+				timer_set_delay(&mythr->tv_watchdog, &tv_now, WATCHDOG_INTERVAL * 1000000);
+				bfg_watchdog(proc, &tv_now);
+			}
+			
 			reduce_timeout_to(&tv_timeout, &mythr->tv_morework);
 			reduce_timeout_to(&tv_timeout, &mythr->tv_poll);
+			reduce_timeout_to(&tv_timeout, &mythr->tv_watchdog);
 		}
 		
 		do_notifier_select(thr, &tv_timeout);
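
The six added lines above seed a per-processor watchdog deadline before the event loop starts and set disable_watchdog so the central watchdog thread skips these devices; inside the loop the deadline is checked, re-armed WATCHDOG_INTERVAL seconds ahead, and folded into the select timeout so the thread wakes in time to run it. Below is a minimal, self-contained sketch of that periodic-deadline pattern using plain POSIX calls instead of BFGMiner's timer_* helpers; run_watchdog() and event_loop_sketch() are placeholder names, not part of the tree.

#include <stdbool.h>
#include <stddef.h>
#include <sys/select.h>
#include <sys/time.h>

#define WATCHDOG_INTERVAL 2  /* seconds; mirrors the define now in miner.h */

static void run_watchdog(void)
{
	/* stands in for bfg_watchdog(proc, &tv_now) */
}

static void event_loop_sketch(volatile bool *shutdown)
{
	struct timeval now, deadline, timeout;

	/* arm the first deadline "now" so the check fires on the first pass,
	 * the same effect as timer_set_now(&mythr->tv_watchdog) above */
	gettimeofday(&deadline, NULL);

	while (!*shutdown) {
		gettimeofday(&now, NULL);
		if (!timercmp(&now, &deadline, <)) {
			/* deadline reached: push it WATCHDOG_INTERVAL ahead and run the check */
			deadline = now;
			deadline.tv_sec += WATCHDOG_INTERVAL;
			run_watchdog();
		}

		/* never sleep past the deadline; the real loop also reduces this
		 * timeout for tv_morework and tv_poll and waits on notifier fds */
		timersub(&deadline, &now, &timeout);
		select(0, NULL, NULL, NULL, &timeout);
	}
}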

+ 20 - 15
miner.c

@@ -8714,7 +8714,6 @@ void proc_enable(struct cgpu_info *cgpu)
 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
  * the screen at regular intervals, and restarts threads if they appear to have
  * died. */
-#define WATCHDOG_INTERVAL		2
 #define WATCHDOG_SICK_TIME		60
 #define WATCHDOG_DEAD_TIME		600
 #define WATCHDOG_SICK_COUNT		(WATCHDOG_SICK_TIME/WATCHDOG_INTERVAL)
@@ -8804,6 +8803,16 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
 		for (i = 0; i < total_devices; ++i) {
 			struct cgpu_info *cgpu = get_devices(i);
+			if (!cgpu->disable_watchdog)
+				bfg_watchdog(cgpu, &now);
+		}
+	}
+
+	return NULL;
+}
+
+void bfg_watchdog(struct cgpu_info * const cgpu, struct timeval * const tvp_now)
+{
 			struct thr_info *thr = cgpu->thr[0];
 			enum dev_enable *denable;
 			char *dev_str = cgpu->proc_repr;
@@ -8830,7 +8839,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			
 			/* Thread is disabled */
 			if (*denable == DEV_DISABLED)
-				continue;
+				return;
 			else
 			if (*denable == DEV_RECOVER_ERR) {
 				if (opt_restart && timer_elapsed(&cgpu->tv_device_last_not_well, NULL) > cgpu->reinit_backoff) {
@@ -8840,7 +8849,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 						cgpu->reinit_backoff *= 2;
 					device_recovered(cgpu);
 				}
-				continue;
+				return;
 			}
 			else
 			if (*denable == DEV_RECOVER) {
@@ -8850,7 +8859,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 					device_recovered(cgpu);
 				}
 				dev_error_update(cgpu, REASON_DEV_THERMAL_CUTOFF);
-				continue;
+				return;
 			}
 			else
 			if (cgpu->temp > cgpu->cutofftemp)
@@ -8864,7 +8873,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			}
 
 			if (thr->getwork) {
-				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {
+				if (cgpu->status == LIFE_WELL && thr->getwork < tvp_now->tv_sec - opt_log_interval) {
 					int thrid;
 					bool cgpu_idle = true;
 					thr->rolling = 0;
@@ -8876,21 +8885,21 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 						cgpu->status = LIFE_WAIT;
 					}
 				}
-				continue;
+				return;
 			}
 			else if (cgpu->status == LIFE_WAIT)
 				cgpu->status = LIFE_WELL;
 
 #ifdef WANT_CPUMINE
 			if (!strcmp(cgpu->drv->dname, "cpu"))
-				continue;
+				return;
 #endif
-			if (cgpu->status != LIFE_WELL && (now.tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) {
+			if (cgpu->status != LIFE_WELL && (tvp_now->tv_sec - thr->last.tv_sec < WATCHDOG_SICK_TIME)) {
 				if (likely(cgpu->status != LIFE_INIT && cgpu->status != LIFE_INIT2))
 				applog(LOG_ERR, "%s: Recovered, declaring WELL!", dev_str);
 				cgpu->status = LIFE_WELL;
 				cgpu->device_last_well = time(NULL);
-			} else if (cgpu->status == LIFE_WELL && (now.tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME)) {
+			} else if (cgpu->status == LIFE_WELL && (tvp_now->tv_sec - thr->last.tv_sec > WATCHDOG_SICK_TIME)) {
 				thr->rolling = cgpu->rolling = 0;
 				cgpu->status = LIFE_SICK;
 				applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str);
@@ -8909,14 +8918,14 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 					applog(LOG_ERR, "%s: Attempting to restart", dev_str);
 					reinit_device(cgpu);
 				}
-			} else if (cgpu->status == LIFE_SICK && (now.tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME)) {
+			} else if (cgpu->status == LIFE_SICK && (tvp_now->tv_sec - thr->last.tv_sec > WATCHDOG_DEAD_TIME)) {
 				cgpu->status = LIFE_DEAD;
 				applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str);
 				cgtime(&thr->sick);
 
 				dev_error(cgpu, REASON_DEV_DEAD_IDLE_600);
 				run_cmd(cmd_dead);
-			} else if (now.tv_sec - thr->sick.tv_sec > 60 &&
+			} else if (tvp_now->tv_sec - thr->sick.tv_sec > 60 &&
 				   (cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) {
 				/* Attempt to restart a GPU that's sick or dead once every minute */
 				cgtime(&thr->sick);
@@ -8928,10 +8937,6 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 				if (opt_restart)
 					reinit_device(cgpu);
 			}
-		}
-	}
-
-	return NULL;
 }
 
 static void log_print_status(struct cgpu_info *cgpu)
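
What the miner.c hunks amount to is a routine loop-body extraction: the per-device half of watchdog_thread() moves into bfg_watchdog(), the thread keeps only the iteration over get_devices() (now guarded by disable_watchdog), the local `now` is threaded through as the tvp_now parameter, and every `continue` that used to skip to the next device becomes a `return` from the new function. A tiny generic illustration of that mechanical change, with struct item, skip(), process(), scan_loop(), scan_split() and check_one() as made-up names:

struct item { int broken; int value; };

static int  skip(const struct item *it)    { return it->broken; }
static void process(const struct item *it) { (void)it->value; /* do work */ }

/* Before: early exits inside the loop body use `continue`. */
static void scan_loop(struct item *items, int n)
{
	for (int i = 0; i < n; ++i) {
		if (skip(&items[i]))
			continue;
		process(&items[i]);
	}
}

/* After: the body becomes its own function, callable from elsewhere too,
 * so every early exit turns into `return`; this is the same mechanical
 * change applied to watchdog_thread()/bfg_watchdog() above. */
static void check_one(struct item *it)
{
	if (skip(it))
		return;
	process(it);
}

static void scan_split(struct item *items, int n)
{
	for (int i = 0; i < n; ++i)
		check_one(&items[i]);
}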

+ 5 - 0
miner.h

@@ -572,6 +572,7 @@ struct cgpu_info {
 	struct work *queued_work;
 	unsigned int queued_count;
 
+	bool disable_watchdog;
 	bool shutdown;
 };
 
@@ -630,6 +631,7 @@ struct thr_info {
 	struct timeval tv_results_jobstart;
 	struct timeval tv_jobstart;
 	struct timeval tv_poll;
+	struct timeval tv_watchdog;
 	notifier_t notifier;
 	bool starting_next_work;
 	uint32_t _max_nonce;
@@ -732,6 +734,9 @@ static inline void swab256(void *dest_p, const void *src_p)
 
 #define flip32(dest_p, src_p) swap32yes(dest_p, src_p, 32 / 4)
 
+#define WATCHDOG_INTERVAL  2
+extern void bfg_watchdog(struct cgpu_info *, struct timeval *tvp_now);
+
 extern void _quit(int status);
 
 static inline void mutex_lock(pthread_mutex_t *lock)
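
The header changes tie the pieces together: disable_watchdog lets a device opt out of the central watchdog thread, tv_watchdog carries the per-thread deadline, WATCHDOG_INTERVAL moves out of miner.c so both files see it, and bfg_watchdog() gets an extern prototype. A hedged sketch of how another driver loop could reuse the same hooks follows; minerloop_example() is hypothetical, and it assumes the thr->cgpu back-pointer and that timer_set_now() simply records the current time, as the existing call sites suggest.

/* Hypothetical driver loop reusing the new hooks (sketch only). */
static void minerloop_example(struct thr_info * const thr)
{
	struct cgpu_info * const cgpu = thr->cgpu;
	struct timeval tv_now;

	cgpu->disable_watchdog = true;       /* central watchdog thread will skip us */
	timer_set_now(&thr->tv_watchdog);    /* first check fires on the first pass */

	while (!cgpu->shutdown) {
		timer_set_now(&tv_now);
		if (timer_passed(&thr->tv_watchdog, &tv_now)) {
			timer_set_delay(&thr->tv_watchdog, &tv_now, WATCHDOG_INTERVAL * 1000000);
			bfg_watchdog(cgpu, &tv_now);
		}
		/* ... drive the device, then sleep no longer than the remaining
		 *     time to tv_watchdog (cf. reduce_timeout_to above) ... */
	}
}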