Browse Source

Merge branch 'recovery' into bfgminer

Conflicts:
	miner.c
Luke Dashjr 13 years ago
parent
commit
292b2b67f1
3 changed files with 30 additions and 7 deletions
  1. driver-icarus.c (+3 -3)
  2. miner.c (+25 -4)
  3. miner.h (+2 -0)

+ 3 - 3
driver-icarus.c

@@ -754,7 +754,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 
 				}
 				if (info->quirk_reopen == 1 && !icarus_reopen(icarus, state, &fd))
-					return 0;
+					return -1;
 			}
 			
 		}
@@ -764,7 +764,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 	}
 	else
 	if (fd == -1 && !icarus_reopen(icarus, state, &fd))
-		return 0;
+		return -1;
 
 #ifndef WIN32
 	tcflush(fd, TCOFLUSH);
@@ -802,7 +802,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 	}
 
 	if (info->quirk_reopen == 2 && !icarus_reopen(icarus, state, &fd))
-		return 0;
+		return -1;
 
 	work->blk.nonce = 0xffffffff;
 

+ 25 - 4
miner.c

@@ -1934,6 +1934,8 @@ static void curses_print_devstatus(int thr_id)
 		wprintw(statuswin, "OFF  ");
 	else if (cgpu->deven == DEV_RECOVER)
 		wprintw(statuswin, "REST ");
+	else if (cgpu->deven == DEV_RECOVER_ERR)
+		wprintw(statuswin, " ERR ");
 	else if (cgpu->status == LIFE_WAIT)
 		wprintw(statuswin, "WAIT ");
 	else
@@ -5232,6 +5234,7 @@ void *miner_thread(void *userdata)
 	uint32_t max_nonce = api->can_limit_work ? api->can_limit_work(mythr) : 0xffffffff;
 	int64_t hashes_done = 0;
 	int64_t hashes;
+	bool scanhash_working = true;
 	struct work *work = make_work();
 	const bool primary = (!mythr->device_thread) || mythr->primary_thread;
 
@@ -5312,15 +5315,22 @@ void *miner_thread(void *userdata)
 			gettimeofday(&getwork_start, NULL);
 
 			if (unlikely(hashes == -1)) {
-				applog(LOG_ERR, "%s %d failure, disabling!", api->name, cgpu->device_id);
-				cgpu->deven = DEV_DISABLED;
-
 				cgpu->device_last_not_well = time(NULL);
 				cgpu->device_not_well_reason = REASON_THREAD_ZERO_HASH;
 				cgpu->thread_zero_hash_count++;
 
-				mt_disable(mythr, thr_id, api);
+				if (scanhash_working && opt_restart) {
+					applog(LOG_ERR, "%s %u failure, attempting to reinitialize", api->name, cgpu->device_id);
+					scanhash_working = false;
+					cgpu->reinit_backoff = 5.2734375;
+				} else {
+					applog(LOG_ERR, "%s %u failure, disabling!", api->name, cgpu->device_id);
+					cgpu->deven = DEV_RECOVER_ERR;
+					mt_disable(mythr, thr_id, api);
+				}
 			}
+			else
+				scanhash_working = true;
 
 			hashes_done += hashes;
 			if (hashes > cgpu->max_hashes)
@@ -5838,6 +5848,17 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			if (*denable == DEV_DISABLED)
 				continue;
 			else
+			if (*denable == DEV_RECOVER_ERR) {
+				if (opt_restart && difftime(time(NULL), cgpu->device_last_not_well) > cgpu->reinit_backoff) {
+					applog(LOG_NOTICE, "Attempting to reinitialize %s %u",
+					       cgpu->api->name, cgpu->device_id);
+					if (cgpu->reinit_backoff < 300)
+						cgpu->reinit_backoff *= 2;
+					device_recovered(cgpu);
+				}
+				continue;
+			}
+			else
 			if (*denable == DEV_RECOVER) {
 				if (opt_restart && cgpu->temp < cgpu->targettemp) {
 					applog(LOG_NOTICE, "%s %u recovered to temperature below target, re-enabling",

+ 2 - 0
miner.h

@@ -294,6 +294,7 @@ enum dev_enable {
 	DEV_ENABLED,
 	DEV_DISABLED,
 	DEV_RECOVER,
+	DEV_RECOVER_ERR,
 };
 
 enum cl_kernels {
@@ -457,6 +458,7 @@ struct cgpu_info {
 	time_t device_last_well;
 	time_t device_last_not_well;
 	enum dev_reason device_not_well_reason;
+	float reinit_backoff;
 	int thread_fail_init_count;
 	int thread_zero_hash_count;
 	int thread_fail_queue_count;