13 years ago · f675d64e1b
--- a/adl.c
+++ b/adl.c
@@ -1056,9 +1056,7 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
 
				 		applog(LOG_WARNING, "Overheat detected on GPU %d, increasing fan to 100%", gpu);
			
 
				 		newpercent = iMax;
			
 
				 
			
 
				-		cgpu->device_last_not_well = time(NULL);
			
 
				-		cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
			
 
				-		cgpu->dev_over_heat_count++;
			
 
				+		dev_error(cgpu, REASON_DEV_OVER_HEAT);
			
 
				 	} else if (temp > gpus[gpu].targettemp && fanpercent < top && tdiff >= 0) {
			
 
				 		applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
			
 
				 		if (temp > gpus[gpu].targettemp + opt_hysteresis)
			
@@ -1166,9 +1164,7 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
				 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
			
 
				 			newengine = ga->minspeed;
			
 
				 
			
 
				-			cgpu->device_last_not_well = time(NULL);
			
 
				-			cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
			
 
				-			cgpu->dev_over_heat_count++;
			
 
				+			dev_error(cgpu, REASON_DEV_OVER_HEAT);
			
 
				 		} else if (temp > gpus[gpu].targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
			
 
				 			applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
			
 
				 			newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
			
--- a/driver-bitforce.c
+++ b/driver-bitforce.c
@@ -319,9 +319,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
 
				 		 * our responses are out of sync and flush the buffer to
			
 
				 		 * hopefully recover */
			
 
				 		applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer", bitforce->device_id);
			
 
				-		bitforce->device_last_not_well = time(NULL);
			
 
				-		bitforce->device_not_well_reason = REASON_DEV_THROTTLE;
			
 
				-		bitforce->dev_throttle_count++;
			
 
				+		dev_error(bitforce, REASON_DEV_THROTTLE);
			
 
				 		/* Count throttling episodes as hardware errors */
			
 
				 		bitforce->hw_errors++;
			
 
				 		bitforce_clear_buffer(bitforce);
			
@@ -465,9 +463,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work)
 
				 	if (elapsed.tv_sec > BITFORCE_TIMEOUT_S) {
			
 
				 		applog(LOG_ERR, "BFL%i: took %dms - longer than %dms", bitforce->device_id,
			
 
				 			tv_to_ms(elapsed), BITFORCE_TIMEOUT_MS);
			
 
				-		bitforce->device_last_not_well = time(NULL);
			
 
				-		bitforce->device_not_well_reason = REASON_DEV_OVER_HEAT;
			
 
				-		bitforce->dev_over_heat_count++;
			
 
				+		dev_error(bitforce, REASON_DEV_OVER_HEAT);
			
 
				 		++bitforce->hw_errors;
			
 
				 		++hw_errors;
			
 
				 
			
@@ -578,9 +574,7 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_
 
				 commerr:
			
 
				 		ret = 0;
			
 
				 		applog(LOG_ERR, "BFL%i: Comms error", bitforce->device_id);
			
 
				-		bitforce->device_last_not_well = time(NULL);
			
 
				-		bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR;
			
 
				-		bitforce->dev_comms_error_count++;
			
 
				+		dev_error(bitforce, REASON_DEV_COMMS_ERROR);
			
 
				 		bitforce->hw_errors++;
			
 
				 		BFclose(bitforce->device_fd);
			
 
				 		int fd = bitforce->device_fd = BFopen(bitforce->device_path);
			
--- a/driver-icarus.c
+++ b/driver-icarus.c
@@ -707,9 +707,7 @@ static bool icarus_reopen(struct cgpu_info *icarus, struct icarus_state *state,
 
				 	*fdp = icarus->device_fd = icarus_open(icarus->device_path, info->baud);
			
 
				 	if (unlikely(-1 == *fdp)) {
			
 
				 		applog(LOG_ERR, "%s %u: Failed to reopen on %s", icarus->api->name, icarus->device_id, icarus->device_path);
			
 
				-		icarus->device_last_not_well = time(NULL);
			
 
				-		icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
			
 
				-		icarus->dev_comms_error_count++;
			
 
				+		dev_error(icarus, REASON_DEV_COMMS_ERROR);
			
 
				 		state->firstrun = true;
			
 
				 		return false;
			
 
				 	}
			
@@ -730,9 +728,7 @@ static bool icarus_start_work(struct thr_info *thr, const unsigned char *ob_bin)
 
				 	if (ret) {
			
 
				 		do_icarus_close(thr);
			
 
				 		applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
			
 
				-		icarus->device_last_not_well = time(NULL);
			
 
				-		icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
			
 
				-		icarus->dev_comms_error_count++;
			
 
				+		dev_error(icarus, REASON_DEV_COMMS_ERROR);
			
 
				 		return false;	/* This should never happen */
			
 
				 	}
			
 
				 
			
@@ -815,9 +811,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 
				 				case ICA_GETS_ERROR:
			
 
				 					do_icarus_close(thr);
			
 
				 					applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
			
 
				-					icarus->device_last_not_well = time(NULL);
			
 
				-					icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
			
 
				-					icarus->dev_comms_error_count++;
			
 
				+					dev_error(icarus, REASON_DEV_COMMS_ERROR);
			
 
				 					if (!icarus_reopen(icarus, state, &fd))
			
 
				 						return -1;
			
 
				 					break;
			
--- a/miner.c
+++ b/miner.c
@@ -6314,10 +6314,7 @@ void *miner_thread(void *userdata)
 
				 	gettimeofday(&getwork_start, NULL);
			
 
				 
			
 
				 	if (api->thread_init && !api->thread_init(mythr)) {
			
 
				-		cgpu->device_last_not_well = time(NULL);
			
 
				-		cgpu->device_not_well_reason = REASON_THREAD_FAIL_INIT;
			
 
				-		cgpu->thread_fail_init_count++;
			
 
				-
			
 
				+		dev_error(cgpu, REASON_THREAD_FAIL_INIT);
			
 
				 		goto out;
			
 
				 	}
			
 
				 
			
@@ -6388,9 +6385,7 @@ void *miner_thread(void *userdata)
 
				 			if (unlikely(hashes == -1)) {
			
 
				 				time_t now = time(NULL);
			
 
				 				if (difftime(now, cgpu->device_last_not_well) > 1.) {
			
 
				-					cgpu->device_last_not_well = time(NULL);
			
 
				-					cgpu->device_not_well_reason = REASON_THREAD_ZERO_HASH;
			
 
				-					cgpu->thread_zero_hash_count++;
			
 
				+					dev_error(cgpu, REASON_THREAD_ZERO_HASH);
			
 
				 				}
			
 
				 
			
 
				 				if (scanhash_working && opt_restart) {
			
@@ -6971,9 +6966,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
				 				       cgpu->api->name, cgpu->device_id);
			
 
				 				*denable = DEV_RECOVER;
			
 
				 
			
 
				-				cgpu->device_last_not_well = time(NULL);
			
 
				-				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
			
 
				-				++cgpu->dev_thermal_cutoff_count;
			
 
				+				dev_error(cgpu, REASON_DEV_THERMAL_CUTOFF);
			
 
				 			}
			
 
				 
			
 
				 			if (thr->getwork) {
			
@@ -7009,9 +7002,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
				 				applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str);
			
 
				 				gettimeofday(&thr->sick, NULL);
			
 
				 
			
 
				-				cgpu->device_last_not_well = time(NULL);
			
 
				-				cgpu->device_not_well_reason = REASON_DEV_SICK_IDLE_60;
			
 
				-				cgpu->dev_sick_idle_60_count++;
			
 
				+				dev_error(cgpu, REASON_DEV_SICK_IDLE_60);
			
 
				 #ifdef HAVE_ADL
			
 
				 				if (adl_active && cgpu->has_adl && gpu_activity(gpu) > 50) {
			
 
				 					applog(LOG_ERR, "GPU still showing activity suggesting a hard hang.");
			
@@ -7027,9 +7018,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
				 				applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str);
			
 
				 				gettimeofday(&thr->sick, NULL);
			
 
				 
			
 
				-				cgpu->device_last_not_well = time(NULL);
			
 
				-				cgpu->device_not_well_reason = REASON_DEV_DEAD_IDLE_600;
			
 
				-				cgpu->dev_dead_idle_600_count++;
			
 
				+				dev_error(cgpu, REASON_DEV_DEAD_IDLE_600);
			
 
				 			} else if (now.tv_sec - thr->sick.tv_sec > 60 &&
			
 
				 				   (cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) {
			
 
				 				/* Attempt to restart a GPU that's sick or dead once every minute */
			
--- a/util.c
+++ b/util.c
@@ -1413,3 +1413,46 @@ out:
 
				 
			
 
				 	return ret;
			
 
				 }
			
 
				+
			
 
				+
			
 
				+void dev_error(struct cgpu_info *dev, enum dev_reason reason)
			
 
				+{
			
 
				+	dev->device_last_not_well = time(NULL);
			
 
				+	dev->device_not_well_reason = reason;
			
 
				+
			
 
				+
			
 
				+	switch (reason)
			
 
				+	{
			
 
				+		case REASON_THREAD_FAIL_INIT:
			
 
				+			dev->thread_fail_init_count++;
			
 
				+		break;
			
 
				+		case REASON_THREAD_ZERO_HASH:
			
 
				+			dev->thread_zero_hash_count++;
			
 
				+		break;
			
 
				+		case REASON_THREAD_FAIL_QUEUE:
			
 
				+			dev->thread_fail_queue_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_SICK_IDLE_60:
			
 
				+			dev->dev_sick_idle_60_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_DEAD_IDLE_600:
			
 
				+			dev->dev_dead_idle_600_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_NOSTART:
			
 
				+			dev->dev_nostart_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_OVER_HEAT:
			
 
				+			dev->dev_over_heat_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_THERMAL_CUTOFF:
			
 
				+			dev->dev_thermal_cutoff_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_COMMS_ERROR:
			
 
				+			dev->dev_comms_error_count++;
			
 
				+		break;
			
 
				+		case REASON_DEV_THROTTLE:
			
 
				+			dev->dev_throttle_count++;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+}
			
--- a/util.h
+++ b/util.h
@@ -43,11 +43,14 @@
 
				 #endif
			
 
				 
			
 
				 struct pool;
			
 
				+enum dev_reason;
			
 
				+struct cgpu_info;
			
 
				 bool stratum_send(struct pool *pool, char *s, ssize_t len);
			
 
				 char *recv_line(struct pool *pool);
			
 
				 bool parse_method(struct pool *pool, char *s);
			
 
				 bool extract_sockaddr(struct pool *pool, char *url);
			
 
				 bool auth_stratum(struct pool *pool);
			
 
				 bool initiate_stratum(struct pool *pool);
			
 
				+void dev_error(struct cgpu_info *dev, enum dev_reason reason);
			
 
				 
			
 
				 #endif /* __UTIL_H__ */