13 years ago · dfa30502bf
--- a/adl.c
+++ b/adl.c
@@ -505,6 +505,7 @@ void init_adl(int nDevs)
 
															 			ga->autoengine = true;
														
 
															 			ga->managed = true;
														
 
															 		}
														
 
															+		gpus[gpu].temp =
														
 
															 		ga->lasttemp = __gpu_temp(ga);
														
 
															 	}
														
@@ -744,6 +745,7 @@ bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vdd
 
															 	ga = &gpus[gpu].adl;
														
 
															 	lock_adl();
														
 
															+	gpus[gpu].temp =
														
 
															 	*temp = __gpu_temp(ga);
														
 
															 	if (ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity) != ADL_OK) {
														
 
															 		*engineclock = 0;
														
@@ -1120,6 +1122,7 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
															 	lock_adl();
														
 
															 	ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity);
														
 
															+	gpus[gpu].temp =
														
 
															 	temp = __gpu_temp(ga);
														
 
															 	if (ga->twin)
														
 
															 		twintemp = __gpu_temp(ga->twin);
														
@@ -1154,13 +1157,8 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
															 	if (engine && ga->autoengine) {
														
 
															 		if (temp > cgpu->cutofftemp) {
														
 
															-			applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
														
 
															-			*denable = DEV_RECOVER;
														
 
															+			// Shutoff and recovery happen back in watchdog_thread
														
 
															 			newengine = ga->minspeed;
														
 
															-
														
 
															-			cgpu->device_last_not_well = time(NULL);
														
 
															-			cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
														
 
															-			cgpu->dev_thermal_cutoff_count++;
														
 
															 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
														
 
															 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
														
 
															 			newengine = ga->minspeed;
														
@@ -1178,9 +1176,6 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
															 				newengine = ga->maxspeed;
														
 
															 			else
														
 
															 				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
														
 
															-		} else if (temp < gpus[gpu].targettemp && *denable == DEV_RECOVER && opt_restart) {
														
 
															-			applog(LOG_NOTICE, "Device recovered to temperature below target, re-enabling");
														
 
															-			*denable = DEV_ENABLED;
														
 
															 		}
														
 
															 		if (newengine > ga->maxspeed)
														
--- a/driver-bitforce.c
+++ b/driver-bitforce.c
@@ -308,14 +308,6 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
 
															 		if (temp > 0) {
														
 
															 			bitforce->temp = temp;
														
 
															-			if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) {
														
 
															-				applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id);
														
 
															-				bitforce->deven = DEV_RECOVER;
														
 
															-
														
 
															-				bitforce->device_last_not_well = time(NULL);
														
 
															-				bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
														
 
															-				bitforce->dev_thermal_cutoff_count++;
														
 
															-			}
														
 
															 		}
														
 
															 	} else {
														
 
															 		/* Use the temperature monitor as a kind of watchdog for when
														
--- a/driver-modminer.c
+++ b/driver-modminer.c
@@ -595,14 +595,7 @@ modminer_process_results(struct thr_info*thr)
 
															 		if (!fpgaid)
														
 
															 			modminer->temp = (float)temperature;
														
 
															 		if (temperature > modminer->cutofftemp - 2) {
														
 
															-			if (temperature > modminer->cutofftemp) {
														
 
															-				applog(LOG_WARNING, "%s %u.%u: Hit thermal cutoff limit, disabling device!", modminer->api->name, modminer->device_id, fpgaid);
														
 
															-				modminer->deven = DEV_RECOVER;
														
 
															-
														
 
															-				modminer->device_last_not_well = time(NULL);
														
 
															-				modminer->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
														
 
															-				++modminer->dev_thermal_cutoff_count;
														
 
															-			} else {
														
 
															+			{
														
 
															 				time_t now = time(NULL);
														
 
															 				if (state->last_cutoff_reduced != now) {
														
 
															 					state->last_cutoff_reduced = now;
														
--- a/miner.c
+++ b/miner.c
@@ -5712,6 +5712,19 @@ static void *watchpool_thread(void __maybe_unused *userdata)
 
															 	return NULL;
														
 
															 }
														
 
															+void device_recovered(struct cgpu_info *cgpu) /* Marks cgpu as DEV_ENABLED, then pushes a ping onto the queue of each of its threads. */
														
 
															+{ /* In this patch the caller is watchdog_thread, for a DEV_RECOVER device whose temp is below targettemp while opt_restart is set. */
														
 
															+	struct thr_info *thr;
														
 
															+	int j;
														
 
															+
														
 
															+	cgpu->deven = DEV_ENABLED; /* state is flipped before any thread is pinged */
														
 
															+	for (j = 0; j < cgpu->threads; ++j) { /* visits cgpu->thr[0] .. cgpu->thr[cgpu->threads - 1] */
														
 
															+		thr = cgpu->thr[j];
														
 
															+		applog(LOG_DEBUG, "Pushing ping to thread %d", thr->id);
														
 
															+		tq_push(thr->q, &ping); /* NOTE(review): presumably wakes a thread waiting on its queue so it resumes work; return value is ignored - confirm against tq_push and the thread loop */
														
 
															+	}
														
 
															+}
														
 
															+
														
 
															 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
														
 
															  * the screen at regular intervals, and restarts threads if they appear to have
														
 
															  * died. */
														
@@ -5824,6 +5837,26 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
															 			/* Thread is disabled */
														
 
															 			if (*denable == DEV_DISABLED)
														
 
															 				continue;
														
 
															+			else
														
 
															+			if (*denable == DEV_RECOVER) {
														
 
															+				if (opt_restart && cgpu->temp < cgpu->targettemp) {
														
 
															+					applog(LOG_NOTICE, "%s %u recovered to temperature below target, re-enabling",
														
 
															+					       cgpu->api->name, cgpu->device_id);
														
 
															+					device_recovered(cgpu);
														
 
															+				}
														
 
															+				continue;
														
 
															+			}
														
 
															+			else
														
 
															+			if (cgpu->temp > cgpu->cutofftemp)
														
 
															+			{
														
 
															+				applog(LOG_WARNING, "%s %u hit thermal cutoff limit, disabling!",
														
 
															+				       cgpu->api->name, cgpu->device_id);
														
 
															+				*denable = DEV_RECOVER;
														
 
															+
														
 
															+				cgpu->device_last_not_well = time(NULL);
														
 
															+				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
														
 
															+				++cgpu->dev_thermal_cutoff_count;
														
 
															+			}
														
 
															 			if (thr->getwork) {
														
 
															 				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {