Browse Source

Bugfix: Move thermal cutoff to general watchdog code (fixes bitforce recovery)

Luke Dashjr 13 years ago
parent
commit
dfa30502bf
4 changed files with 38 additions and 25 deletions
  1. 4 9
      adl.c
  2. 0 8
      driver-bitforce.c
  3. 1 8
      driver-modminer.c
  4. 33 0
      miner.c

+ 4 - 9
adl.c

@@ -505,6 +505,7 @@ void init_adl(int nDevs)
 			ga->autoengine = true;
 			ga->autoengine = true;
 			ga->managed = true;
 			ga->managed = true;
 		}
 		}
+		gpus[gpu].temp =
 		ga->lasttemp = __gpu_temp(ga);
 		ga->lasttemp = __gpu_temp(ga);
 	}
 	}
 
 
@@ -744,6 +745,7 @@ bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vdd
 	ga = &gpus[gpu].adl;
 	ga = &gpus[gpu].adl;
 
 
 	lock_adl();
 	lock_adl();
+	gpus[gpu].temp =
 	*temp = __gpu_temp(ga);
 	*temp = __gpu_temp(ga);
 	if (ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity) != ADL_OK) {
 	if (ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity) != ADL_OK) {
 		*engineclock = 0;
 		*engineclock = 0;
@@ -1120,6 +1122,7 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
 
 	lock_adl();
 	lock_adl();
 	ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity);
 	ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity);
+	gpus[gpu].temp =
 	temp = __gpu_temp(ga);
 	temp = __gpu_temp(ga);
 	if (ga->twin)
 	if (ga->twin)
 		twintemp = __gpu_temp(ga->twin);
 		twintemp = __gpu_temp(ga->twin);
@@ -1154,13 +1157,8 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
 
 	if (engine && ga->autoengine) {
 	if (engine && ga->autoengine) {
 		if (temp > cgpu->cutofftemp) {
 		if (temp > cgpu->cutofftemp) {
-			applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
-			*denable = DEV_RECOVER;
+			// Shutoff and recovery happen back in watchdog_thread
 			newengine = ga->minspeed;
 			newengine = ga->minspeed;
-
-			cgpu->device_last_not_well = time(NULL);
-			cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
-			cgpu->dev_thermal_cutoff_count++;
 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
 			newengine = ga->minspeed;
 			newengine = ga->minspeed;
@@ -1178,9 +1176,6 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 				newengine = ga->maxspeed;
 				newengine = ga->maxspeed;
 			else
 			else
 				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
 				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
-		} else if (temp < gpus[gpu].targettemp && *denable == DEV_RECOVER && opt_restart) {
-			applog(LOG_NOTICE, "Device recovered to temperature below target, re-enabling");
-			*denable = DEV_ENABLED;
 		}
 		}
 
 
 		if (newengine > ga->maxspeed)
 		if (newengine > ga->maxspeed)

+ 0 - 8
driver-bitforce.c

@@ -308,14 +308,6 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
 
 
 		if (temp > 0) {
 		if (temp > 0) {
 			bitforce->temp = temp;
 			bitforce->temp = temp;
-			if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) {
-				applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id);
-				bitforce->deven = DEV_RECOVER;
-
-				bitforce->device_last_not_well = time(NULL);
-				bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
-				bitforce->dev_thermal_cutoff_count++;
-			}
 		}
 		}
 	} else {
 	} else {
 		/* Use the temperature monitor as a kind of watchdog for when
 		/* Use the temperature monitor as a kind of watchdog for when

+ 1 - 8
driver-modminer.c

@@ -595,14 +595,7 @@ modminer_process_results(struct thr_info*thr)
 		if (!fpgaid)
 		if (!fpgaid)
 			modminer->temp = (float)temperature;
 			modminer->temp = (float)temperature;
 		if (temperature > modminer->cutofftemp - 2) {
 		if (temperature > modminer->cutofftemp - 2) {
-			if (temperature > modminer->cutofftemp) {
-				applog(LOG_WARNING, "%s %u.%u: Hit thermal cutoff limit, disabling device!", modminer->api->name, modminer->device_id, fpgaid);
-				modminer->deven = DEV_RECOVER;
-
-				modminer->device_last_not_well = time(NULL);
-				modminer->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
-				++modminer->dev_thermal_cutoff_count;
-			} else {
+			{
 				time_t now = time(NULL);
 				time_t now = time(NULL);
 				if (state->last_cutoff_reduced != now) {
 				if (state->last_cutoff_reduced != now) {
 					state->last_cutoff_reduced = now;
 					state->last_cutoff_reduced = now;

+ 33 - 0
miner.c

@@ -5712,6 +5712,19 @@ static void *watchpool_thread(void __maybe_unused *userdata)
 	return NULL;
 	return NULL;
 }
 }
 
 
+void device_recovered(struct cgpu_info *cgpu) /* Re-enable cgpu and ping each of its threads. */
+{ /* cgpu: device to re-enable; its deven field is written and its thr[] entries 0..threads-1 are read. No return value. */
+	struct thr_info *thr;
+	int j;
+
+	cgpu->deven = DEV_ENABLED; /* flip the device state first, before any thread is pinged */
+	for (j = 0; j < cgpu->threads; ++j) {
+		thr = cgpu->thr[j];
+		applog(LOG_DEBUG, "Pushing ping to thread %d", thr->id);
+		tq_push(thr->q, &ping); /* NOTE(review): `ping` is defined outside this view; presumably this wakes a thread blocked on its queue -- confirm */
+	}
+}
+
 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
  * the screen at regular intervals, and restarts threads if they appear to have
  * the screen at regular intervals, and restarts threads if they appear to have
  * died. */
  * died. */
@@ -5824,6 +5837,26 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			/* Thread is disabled */
 			/* Thread is disabled */
 			if (*denable == DEV_DISABLED)
 			if (*denable == DEV_DISABLED)
 				continue;
 				continue;
+			else
+			if (*denable == DEV_RECOVER) {
+				if (opt_restart && cgpu->temp < cgpu->targettemp) {
+					applog(LOG_NOTICE, "%s %u recovered to temperature below target, re-enabling",
+					       cgpu->api->name, cgpu->device_id);
+					device_recovered(cgpu);
+				}
+				continue;
+			}
+			else
+			if (cgpu->temp > cgpu->cutofftemp)
+			{
+				applog(LOG_WARNING, "%s %u hit thermal cutoff limit, disabling!",
+				       cgpu->api->name, cgpu->device_id);
+				*denable = DEV_RECOVER;
+
+				cgpu->device_last_not_well = time(NULL);
+				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
+				++cgpu->dev_thermal_cutoff_count;
+			}
 
 
 			if (thr->getwork) {
 			if (thr->getwork) {
 				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {
 				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {