Browse Source

Merge branch 'recovery' into bfgminer

Conflicts:
	adl.c
	driver-bitforce.c
Luke Dashjr 13 years ago
parent
commit
a33bcbd9ce
6 changed files with 125 additions and 82 deletions
  1. 18 26
      adl.c
  2. 0 10
      driver-bitforce.c
  3. 60 37
      driver-modminer.c
  4. 3 3
      driver-opencl.c
  5. 43 5
      miner.c
  6. 1 1
      miner.h

+ 18 - 26
adl.c

@@ -490,8 +490,8 @@ void init_adl(int nDevs)
 		}
 
 		/* Set some default temperatures for autotune when enabled */
-		if (!ga->targettemp)
-			ga->targettemp = opt_targettemp;
+		if (!gpus[gpu].targettemp)
+			gpus[gpu].targettemp = opt_targettemp;
 		if (!ga->overtemp)
 			ga->overtemp = opt_overheattemp;
 		if (!gpus[gpu].cutofftemp)
@@ -505,6 +505,7 @@ void init_adl(int nDevs)
 			ga->autoengine = true;
 			ga->managed = true;
 		}
+		gpus[gpu].temp =
 		ga->lasttemp = __gpu_temp(ga);
 	}
 
@@ -744,6 +745,7 @@ bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vdd
 	ga = &gpus[gpu].adl;
 
 	lock_adl();
+	gpus[gpu].temp =
 	*temp = __gpu_temp(ga);
 	if (ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity) != ADL_OK) {
 		*engineclock = 0;
@@ -1054,15 +1056,15 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
 		cgpu->device_last_not_well = time(NULL);
 		cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
 		cgpu->dev_over_heat_count++;
-	} else if (temp > ga->targettemp && fanpercent < top && tdiff >= 0) {
+	} else if (temp > gpus[gpu].targettemp && fanpercent < top && tdiff >= 0) {
 		applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
-		if (temp > ga->targettemp + opt_hysteresis)
+		if (temp > gpus[gpu].targettemp + opt_hysteresis)
 			newpercent = ga->targetfan + 10;
 		else
 			newpercent = ga->targetfan + 5;
 		if (newpercent > top)
 			newpercent = top;
-	} else if (fanpercent > bot && temp < ga->targettemp - opt_hysteresis) {
+	} else if (fanpercent > bot && temp < gpus[gpu].targettemp - opt_hysteresis) {
 		/* Detect large swings of 5 degrees or more and change fan by
 		 * a proportion more */
 		if (tdiff <= 0) {
@@ -1076,10 +1078,10 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
 
 		/* We're in the optimal range, make minor adjustments if the
 		 * temp is still drifting */
-		if (fanpercent > bot && tdiff < 0 && lasttemp < ga->targettemp) {
+		if (fanpercent > bot && tdiff < 0 && lasttemp < gpus[gpu].targettemp) {
 			applog(LOG_DEBUG, "Temperature dropping while in target range, decreasing fanspeed");
 			newpercent = ga->targetfan + tdiff;
-		} else if (fanpercent < top && tdiff > 0 && temp > ga->targettemp - opt_hysteresis) {
+		} else if (fanpercent < top && tdiff > 0 && temp > gpus[gpu].targettemp - opt_hysteresis) {
 			applog(LOG_DEBUG, "Temperature rising while in target range, increasing fanspeed");
 			newpercent = ga->targetfan + tdiff;
 		}
@@ -1120,6 +1122,7 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
 	lock_adl();
 	ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity);
+	gpus[gpu].temp =
 	temp = __gpu_temp(ga);
 	if (ga->twin)
 		twintemp = __gpu_temp(ga->twin);
@@ -1154,16 +1157,8 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
 	if (engine && ga->autoengine) {
 		if (temp > cgpu->cutofftemp) {
-			if (*denable == DEV_ENABLED) {
-				applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
-				*denable = DEV_RECOVER;
-				++cgpu->dev_thermal_cutoff_count;
-			}
-
+			// Shutoff and recovery happens back in watchdog_thread
 			newengine = ga->minspeed;
-
-			cgpu->device_last_not_well = time(NULL);
-			cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
 			newengine = ga->minspeed;
@@ -1171,19 +1166,16 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 			cgpu->device_last_not_well = time(NULL);
 			cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
 			cgpu->dev_over_heat_count++;
-		} else if (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
+		} else if (temp > gpus[gpu].targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
 			applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
 			newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
 			/* Only try to tune engine speed up if this GPU is not disabled */
-		} else if (temp < ga->targettemp && engine < ga->maxspeed && fan_window && *denable == DEV_ENABLED) {
+		} else if (temp < gpus[gpu].targettemp && engine < ga->maxspeed && fan_window && *denable == DEV_ENABLED) {
 			applog(LOG_DEBUG, "Temperature below target, increasing clock speed");
-			if (temp < ga->targettemp - opt_hysteresis)
+			if (temp < gpus[gpu].targettemp - opt_hysteresis)
 				newengine = ga->maxspeed;
 			else
 				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
-		} else if (temp < ga->targettemp && *denable == DEV_RECOVER && opt_restart) {
-			applog(LOG_NOTICE, "Device recovered to temperature below target, re-enabling");
-			*denable = DEV_ENABLED;
 		}
 
 		if (newengine > ga->maxspeed)
@@ -1235,7 +1227,7 @@ void change_autosettings(int gpu)
 	char input;
 	int val;
 
-	wlogprint("Target temperature: %d\n", ga->targettemp);
+	wlogprint("Target temperature: %d\n", gpus[gpu].targettemp);
 	wlogprint("Overheat temperature: %d\n", ga->overtemp);
 	wlogprint("Cutoff temperature: %d\n", gpus[gpu].cutofftemp);
 	wlogprint("Toggle [F]an auto [G]PU auto\nChange [T]arget [O]verheat [C]utoff\n");
@@ -1260,11 +1252,11 @@ void change_autosettings(int gpu)
 		if (val < 0 || val > 200)
 			wlogprint("Invalid temperature");
 		else
-			ga->targettemp = val;
+			gpus[gpu].targettemp = val;
 	} else if (!strncasecmp(&input, "o", 1)) {
-		wlogprint("Enter overheat temperature for this GPU in C (%d+)", ga->targettemp);
+		wlogprint("Enter overheat temperature for this GPU in C (%d+)", gpus[gpu].targettemp);
 		val = curses_int("");
-		if (val <= ga->targettemp || val > 200)
+		if (val <= gpus[gpu].targettemp || val > 200)
 			wlogprint("Invalid temperature");
 		else
 			ga->overtemp = val;

+ 0 - 10
driver-bitforce.c

@@ -308,16 +308,6 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
 
 		if (temp > 0) {
 			bitforce->temp = temp;
-			if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) {
-				if (bitforce->deven == DEV_ENABLED) {
-					applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id);
-					bitforce->deven = DEV_RECOVER;
-					++bitforce->dev_thermal_cutoff_count;
-				}
-
-				bitforce->device_last_not_well = time(NULL);
-				bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
-			}
 		}
 	} else {
 		/* Use the temperature monitor as a kind of watchdog for when

+ 60 - 37
driver-modminer.c

@@ -490,6 +490,64 @@ get_modminer_statline_before(char *buf, struct cgpu_info *modminer)
 		strcat(buf, "               | ");
 }
 
+/* Query one FPGA's temperature into state->temp; when it is within 2 of
+ * cutofftemp, call modminer_reduce_clock() at most once per time() tick. */
+static void modminer_get_temperature(struct cgpu_info *modminer, struct thr_info *thr)
+{
+	struct modminer_fpga_state *state = thr->cgpu_data;
+
+#ifdef WIN32
+	/* Workaround for bug in Windows driver */
+	if (!modminer_reopen(modminer))
+		return;
+#endif
+
+	int fd = modminer->device_fd;
+	int fpgaid = thr->device_thread;
+	char cmd[2] = {'\x0a', fpgaid};
+	char temperature;
+
+	if (2 == write(fd, cmd, 2) && read(fd, &temperature, 1) == 1)
+	{
+		state->temp = temperature;
+		if (temperature > modminer->cutofftemp - 2) {
+			time_t now = time(NULL);
+			if (state->last_cutoff_reduced != now) {
+				state->last_cutoff_reduced = now;
+				int oldFreq = state->dclk.freqM;
+				if (modminer_reduce_clock(thr, false))
+					applog(LOG_NOTICE, "%s %u.%u: Frequency %s from %u to %u Mhz (temp: %d)",
+					       modminer->api->name, modminer->device_id, fpgaid,
+					       (oldFreq > state->dclk.freqM ? "dropped" : "raised "),
+					       oldFreq * 2, state->dclk.freqM * 2,
+					       temperature
+					);
+			}
+		}
+	}
+}
+
+/* Set modminer->temp to the highest per-FPGA state->temp; always returns true. */
+static bool modminer_get_stats(struct cgpu_info *modminer)
+{
+	/* Only query the hardware here when the device is not enabled; per the
+	 * original note, no mutex is taken on this path for that reason. */
+	const bool poll_now = (modminer->deven != DEV_ENABLED);
+	int max_temp = 0;
+
+	for (int idx = modminer->threads; idx--; ) {
+		struct thr_info *fpga_thr = modminer->thr[idx];
+		struct modminer_fpga_state *fpga = fpga_thr->cgpu_data;
+		if (poll_now)
+			modminer_get_temperature(modminer, fpga_thr);
+		int current = fpga->temp;
+		if (current > max_temp)
+			max_temp = current;
+	}
+	modminer->temp = (float)max_temp;
+	return true;
+}
+
 static struct api_data*
 get_modminer_api_extra_device_status(struct cgpu_info*modminer)
 {
@@ -574,50 +632,14 @@ modminer_process_results(struct thr_info*thr)
 	int fd;
 	struct work *work = &state->running_work;
 
-	char cmd[2], temperature;
 	uint32_t nonce;
 	long iter;
 	int immediate_bad_nonces = 0, immediate_nonces = 0;
 	bool bad;
-	cmd[0] = '\x0a';
-	cmd[1] = fpgaid;
 
 	mutex_lock(&modminer->device_mutex);
-#ifdef WIN32
-	/* Workaround for bug in Windows driver */
-	if (!modminer_reopen(modminer))
-		return -1;
-#endif
+	modminer_get_temperature(modminer, thr);
 	fd = modminer->device_fd;
-	if (2 == write(fd, cmd, 2) && read(fd, &temperature, 1) == 1)
-	{
-		state->temp = temperature;
-		if (!fpgaid)
-			modminer->temp = (float)temperature;
-		if (temperature > modminer->cutofftemp - 2) {
-			if (temperature > modminer->cutofftemp) {
-				applog(LOG_WARNING, "%s %u.%u: Hit thermal cutoff limit, disabling device!", modminer->api->name, modminer->device_id, fpgaid);
-				modminer->deven = DEV_RECOVER;
-
-				modminer->device_last_not_well = time(NULL);
-				modminer->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
-				++modminer->dev_thermal_cutoff_count;
-			} else {
-				time_t now = time(NULL);
-				if (state->last_cutoff_reduced != now) {
-					state->last_cutoff_reduced = now;
-					int oldFreq = state->dclk.freqM;
-					if (modminer_reduce_clock(thr, false))
-						applog(LOG_NOTICE, "%s %u.%u: Frequency %s from %u to %u Mhz (temp: %d)",
-						       modminer->api->name, modminer->device_id, fpgaid,
-						       (oldFreq > state->dclk.freqM ? "dropped" : "raised "),
-						       oldFreq * 2, state->dclk.freqM * 2,
-						       temperature
-						);
-				}
-			}
-		}
-	}
 
 	iter = 200;
 	while (1) {
@@ -738,6 +760,7 @@ struct device_api modminer_api = {
 	.name = "MMQ",
 	.api_detect = modminer_detect,
 	.get_statline_before = get_modminer_statline_before,
+	.get_stats = modminer_get_stats,
 	.get_api_extra_device_status = get_modminer_api_extra_device_status,
 	.thread_prepare = modminer_fpga_prepare,
 	.thread_init = modminer_fpga_init,

+ 3 - 3
driver-opencl.c

@@ -742,7 +742,7 @@ char *set_temp_target(char *arg)
 	if (val < 0 || val > 200)
 		return "Invalid value passed to set temp target";
 
-	tt = &gpus[device++].adl.targettemp;
+	tt = &gpus[device++].targettemp;
 	*tt = val;
 
 	while ((nextptr = strtok(NULL, ",")) != NULL) {
@@ -750,12 +750,12 @@ char *set_temp_target(char *arg)
 		if (val < 0 || val > 200)
 			return "Invalid value passed to set temp target";
 
-		tt = &gpus[device++].adl.targettemp;
+		tt = &gpus[device++].targettemp;
 		*tt = val;
 	}
 	if (device == 1) {
 		for (i = device; i < MAX_GPUDEVICES; i++) {
-			tt = &gpus[i].adl.targettemp;
+			tt = &gpus[i].targettemp;
 			*tt = val;
 		}
 	}

+ 43 - 5
miner.c

@@ -825,6 +825,9 @@ static void load_temp_cutoffs()
 		for (i = device; i < total_devices; ++i)
 			devices[i]->cutofftemp = val;
 	}
+	for (i = 0; i < total_devices; ++i)
+		if (!devices[i]->targettemp)
+			devices[i]->targettemp = devices[i]->cutofftemp - 6;
 }
 
 static char *set_api_allow(const char *arg)
@@ -4008,7 +4011,7 @@ void write_config(FILE *fcfg)
 			fprintf(fcfg, "%s%d", i > 0 ? "," : "", gpus[i].adl.overtemp);
 		fputs("\",\n\"temp-target\" : \"", fcfg);
 		for(i = 0; i < nDevs; i++)
-			fprintf(fcfg, "%s%d", i > 0 ? "," : "", gpus[i].adl.targettemp);
+			fprintf(fcfg, "%s%d", i > 0 ? "," : "", gpus[i].targettemp);
 #endif
 		fputs("\"", fcfg);
 	}
@@ -5709,6 +5712,19 @@ static void *watchpool_thread(void __maybe_unused *userdata)
 	return NULL;
 }
 
+/* Mark cgpu as enabled again and push a ping onto each of its threads' queues. */
+void device_recovered(struct cgpu_info *cgpu)
+{
+	int idx;
+
+	cgpu->deven = DEV_ENABLED;
+	for (idx = 0; idx < cgpu->threads; ++idx) {
+		struct thr_info *worker = cgpu->thr[idx];
+		applog(LOG_DEBUG, "Pushing ping to thread %d", worker->id);
+		tq_push(worker->q, &ping);
+	}
+}
+
 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
  * the screen at regular intervals, and restarts threads if they appear to have
  * died. */
@@ -5821,6 +5837,28 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 			/* Thread is disabled */
 			if (*denable == DEV_DISABLED)
 				continue;
+			else
+			if (*denable == DEV_RECOVER) {
+				if (opt_restart && cgpu->temp < cgpu->targettemp) {
+					applog(LOG_NOTICE, "%s %u recovered to temperature below target, re-enabling",
+					       cgpu->api->name, cgpu->device_id);
+					device_recovered(cgpu);
+				}
+				cgpu->device_last_not_well = time(NULL);
+				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
+				continue;
+			}
+			else
+			if (cgpu->temp > cgpu->cutofftemp)
+			{
+				applog(LOG_WARNING, "%s %u hit thermal cutoff limit, disabling!",
+				       cgpu->api->name, cgpu->device_id);
+				*denable = DEV_RECOVER;
+
+				cgpu->device_last_not_well = time(NULL);
+				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
+				++cgpu->dev_thermal_cutoff_count;
+			}
 
 			if (thr->getwork) {
 				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {
@@ -6264,7 +6302,7 @@ extern struct device_api ztex_api;
 
 static int cgminer_id_count = 0;
 
-void enable_device(struct cgpu_info *cgpu)
+void register_device(struct cgpu_info *cgpu)
 {
 	cgpu->deven = DEV_ENABLED;
 	devices[cgpu->cgminer_id = cgminer_id_count++] = cgpu;
@@ -6575,13 +6613,13 @@ int main(int argc, char *argv[])
 			if (devices_enabled & (1 << i)) {
 				if (i >= total_devices)
 					quit (1, "Command line options set a device that doesn't exist");
-				enable_device(devices[i]);
+				register_device(devices[i]);
 			} else if (i < total_devices) {
 				if (opt_removedisabled) {
 					if (devices[i]->api == &cpu_api)
 						--opt_n_threads;
 				} else {
-					enable_device(devices[i]);
+					register_device(devices[i]);
 				}
 				devices[i]->deven = DEV_DISABLED;
 			}
@@ -6589,7 +6627,7 @@ int main(int argc, char *argv[])
 		total_devices = cgminer_id_count;
 	} else {
 		for (i = 0; i < total_devices; ++i)
-			enable_device(devices[i]);
+			register_device(devices[i]);
 	}
 
 	if (!total_devices)

+ 1 - 1
miner.h

@@ -248,7 +248,6 @@ struct gpu_adl {
 	int lastengine;
 	int lasttemp;
 	int targetfan;
-	int targettemp;
 	int overtemp;
 	int minspeed;
 	int maxspeed;
@@ -433,6 +432,7 @@ struct cgpu_info {
 
 	float temp;
 	int cutofftemp;
+	int targettemp;
 
 #ifdef HAVE_ADL
 	bool has_adl;