13 years ago · a33bcbd9ce
--- a/adl.c
+++ b/adl.c
@@ -490,8 +490,8 @@ void init_adl(int nDevs)
 
				 		}
			
 
				 
			
 
				 		/* Set some default temperatures for autotune when enabled */
			
 
				-		if (!ga->targettemp)
			
 
				-			ga->targettemp = opt_targettemp;
			
 
				+		if (!gpus[gpu].targettemp)
			
 
				+			gpus[gpu].targettemp = opt_targettemp;
			
 
				 		if (!ga->overtemp)
			
 
				 			ga->overtemp = opt_overheattemp;
			
 
				 		if (!gpus[gpu].cutofftemp)
			
@@ -505,6 +505,7 @@ void init_adl(int nDevs)
 
				 			ga->autoengine = true;
			
 
				 			ga->managed = true;
			
 
				 		}
			
 
				+		gpus[gpu].temp =
			
 
				 		ga->lasttemp = __gpu_temp(ga);
			
 
				 	}
			
 
				 
			
@@ -744,6 +745,7 @@ bool gpu_stats(int gpu, float *temp, int *engineclock, int *memclock, float *vdd
 
				 	ga = &gpus[gpu].adl;
			
 
				 
			
 
				 	lock_adl();
			
 
				+	gpus[gpu].temp =
			
 
				 	*temp = __gpu_temp(ga);
			
 
				 	if (ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity) != ADL_OK) {
			
 
				 		*engineclock = 0;
			
@@ -1054,15 +1056,15 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
 
				 		cgpu->device_last_not_well = time(NULL);
			
 
				 		cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
			
 
				 		cgpu->dev_over_heat_count++;
			
 
				-	} else if (temp > ga->targettemp && fanpercent < top && tdiff >= 0) {
			
 
				+	} else if (temp > gpus[gpu].targettemp && fanpercent < top && tdiff >= 0) {
			
 
				 		applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
			
 
				-		if (temp > ga->targettemp + opt_hysteresis)
			
 
				+		if (temp > gpus[gpu].targettemp + opt_hysteresis)
			
 
				 			newpercent = ga->targetfan + 10;
			
 
				 		else
			
 
				 			newpercent = ga->targetfan + 5;
			
 
				 		if (newpercent > top)
			
 
				 			newpercent = top;
			
 
				-	} else if (fanpercent > bot && temp < ga->targettemp - opt_hysteresis) {
			
 
				+	} else if (fanpercent > bot && temp < gpus[gpu].targettemp - opt_hysteresis) {
			
 
				 		/* Detect large swings of 5 degrees or more and change fan by
			
 
				 		 * a proportion more */
			
 
				 		if (tdiff <= 0) {
			
@@ -1076,10 +1078,10 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
 
				 
			
 
				 		/* We're in the optimal range, make minor adjustments if the
			
 
				 		 * temp is still drifting */
			
 
				-		if (fanpercent > bot && tdiff < 0 && lasttemp < ga->targettemp) {
			
 
				+		if (fanpercent > bot && tdiff < 0 && lasttemp < gpus[gpu].targettemp) {
			
 
				 			applog(LOG_DEBUG, "Temperature dropping while in target range, decreasing fanspeed");
			
 
				 			newpercent = ga->targetfan + tdiff;
			
 
				-		} else if (fanpercent < top && tdiff > 0 && temp > ga->targettemp - opt_hysteresis) {
			
 
				+		} else if (fanpercent < top && tdiff > 0 && temp > gpus[gpu].targettemp - opt_hysteresis) {
			
 
				 			applog(LOG_DEBUG, "Temperature rising while in target range, increasing fanspeed");
			
 
				 			newpercent = ga->targetfan + tdiff;
			
 
				 		}
			
@@ -1120,6 +1122,7 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
				 
			
 
				 	lock_adl();
			
 
				 	ADL_Overdrive5_CurrentActivity_Get(ga->iAdapterIndex, &ga->lpActivity);
			
 
				+	gpus[gpu].temp =
			
 
				 	temp = __gpu_temp(ga);
			
 
				 	if (ga->twin)
			
 
				 		twintemp = __gpu_temp(ga->twin);
			
@@ -1154,16 +1157,8 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
				 
			
 
				 	if (engine && ga->autoengine) {
			
 
				 		if (temp > cgpu->cutofftemp) {
			
 
				-			if (*denable == DEV_ENABLED) {
			
 
				-				applog(LOG_WARNING, "Hit thermal cutoff limit on GPU %d, disabling!", gpu);
			
 
				-				*denable = DEV_RECOVER;
			
 
				-				++cgpu->dev_thermal_cutoff_count;
			
 
				-			}
			
 
				-
			
 
				+			// Shutoff and recovery happens back in watchdog_thread
			
 
				 			newengine = ga->minspeed;
			
 
				-
			
 
				-			cgpu->device_last_not_well = time(NULL);
			
 
				-			cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
			
 
				 		} else if (temp > ga->overtemp && engine > ga->minspeed) {
			
 
				 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
			
 
				 			newengine = ga->minspeed;
			
@@ -1171,19 +1166,16 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 
				 			cgpu->device_last_not_well = time(NULL);
			
 
				 			cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
			
 
				 			cgpu->dev_over_heat_count++;
			
 
				-		} else if (temp > ga->targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
			
 
				+		} else if (temp > gpus[gpu].targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
			
 
				 			applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
			
 
				 			newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
			
 
				 			/* Only try to tune engine speed up if this GPU is not disabled */
			
 
				-		} else if (temp < ga->targettemp && engine < ga->maxspeed && fan_window && *denable == DEV_ENABLED) {
			
 
				+		} else if (temp < gpus[gpu].targettemp && engine < ga->maxspeed && fan_window && *denable == DEV_ENABLED) {
			
 
				 			applog(LOG_DEBUG, "Temperature below target, increasing clock speed");
			
 
				-			if (temp < ga->targettemp - opt_hysteresis)
			
 
				+			if (temp < gpus[gpu].targettemp - opt_hysteresis)
			
 
				 				newengine = ga->maxspeed;
			
 
				 			else
			
 
				 				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
			
 
				-		} else if (temp < ga->targettemp && *denable == DEV_RECOVER && opt_restart) {
			
 
				-			applog(LOG_NOTICE, "Device recovered to temperature below target, re-enabling");
			
 
				-			*denable = DEV_ENABLED;
			
 
				 		}
			
 
				 
			
 
				 		if (newengine > ga->maxspeed)
			
@@ -1235,7 +1227,7 @@ void change_autosettings(int gpu)
 
				 	char input;
			
 
				 	int val;
			
 
				 
			
 
				-	wlogprint("Target temperature: %d\n", ga->targettemp);
			
 
				+	wlogprint("Target temperature: %d\n", gpus[gpu].targettemp);
			
 
				 	wlogprint("Overheat temperature: %d\n", ga->overtemp);
			
 
				 	wlogprint("Cutoff temperature: %d\n", gpus[gpu].cutofftemp);
			
 
				 	wlogprint("Toggle [F]an auto [G]PU auto\nChange [T]arget [O]verheat [C]utoff\n");
			
@@ -1260,11 +1252,11 @@ void change_autosettings(int gpu)
 
				 		if (val < 0 || val > 200)
			
 
				 			wlogprint("Invalid temperature");
			
 
				 		else
			
 
				-			ga->targettemp = val;
			
 
				+			gpus[gpu].targettemp = val;
			
 
				 	} else if (!strncasecmp(&input, "o", 1)) {
			
 
				-		wlogprint("Enter overheat temperature for this GPU in C (%d+)", ga->targettemp);
			
 
				+		wlogprint("Enter overheat temperature for this GPU in C (%d+)", gpus[gpu].targettemp);
			
 
				 		val = curses_int("");
			
 
				-		if (val <= ga->targettemp || val > 200)
			
 
				+		if (val <= gpus[gpu].targettemp || val > 200)
			
 
				 			wlogprint("Invalid temperature");
			
 
				 		else
			
 
				 			ga->overtemp = val;
			
--- a/driver-bitforce.c
+++ b/driver-bitforce.c
@@ -308,16 +308,6 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
 
				 
			
 
				 		if (temp > 0) {
			
 
				 			bitforce->temp = temp;
			
 
				-			if (unlikely(bitforce->cutofftemp > 0 && temp > bitforce->cutofftemp)) {
			
 
				-				if (bitforce->deven == DEV_ENABLED) {
			
 
				-					applog(LOG_WARNING, "BFL%i: Hit thermal cutoff limit, disabling!", bitforce->device_id);
			
 
				-					bitforce->deven = DEV_RECOVER;
			
 
				-					++bitforce->dev_thermal_cutoff_count;
			
 
				-				}
			
 
				-
			
 
				-				bitforce->device_last_not_well = time(NULL);
			
 
				-				bitforce->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
			
 
				-			}
			
 
				 		}
			
 
				 	} else {
			
 
				 		/* Use the temperature monitor as a kind of watchdog for when
			
--- a/driver-modminer.c
+++ b/driver-modminer.c
@@ -490,6 +490,64 @@ get_modminer_statline_before(char *buf, struct cgpu_info *modminer)
 
				 		strcat(buf, "               | ");
			
 
				 }
			
 
				 
			
 
				+/* Read one FPGA's temperature and throttle its clock when close to cutoff.
				+ *
				+ * Writes the two-byte command {0x0a, fpgaid} to the device fd and reads back
				+ * a single temperature byte, which is stored in the thread's FPGA state.
				+ * When the reading is above (cutofftemp - 2) the clock is reduced through
				+ * modminer_reduce_clock(), at most once per distinct time(NULL) second.
				+ * If the write or read does not complete, the stored temperature is left
				+ * untouched.  Disabling the device on cutoff is not done here.
				+ *
				+ * modminer: device whose fd and cutofftemp are used
				+ * thr:      mining thread for the FPGA; thr->device_thread is the FPGA id
				+ */
				+static void modminer_get_temperature(struct cgpu_info *modminer, struct thr_info *thr)
				+{
				+	struct modminer_fpga_state *state = thr->cgpu_data;
				+
				+#ifdef WIN32
				+	/* Workaround for bug in Windows driver */
				+	if (!modminer_reopen(modminer))
				+		return;	/* void function: must not return a value */
				+#endif
				+
				+	int fd = modminer->device_fd;
				+	int fpgaid = thr->device_thread;
				+	char cmd[2] = {'\x0a', fpgaid};
				+	char temperature;
				+
				+	if (2 == write(fd, cmd, 2) && read(fd, &temperature, 1) == 1)
				+	{
				+		state->temp = temperature;
				+		if (temperature > modminer->cutofftemp - 2) {
				+			time_t now = time(NULL);
				+			/* Rate limit: only one clock reduction per second */
				+			if (state->last_cutoff_reduced != now) {
				+				state->last_cutoff_reduced = now;
				+				int oldFreq = state->dclk.freqM;
				+				if (modminer_reduce_clock(thr, false))
				+					applog(LOG_NOTICE, "%s %u.%u: Frequency %s from %u to %u Mhz (temp: %d)",
				+					       modminer->api->name, modminer->device_id, fpgaid,
				+					       (oldFreq > state->dclk.freqM ? "dropped" : "raised "),
				+					       oldFreq * 2, state->dclk.freqM * 2,
				+					       temperature
				+					);
				+			}
				+		}
				+	}
				+}
			
 
				+
			
 
				+/* Update modminer->temp with the hottest per-FPGA temperature.
				+ *
				+ * If the device is not DEV_ENABLED, each FPGA is polled here via
				+ * modminer_get_temperature() before its stored value is read; otherwise
				+ * the values already held in each thread's state are used as-is.
				+ * The result is 0 when no FPGA reports a positive temperature.
				+ * Always returns true.
				+ */
				+static bool modminer_get_stats(struct cgpu_info *modminer)
				+{
				+	/* Poll the hardware from here only while the device is not enabled */
				+	bool poll_hw = (modminer->deven != DEV_ENABLED);
				+	int max_temp = 0;
				+	int idx = modminer->threads;
				+
				+	/* NOTE: Don't need to mess with mutex here, since the device is disabled */
				+	while (idx--) {
				+		struct thr_info *fpga_thr = modminer->thr[idx];
				+		struct modminer_fpga_state *fpga = fpga_thr->cgpu_data;
				+
				+		if (poll_hw)
				+			modminer_get_temperature(modminer, fpga_thr);
				+
				+		int reading = fpga->temp;
				+		if (reading > max_temp)
				+			max_temp = reading;
				+	}
				+
				+	modminer->temp = (float)max_temp;
				+
				+	return true;
				+}
			
 
				+
			
 
				 static struct api_data*
			
 
				 get_modminer_api_extra_device_status(struct cgpu_info*modminer)
			
 
				 {
			
@@ -574,50 +632,14 @@ modminer_process_results(struct thr_info*thr)
 
				 	int fd;
			
 
				 	struct work *work = &state->running_work;
			
 
				 
			
 
				-	char cmd[2], temperature;
			
 
				 	uint32_t nonce;
			
 
				 	long iter;
			
 
				 	int immediate_bad_nonces = 0, immediate_nonces = 0;
			
 
				 	bool bad;
			
 
				-	cmd[0] = '\x0a';
			
 
				-	cmd[1] = fpgaid;
			
 
				 
			
 
				 	mutex_lock(&modminer->device_mutex);
			
 
				-#ifdef WIN32
			
 
				-	/* Workaround for bug in Windows driver */
			
 
				-	if (!modminer_reopen(modminer))
			
 
				-		return -1;
			
 
				-#endif
			
 
				+	modminer_get_temperature(modminer, thr);
			
 
				 	fd = modminer->device_fd;
			
 
				-	if (2 == write(fd, cmd, 2) && read(fd, &temperature, 1) == 1)
			
 
				-	{
			
 
				-		state->temp = temperature;
			
 
				-		if (!fpgaid)
			
 
				-			modminer->temp = (float)temperature;
			
 
				-		if (temperature > modminer->cutofftemp - 2) {
			
 
				-			if (temperature > modminer->cutofftemp) {
			
 
				-				applog(LOG_WARNING, "%s %u.%u: Hit thermal cutoff limit, disabling device!", modminer->api->name, modminer->device_id, fpgaid);
			
 
				-				modminer->deven = DEV_RECOVER;
			
 
				-
			
 
				-				modminer->device_last_not_well = time(NULL);
			
 
				-				modminer->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
			
 
				-				++modminer->dev_thermal_cutoff_count;
			
 
				-			} else {
			
 
				-				time_t now = time(NULL);
			
 
				-				if (state->last_cutoff_reduced != now) {
			
 
				-					state->last_cutoff_reduced = now;
			
 
				-					int oldFreq = state->dclk.freqM;
			
 
				-					if (modminer_reduce_clock(thr, false))
			
 
				-						applog(LOG_NOTICE, "%s %u.%u: Frequency %s from %u to %u Mhz (temp: %d)",
			
 
				-						       modminer->api->name, modminer->device_id, fpgaid,
			
 
				-						       (oldFreq > state->dclk.freqM ? "dropped" : "raised "),
			
 
				-						       oldFreq * 2, state->dclk.freqM * 2,
			
 
				-						       temperature
			
 
				-						);
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				 
			
 
				 	iter = 200;
			
 
				 	while (1) {
			
@@ -738,6 +760,7 @@ struct device_api modminer_api = {
 
				 	.name = "MMQ",
			
 
				 	.api_detect = modminer_detect,
			
 
				 	.get_statline_before = get_modminer_statline_before,
			
 
				+	.get_stats = modminer_get_stats,
			
 
				 	.get_api_extra_device_status = get_modminer_api_extra_device_status,
			
 
				 	.thread_prepare = modminer_fpga_prepare,
			
 
				 	.thread_init = modminer_fpga_init,
			
--- a/driver-opencl.c
+++ b/driver-opencl.c
@@ -742,7 +742,7 @@ char *set_temp_target(char *arg)
 
				 	if (val < 0 || val > 200)
			
 
				 		return "Invalid value passed to set temp target";
			
 
				 
			
 
				-	tt = &gpus[device++].adl.targettemp;
			
 
				+	tt = &gpus[device++].targettemp;
			
 
				 	*tt = val;
			
 
				 
			
 
				 	while ((nextptr = strtok(NULL, ",")) != NULL) {
			
@@ -750,12 +750,12 @@ char *set_temp_target(char *arg)
 
				 		if (val < 0 || val > 200)
			
 
				 			return "Invalid value passed to set temp target";
			
 
				 
			
 
				-		tt = &gpus[device++].adl.targettemp;
			
 
				+		tt = &gpus[device++].targettemp;
			
 
				 		*tt = val;
			
 
				 	}
			
 
				 	if (device == 1) {
			
 
				 		for (i = device; i < MAX_GPUDEVICES; i++) {
			
 
				-			tt = &gpus[i].adl.targettemp;
			
 
				+			tt = &gpus[i].targettemp;
			
 
				 			*tt = val;
			
 
				 		}
			
 
				 	}
			
--- a/miner.c
+++ b/miner.c
@@ -825,6 +825,9 @@ static void load_temp_cutoffs()
 
				 		for (i = device; i < total_devices; ++i)
			
 
				 			devices[i]->cutofftemp = val;
			
 
				 	}
			
 
				+	for (i = 0; i < total_devices; ++i)
			
 
				+		if (!devices[i]->targettemp)
			
 
				+			devices[i]->targettemp = devices[i]->cutofftemp - 6;
			
 
				 }
			
 
				 
			
 
				 static char *set_api_allow(const char *arg)
			
@@ -4008,7 +4011,7 @@ void write_config(FILE *fcfg)
 
				 			fprintf(fcfg, "%s%d", i > 0 ? "," : "", gpus[i].adl.overtemp);
			
 
				 		fputs("\",\n\"temp-target\" : \"", fcfg);
			
 
				 		for(i = 0; i < nDevs; i++)
			
 
				-			fprintf(fcfg, "%s%d", i > 0 ? "," : "", gpus[i].adl.targettemp);
			
 
				+			fprintf(fcfg, "%s%d", i > 0 ? "," : "", gpus[i].targettemp);
			
 
				 #endif
			
 
				 		fputs("\"", fcfg);
			
 
				 	}
			
@@ -5709,6 +5712,19 @@ static void *watchpool_thread(void __maybe_unused *userdata)
 
				 	return NULL;
			
 
				 }
			
 
				 
			
 
				+/* Flag a device as enabled again and ping every one of its threads.
				+ *
				+ * Sets cgpu->deven to DEV_ENABLED, then, for each thread of the device in
				+ * index order, logs a debug message and pushes `ping` onto that thread's
				+ * queue.
				+ */
				+void device_recovered(struct cgpu_info *cgpu)
				+{
				+	int idx = 0;
				+
				+	cgpu->deven = DEV_ENABLED;
				+	while (idx < cgpu->threads) {
				+		struct thr_info *worker = cgpu->thr[idx++];
				+
				+		applog(LOG_DEBUG, "Pushing ping to thread %d", worker->id);
				+		tq_push(worker->q, &ping);
				+	}
				+}
			
 
				+
			
 
				 /* Makes sure the hashmeter keeps going even if mining threads stall, updates
			
 
				  * the screen at regular intervals, and restarts threads if they appear to have
			
 
				  * died. */
			
@@ -5821,6 +5837,28 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 
				 			/* Thread is disabled */
			
 
				 			if (*denable == DEV_DISABLED)
			
 
				 				continue;
			
 
				+			else
			
 
				+			if (*denable == DEV_RECOVER) {
			
 
				+				if (opt_restart && cgpu->temp < cgpu->targettemp) {
			
 
				+					applog(LOG_NOTICE, "%s %u recovered to temperature below target, re-enabling",
			
 
				+					       cgpu->api->name, cgpu->device_id);
			
 
				+					device_recovered(cgpu);
			
 
				+				}
			
 
				+				cgpu->device_last_not_well = time(NULL);
			
 
				+				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
			
 
				+				continue;
			
 
				+			}
			
 
				+			else
			
 
				+			if (cgpu->temp > cgpu->cutofftemp)
			
 
				+			{
			
 
				+				applog(LOG_WARNING, "%s %u hit thermal cutoff limit, disabling!",
			
 
				+				       cgpu->api->name, cgpu->device_id);
			
 
				+				*denable = DEV_RECOVER;
			
 
				+
			
 
				+				cgpu->device_last_not_well = time(NULL);
			
 
				+				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
			
 
				+				++cgpu->dev_thermal_cutoff_count;
			
 
				+			}
			
 
				 
			
 
				 			if (thr->getwork) {
			
 
				 				if (cgpu->status == LIFE_WELL && thr->getwork < now.tv_sec - opt_log_interval) {
			
@@ -6264,7 +6302,7 @@ extern struct device_api ztex_api;
 
				 
			
 
				 static int cgminer_id_count = 0;
			
 
				 
			
 
				-void enable_device(struct cgpu_info *cgpu)
			
 
				+void register_device(struct cgpu_info *cgpu)
			
 
				 {
			
 
				 	cgpu->deven = DEV_ENABLED;
			
 
				 	devices[cgpu->cgminer_id = cgminer_id_count++] = cgpu;
			
@@ -6575,13 +6613,13 @@ int main(int argc, char *argv[])
 
				 			if (devices_enabled & (1 << i)) {
			
 
				 				if (i >= total_devices)
			
 
				 					quit (1, "Command line options set a device that doesn't exist");
			
 
				-				enable_device(devices[i]);
			
 
				+				register_device(devices[i]);
			
 
				 			} else if (i < total_devices) {
			
 
				 				if (opt_removedisabled) {
			
 
				 					if (devices[i]->api == &cpu_api)
			
 
				 						--opt_n_threads;
			
 
				 				} else {
			
 
				-					enable_device(devices[i]);
			
 
				+					register_device(devices[i]);
			
 
				 				}
			
 
				 				devices[i]->deven = DEV_DISABLED;
			
 
				 			}
			
@@ -6589,7 +6627,7 @@ int main(int argc, char *argv[])
 
				 		total_devices = cgminer_id_count;
			
 
				 	} else {
			
 
				 		for (i = 0; i < total_devices; ++i)
			
 
				-			enable_device(devices[i]);
			
 
				+			register_device(devices[i]);
			
 
				 	}
			
 
				 
			
 
				 	if (!total_devices)
			
--- a/miner.h
+++ b/miner.h
@@ -248,7 +248,6 @@ struct gpu_adl {
 
				 	int lastengine;
			
 
				 	int lasttemp;
			
 
				 	int targetfan;
			
 
				-	int targettemp;
			
 
				 	int overtemp;
			
 
				 	int minspeed;
			
 
				 	int maxspeed;
			
@@ -433,6 +432,7 @@ struct cgpu_info {
 
				 
			
 
				 	float temp;
			
 
				 	int cutofftemp;
			
 
				+	int targettemp;
			
 
				 
			
 
				 #ifdef HAVE_ADL
			
 
				 	bool has_adl;