Browse Source

Merge branch 'knc' into bfgminer

Luke Dashjr 12 years ago
parent
commit
1711e657ae
3 changed files with 109 additions and 10 deletions
  1. 93 3
      driver-knc.c
  2. 12 4
      miner.c
  3. 4 3
      miner.h

+ 93 - 3
driver-knc.c

@@ -38,6 +38,25 @@
 #define KNC_SPI_MODE  (SPI_CPHA | SPI_CPOL | SPI_CS_HIGH)
 #define KNC_SPI_MODE  (SPI_CPHA | SPI_CPOL | SPI_CS_HIGH)
 #define KNC_SPI_BITS  8
 #define KNC_SPI_BITS  8
 
 
+
+/*
+ The core disable/enable strategy is as follows:
+
+ If a core gets 10 HW errors in a row without doing any proper work
+ it is disabled for 10 seconds.
+
+ When a core gets 10 HW errors the next time it checks when it was enabled
+ the last time and compare that to when it started to get errors.
+ If those times are close (50%) the disabled time is doubled,
+ if not it is just disabled for 10s again.
+
+ */
+
+#define KNC_MAX_HWERR_IN_ROW    10
+#define KNC_HWERR_DISABLE_SECS (10)
+#define KNC_MAX_DISABLE_SECS   (15 * 60)
+
+
 static const char * const i2cpath = "/dev/i2c-2";
 static const char * const i2cpath = "/dev/i2c-2";
 
 
 #define KNC_I2C_TEMPLATE "/dev/i2c-%d"
 #define KNC_I2C_TEMPLATE "/dev/i2c-%d"
@@ -52,6 +71,11 @@ enum knc_reply_type {
 	KNC_REPLY_WORK_DONE   = 2,
 	KNC_REPLY_WORK_DONE   = 2,
 };
 };
 
 
+enum knc_i2c_core_status {
+	KNC_I2CSTATUS_DISABLED = 2,
+	KNC_I2CSTATUS_ENABLED  = 3,
+};
+
 struct device_drv knc_drv;
 struct device_drv knc_drv;
 
 
 struct knc_device {
 struct knc_device {
@@ -74,6 +98,11 @@ struct knc_core {
 	
 	
 	float volt;
 	float volt;
 	float current;
 	float current;
+	
+	int hwerr_in_row;
+	int hwerr_disable_time;
+	struct timeval enable_at;
+	struct timeval first_hwerr;
 };
 };
 
 
 static
 static
@@ -256,10 +285,22 @@ bool knc_init(struct thr_info * const thr)
 				*knccore = (struct knc_core){
 				*knccore = (struct knc_core){
 					.asicno = i2cslave - 0x20,
 					.asicno = i2cslave - 0x20,
 					.coreno = i + j,
 					.coreno = i + j,
+					.hwerr_in_row = 0,
+					.hwerr_disable_time = KNC_HWERR_DISABLE_SECS,
 				};
 				};
+				timer_set_now(&knccore->enable_at);
 				proc->device_data = knc;
 				proc->device_data = knc;
-				if (buf[j] != 3)
-					proc->deven = DEV_DISABLED;
+				switch (buf[j])
+				{
+					case KNC_I2CSTATUS_ENABLED:
+						break;
+					default:  // permanently disabled
+						proc->deven = DEV_DISABLED;
+						break;
+					case KNC_I2CSTATUS_DISABLED:
+						proc->deven = DEV_RECOVER_DRV;
+						break;
+				}
 				
 				
 				proc = proc->next_proc;
 				proc = proc->next_proc;
 				if ((!proc) || proc->device == proc)
 				if ((!proc) || proc->device == proc)
@@ -485,6 +526,7 @@ void knc_poll(struct thr_info * const thr)
 		for (i = 0; i < coreno; ++i)
 		for (i = 0; i < coreno; ++i)
 			proc = proc->next_proc;
 			proc = proc->next_proc;
 		mythr = proc->thr[0];
 		mythr = proc->thr[0];
+		knccore = mythr->cgpu_data;
 		
 		
 		i = get_u16be(&rxbuf[2]);
 		i = get_u16be(&rxbuf[2]);
 		HASH_FIND_INT(knc->devicework, &i, work);
 		HASH_FIND_INT(knc->devicework, &i, work);
@@ -509,7 +551,8 @@ void knc_poll(struct thr_info * const thr)
 			case KNC_REPLY_NONCE_FOUND:
 			case KNC_REPLY_NONCE_FOUND:
 				nonce = get_u32be(&rxbuf[4]);
 				nonce = get_u32be(&rxbuf[4]);
 				nonce = le32toh(nonce);
 				nonce = le32toh(nonce);
-				submit_nonce(mythr, work, nonce);
+				if (submit_nonce(mythr, work, nonce))
+					knccore->hwerr_in_row = 0;
 				break;
 				break;
 			case KNC_REPLY_WORK_DONE:
 			case KNC_REPLY_WORK_DONE:
 				HASH_DEL(knc->devicework, work);
 				HASH_DEL(knc->devicework, work);
@@ -580,6 +623,8 @@ void knc_core_disable(struct thr_info * const thr)
 static
 static
 void knc_core_enable(struct thr_info * const thr)
 void knc_core_enable(struct thr_info * const thr)
 {
 {
+	struct knc_core * const knccore = thr->cgpu_data;
+	timer_set_now(&knccore->enable_at);
 	_knc_core_setstatus(thr, 1);
 	_knc_core_setstatus(thr, 1);
 }
 }
 
 
@@ -597,6 +642,41 @@ float knc_dcdc_decode_5_11(uint16_t raw)
 	return dcdc_vin;
 	return dcdc_vin;
 }
 }
 
 
+static
+void knc_hw_error(struct thr_info * const thr)
+{
+	struct cgpu_info * const proc = thr->cgpu;
+	struct knc_core * const knccore = thr->cgpu_data;
+	
+	if(knccore->hwerr_in_row == 0)
+		timer_set_now(&knccore->first_hwerr);
+	++knccore->hwerr_in_row;
+	
+	if (knccore->hwerr_in_row >= KNC_MAX_HWERR_IN_ROW && proc->deven == DEV_ENABLED)
+	{
+		struct timeval now;
+		timer_set_now(&now);
+		float first_err_dt  = tdiff(&now, &knccore->first_hwerr);
+		float enable_dt     = tdiff(&now, &knccore->enable_at);
+		
+		if(first_err_dt * 1.5 > enable_dt)
+		{
+			// didn't really do much good
+			knccore->hwerr_disable_time *= 2;
+			if (knccore->hwerr_disable_time > KNC_MAX_DISABLE_SECS)
+				knccore->hwerr_disable_time = KNC_MAX_DISABLE_SECS;
+		}
+		else
+			knccore->hwerr_disable_time  = KNC_HWERR_DISABLE_SECS;
+		proc->deven = DEV_RECOVER_DRV;
+		applog(LOG_WARNING, "%"PRIpreprv": Disabled. %d hwerr in %.3f / %.3f . disabled %d s",
+		       proc->proc_repr, knccore->hwerr_in_row,
+		       enable_dt, first_err_dt, knccore->hwerr_disable_time);
+		
+		timer_set_delay_from_now(&knccore->enable_at, knccore->hwerr_disable_time * 1000000);
+	}
+}
+
 static
 static
 bool knc_get_stats(struct cgpu_info * const cgpu)
 bool knc_get_stats(struct cgpu_info * const cgpu)
 {
 {
@@ -613,6 +693,7 @@ bool knc_get_stats(struct cgpu_info * const cgpu)
 	int i2c;
 	int i2c;
 	int32_t rawtemp, rawvolt, rawcurrent;
 	int32_t rawtemp, rawvolt, rawcurrent;
 	float temp, volt, current;
 	float temp, volt, current;
+	struct timeval tv_now;
 	bool rv = false;
 	bool rv = false;
 	
 	
 	char i2cpath[sizeof(KNC_I2C_TEMPLATE)];
 	char i2cpath[sizeof(KNC_I2C_TEMPLATE)];
@@ -648,6 +729,7 @@ bool knc_get_stats(struct cgpu_info * const cgpu)
 	   Datasheet at http://www.lineagepower.com/oem/pdf/MDT040A0X.pdf
 	   Datasheet at http://www.lineagepower.com/oem/pdf/MDT040A0X.pdf
 	*/
 	*/
 
 
+	timer_set_now(&tv_now);
 	for (proc = cgpu, i = 0; proc && proc->device == cgpu; proc = proc->next_proc, ++i)
 	for (proc = cgpu, i = 0; proc && proc->device == cgpu; proc = proc->next_proc, ++i)
 	{
 	{
 		thr = proc->thr[0];
 		thr = proc->thr[0];
@@ -681,6 +763,13 @@ bool knc_get_stats(struct cgpu_info * const cgpu)
 		proc->temp = temp;
 		proc->temp = temp;
 		knccore->volt = volt;
 		knccore->volt = volt;
 		knccore->current = current;
 		knccore->current = current;
+		
+		// NOTE: We need to check _mt_disable_called because otherwise enabling won't assert it to i2c (it's false when getting stats for eg proc 0 before proc 1+ haven't initialised completely yet)
+		if (proc->deven == DEV_RECOVER_DRV && timer_passed(&knccore->enable_at, &tv_now) && thr->_mt_disable_called)
+		{
+			knccore->hwerr_in_row = 0;
+			proc_enable(proc);
+		}
 	}
 	}
 	
 	
 	rv = true;
 	rv = true;
@@ -727,6 +816,7 @@ struct device_drv knc_drv = {
 	.queue_append = knc_queue_append,
 	.queue_append = knc_queue_append,
 	.queue_flush = knc_queue_flush,
 	.queue_flush = knc_queue_flush,
 	.poll = knc_poll,
 	.poll = knc_poll,
+	.hw_error = knc_hw_error,
 	
 	
 	.get_stats = knc_get_stats,
 	.get_stats = knc_get_stats,
 	.get_api_extra_device_status = knc_api_extra_device_status,
 	.get_api_extra_device_status = knc_api_extra_device_status,

+ 12 - 4
miner.c

@@ -3212,8 +3212,9 @@ void get_statline3(char *buf, size_t bufsz, struct cgpu_info *cgpu, bool for_cur
 	if (for_curses)
 	if (for_curses)
 	{
 	{
 		const char *cHrStatsOpt[] = {"\2DEAD \1", "\2SICK \1", "OFF  ", "\2REST \1", " \2ERR \1", "\2WAIT \1", cHr};
 		const char *cHrStatsOpt[] = {"\2DEAD \1", "\2SICK \1", "OFF  ", "\2REST \1", " \2ERR \1", "\2WAIT \1", cHr};
+		const char *cHrStats;
 		int cHrStatsI = (sizeof(cHrStatsOpt) / sizeof(*cHrStatsOpt)) - 1;
 		int cHrStatsI = (sizeof(cHrStatsOpt) / sizeof(*cHrStatsOpt)) - 1;
-		bool all_dead = true, all_off = true;
+		bool all_dead = true, all_off = true, all_rdrv = true;
 		struct cgpu_info *proc = cgpu;
 		struct cgpu_info *proc = cgpu;
 		for (int i = 0; i < cgpu->procs; ++i, (proc = proc->next_proc))
 		for (int i = 0; i < cgpu->procs; ++i, (proc = proc->next_proc))
 		{
 		{
@@ -3234,8 +3235,12 @@ void get_statline3(char *buf, size_t bufsz, struct cgpu_info *cgpu, bool for_cur
 						all_off = false;
 						all_off = false;
 					}
 					}
 					else
 					else
-					if (likely(proc->deven != DEV_DISABLED))
-						all_off = false;
+					{
+						if (likely(proc->deven == DEV_ENABLED))
+							all_off = false;
+						if (proc->deven != DEV_RECOVER_DRV)
+							all_rdrv = false;
+					}
 				case 1:
 				case 1:
 					break;
 					break;
 			}
 			}
@@ -3249,9 +3254,12 @@ void get_statline3(char *buf, size_t bufsz, struct cgpu_info *cgpu, bool for_cur
 		else
 		else
 		if (unlikely(all_off))
 		if (unlikely(all_off))
 			cHrStatsI = 2;
 			cHrStatsI = 2;
+		cHrStats = cHrStatsOpt[cHrStatsI];
+		if (cHrStatsI == 2 && all_rdrv)
+			cHrStats = " RST ";
 		
 		
 		format_statline(buf, bufsz,
 		format_statline(buf, bufsz,
-		                cHrStatsOpt[cHrStatsI],
+		                cHrStats,
 		                aHr, uHr,
 		                aHr, uHr,
 		                accepted, rejected, stale,
 		                accepted, rejected, stale,
 		                wnotaccepted, waccepted,
 		                wnotaccepted, waccepted,

+ 4 - 3
miner.h

@@ -343,9 +343,10 @@ struct device_drv {
 
 
 enum dev_enable {
 enum dev_enable {
 	DEV_ENABLED,
 	DEV_ENABLED,
-	DEV_DISABLED,
-	DEV_RECOVER,
-	DEV_RECOVER_ERR,
+	DEV_DISABLED,     // Disabled by user
+	DEV_RECOVER,      // Disabled by temperature cutoff in watchdog
+	DEV_RECOVER_ERR,  // Disabled by communications error
+	DEV_RECOVER_DRV,  // Disabled by driver
 };
 };
 
 
 enum cl_kernels {
 enum cl_kernels {