Browse Source

Merge branch 'cg_merges_2012118' into bfgminer

Luke Dashjr, 13 years ago
parent · commit bbb60ee0e1
9 changed files with 106 additions and 78 deletions
  1. adl.c              +6  −9
  2. driver-bitforce.c  +6  −12
  3. driver-icarus.c    +3  −9
  4. driver-modminer.c  +1  −1
  5. driver-ztex.c      +3  −3
  6. libztex.c          +2  −2
  7. miner.c            +30 −40
  8. util.c             +52 −2
  9. util.h             +3  −0

+ 6 - 9
adl.c

@@ -1056,9 +1056,7 @@ static bool fan_autotune(int gpu, int temp, int fanpercent, int lasttemp, bool *
 		applog(LOG_WARNING, "Overheat detected on GPU %d, increasing fan to 100%", gpu);
 		newpercent = iMax;
 
-		cgpu->device_last_not_well = time(NULL);
-		cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
-		cgpu->dev_over_heat_count++;
+		dev_error(cgpu, REASON_DEV_OVER_HEAT);
 	} else if (temp > gpus[gpu].targettemp && fanpercent < top && tdiff >= 0) {
 		applog(LOG_DEBUG, "Temperature over target, increasing fanspeed");
 		if (temp > gpus[gpu].targettemp + opt_hysteresis)
@@ -1166,19 +1164,18 @@ void gpu_autotune(int gpu, enum dev_enable *denable)
 			applog(LOG_WARNING, "Overheat detected, decreasing GPU %d clock speed", gpu);
 			newengine = ga->minspeed;
 
-			cgpu->device_last_not_well = time(NULL);
-			cgpu->device_not_well_reason = REASON_DEV_OVER_HEAT;
-			cgpu->dev_over_heat_count++;
+			dev_error(cgpu, REASON_DEV_OVER_HEAT);
 		} else if (temp > gpus[gpu].targettemp + opt_hysteresis && engine > ga->minspeed && fan_optimal) {
 			applog(LOG_DEBUG, "Temperature %d degrees over target, decreasing clock speed", opt_hysteresis);
 			newengine = engine - ga->lpOdParameters.sEngineClock.iStep;
 			/* Only try to tune engine speed up if this GPU is not disabled */
 		} else if (temp < gpus[gpu].targettemp && engine < ga->maxspeed && fan_window && *denable == DEV_ENABLED) {
+			int iStep = ga->lpOdParameters.sEngineClock.iStep;
+
 			applog(LOG_DEBUG, "Temperature below target, increasing clock speed");
 			if (temp < gpus[gpu].targettemp - opt_hysteresis)
-				newengine = ga->maxspeed;
-			else
-				newengine = engine + ga->lpOdParameters.sEngineClock.iStep;
+				iStep *= 2;
+			newengine = engine + iStep;
 		}
 
 		if (newengine > ga->maxspeed)

+ 6 - 12
driver-bitforce.c

@@ -29,7 +29,7 @@
 #define BITFORCE_LONG_TIMEOUT_MS (BITFORCE_LONG_TIMEOUT_S * 1000)
 #define BITFORCE_CHECK_INTERVAL_MS 10
 #define WORK_CHECK_INTERVAL_MS 50
-#define MAX_START_DELAY_US 100000
+#define MAX_START_DELAY_MS 100
 #define tv_to_ms(tval) (tval.tv_sec * 1000 + tval.tv_usec / 1000)
 #define TIME_AVG_CONSTANT 8
 
@@ -319,9 +319,7 @@ static bool bitforce_get_temp(struct cgpu_info *bitforce)
 		 * our responses are out of sync and flush the buffer to
 		 * hopefully recover */
 		applog(LOG_WARNING, "BFL%i: Garbled response probably throttling, clearing buffer", bitforce->device_id);
-		bitforce->device_last_not_well = time(NULL);
-		bitforce->device_not_well_reason = REASON_DEV_THROTTLE;
-		bitforce->dev_throttle_count++;
+		dev_error(bitforce, REASON_DEV_THROTTLE);
 		/* Count throttling episodes as hardware errors */
 		bitforce->hw_errors++;
 		bitforce_clear_buffer(bitforce);
@@ -465,9 +463,7 @@ static int64_t bitforce_get_result(struct thr_info *thr, struct work *work)
 	if (elapsed.tv_sec > BITFORCE_TIMEOUT_S) {
 		applog(LOG_ERR, "BFL%i: took %dms - longer than %dms", bitforce->device_id,
 			tv_to_ms(elapsed), BITFORCE_TIMEOUT_MS);
-		bitforce->device_last_not_well = time(NULL);
-		bitforce->device_not_well_reason = REASON_DEV_OVER_HEAT;
-		bitforce->dev_over_heat_count++;
+		dev_error(bitforce, REASON_DEV_OVER_HEAT);
 		++bitforce->hw_errors;
 		++hw_errors;
 
@@ -578,9 +574,7 @@ static int64_t bitforce_scanhash(struct thr_info *thr, struct work *work, int64_
 commerr:
 		ret = 0;
 		applog(LOG_ERR, "BFL%i: Comms error", bitforce->device_id);
-		bitforce->device_last_not_well = time(NULL);
-		bitforce->device_not_well_reason = REASON_DEV_COMMS_ERROR;
-		bitforce->dev_comms_error_count++;
+		dev_error(bitforce, REASON_DEV_COMMS_ERROR);
 		bitforce->hw_errors++;
 		BFclose(bitforce->device_fd);
 		int fd = bitforce->device_fd = BFopen(bitforce->device_path);
@@ -612,9 +606,9 @@ static bool bitforce_thread_init(struct thr_info *thr)
 
 	/* Pause each new thread at least 100ms between initialising
 	 * so the devices aren't making calls all at the same time. */
-	wait = thr->id * MAX_START_DELAY_US;
+	wait = thr->id * MAX_START_DELAY_MS;
 	applog(LOG_DEBUG, "BFL%i: Delaying start by %dms", bitforce->device_id, wait / 1000);
-	usleep(wait);
+	nmsleep(wait);
 
 	return true;
 }

+ 3 - 9
driver-icarus.c

@@ -707,9 +707,7 @@ static bool icarus_reopen(struct cgpu_info *icarus, struct icarus_state *state,
 	*fdp = icarus->device_fd = icarus_open(icarus->device_path, info->baud);
 	if (unlikely(-1 == *fdp)) {
 		applog(LOG_ERR, "%s %u: Failed to reopen on %s", icarus->api->name, icarus->device_id, icarus->device_path);
-		icarus->device_last_not_well = time(NULL);
-		icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
-		icarus->dev_comms_error_count++;
+		dev_error(icarus, REASON_DEV_COMMS_ERROR);
 		state->firstrun = true;
 		return false;
 	}
@@ -730,9 +728,7 @@ static bool icarus_start_work(struct thr_info *thr, const unsigned char *ob_bin)
 	if (ret) {
 		do_icarus_close(thr);
 		applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
-		icarus->device_last_not_well = time(NULL);
-		icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
-		icarus->dev_comms_error_count++;
+		dev_error(icarus, REASON_DEV_COMMS_ERROR);
 		return false;	/* This should never happen */
 	}
 
@@ -815,9 +811,7 @@ static int64_t icarus_scanhash(struct thr_info *thr, struct work *work,
 				case ICA_GETS_ERROR:
 					do_icarus_close(thr);
 					applog(LOG_ERR, "ICA%i: Comms error", icarus->device_id);
-					icarus->device_last_not_well = time(NULL);
-					icarus->device_not_well_reason = REASON_DEV_COMMS_ERROR;
-					icarus->dev_comms_error_count++;
+					dev_error(icarus, REASON_DEV_COMMS_ERROR);
 					if (!icarus_reopen(icarus, state, &fd))
 						return -1;
 					break;

+ 1 - 1
driver-modminer.c

@@ -668,7 +668,7 @@ modminer_process_results(struct thr_info*thr)
 		}
 		if (work_restart(thr) || !--iter)
 			break;
-		usleep(1000);
+		nmsleep(1);
 		if (work_restart(thr))
 			break;
 		mutex_lock(&modminer->device_mutex);

+ 3 - 3
driver-ztex.c

@@ -199,7 +199,7 @@ static int64_t ztex_scanhash(struct thr_info *thr, struct work *work,
 	if (i < 0) {
 		// Something wrong happened in send
 		applog(LOG_ERR, "%s: Failed to send hash data with err %d, retrying", ztex->repr, i);
-		usleep(500000);
+		nmsleep(500);
 		i = libztex_sendHashData(ztex, sendbuf);
 		if (i < 0) {
 			// And there's nothing we can do about it
@@ -234,7 +234,7 @@ static int64_t ztex_scanhash(struct thr_info *thr, struct work *work,
 
 	applog(LOG_DEBUG, "%s: entering poll loop", ztex->repr);
 	while (!(overflow || thr->work_restart)) {
-		usleep(250000);
+		nmsleep(250);
 		if (thr->work_restart) {
 			applog(LOG_DEBUG, "%s: New work detected", ztex->repr);
 			break;
@@ -244,7 +244,7 @@ static int64_t ztex_scanhash(struct thr_info *thr, struct work *work,
 		if (i < 0) {
 			// Something wrong happened in read
 			applog(LOG_ERR, "%s: Failed to read hash data with err %d, retrying", ztex->repr, i);
-			usleep(500000);
+			nmsleep(500);
 			i = libztex_readHashData(ztex, &hdata[0]);
 			if (i < 0) {
 				// And there's nothing we can do about it

+ 2 - 2
libztex.c

@@ -223,7 +223,7 @@ static int libztex_configureFpgaHS(struct libztex_device *ztex, const char* firm
 
 	libusb_release_interface(ztex->hndl, settings[1]);
 
-	usleep(200000);
+	nmsleep(200);
 	applog(LOG_INFO, "%s: HS FPGA configuration done", ztex->repr);
 	return 0;
 
@@ -312,7 +312,7 @@ static int libztex_configureFpgaLS(struct libztex_device *ztex, const char* firm
 		applog(LOG_ERR, "%s: FPGA configuration failed: DONE pin does not go high", ztex->repr);
 		return 3;
 	}
-	usleep(200000);
+	nmsleep(200);
 	applog(LOG_INFO, "%s: FPGA configuration done", ztex->repr);
 	return 0;
 }

+ 30 - 40
miner.c

@@ -2072,12 +2072,12 @@ static void curses_print_status(void)
 		total_getworks,
 		local_work, total_go, total_ro);
 	wclrtoeol(statuswin);
-	if (pool->has_stratum) {
-		mvwprintw(statuswin, 4, 0, " Connected to %s with stratum as user %s",
-			pool->sockaddr_url, pool->rpc_user);
-	} else if ((pool_strategy == POOL_LOADBALANCE  || pool_strategy == POOL_BALANCE) && total_pools > 1) {
+	if ((pool_strategy == POOL_LOADBALANCE  || pool_strategy == POOL_BALANCE) && total_pools > 1) {
 		mvwprintw(statuswin, 4, 0, " Connected to multiple pools with%s LP",
 			have_longpoll ? "": "out");
+	} else if (pool->has_stratum) {
+		mvwprintw(statuswin, 4, 0, " Connected to %s with stratum as user %s",
+			pool->sockaddr_url, pool->rpc_user);
 	} else {
 		mvwprintw(statuswin, 4, 0, " Connected to %s with%s LP as user %s",
 			pool->sockaddr_url, have_longpoll ? "": "out", pool->rpc_user);
@@ -3375,9 +3375,12 @@ static bool queue_request(void);
 static void pool_died(struct pool *pool)
 {
 	if (!pool_tset(pool, &pool->idle)) {
-		applog(LOG_WARNING, "Pool %d %s not responding!", pool->pool_no, pool->rpc_url);
 		gettimeofday(&pool->tv_idle, NULL);
-		switch_pools(NULL);
+		if (pool == current_pool()) {
+			applog(LOG_WARNING, "Pool %d %s not responding!", pool->pool_no, pool->rpc_url);
+			switch_pools(NULL);
+		} else
+			applog(LOG_INFO, "Pool %d %s failed to return work", pool->pool_no, pool->rpc_url);
 	}
 }
 
@@ -3666,21 +3669,10 @@ next_submit:
 			if (pool_tclear(pool, &pool->submit_fail))
 					applog(LOG_WARNING, "Pool %d communication resumed, submitting work", pool->pool_no);
 			applog(LOG_DEBUG, "Successfully submitted, adding to stratum_shares db");
-		} else {
-			applog(LOG_WARNING, "Failed to submit stratum share to pool %d", pool->pool_no);
-			mutex_lock(&sshare_lock);
-			HASH_DEL(stratum_shares, sshare);
-			mutex_unlock(&sshare_lock);
-			clear_work(&sshare->work);
-			free(sshare);
-			pool->stale_shares++;
-			total_stale++;
-
-			if (!pool_tset(pool, &pool->submit_fail)) {
-				total_ro++;
-				pool->remotefail_occasions++;
-				applog(LOG_WARNING, "Pool %d share submission failure", pool->pool_no);
-			}
+		} else if (!pool_tset(pool, &pool->submit_fail)) {
+			applog(LOG_WARNING, "Pool %d stratum share submission failure", pool->pool_no);
+			total_ro++;
+			pool->remotefail_occasions++;
 		}
 
 		goto out;
@@ -3944,12 +3936,17 @@ void switch_pools(struct pool *selected)
 	if (pool != last_pool)
 	{
 		pool->block_id = 0;
-		applog(LOG_WARNING, "Switching to %s", pool->rpc_url);
+		if (pool_strategy != POOL_LOADBALANCE && pool_strategy != POOL_BALANCE) {
+			applog(LOG_WARNING, "Switching to %s", pool->rpc_url);
+		}
 	}
 
 	mutex_lock(&lp_lock);
 	pthread_cond_broadcast(&lp_cond);
 	mutex_unlock(&lp_lock);
+
+	if (!pool->queued)
+		queue_request();
 }
 
 static void discard_work(struct work *work)
@@ -5760,9 +5757,11 @@ static inline int cp_prio(void)
 
 static void pool_resus(struct pool *pool)
 {
-	applog(LOG_WARNING, "Pool %d %s alive", pool->pool_no, pool->rpc_url);
-	if (pool->prio < cp_prio() && pool_strategy == POOL_FAILOVER)
+	if (pool->prio < cp_prio() && pool_strategy == POOL_FAILOVER) {
+		applog(LOG_WARNING, "Pool %d %s alive", pool->pool_no, pool->rpc_url);
 		switch_pools(NULL);
+	} else
+		applog(LOG_INFO, "Pool %d %s resumed returning work", pool->pool_no, pool->rpc_url);
 }
 
 static bool queue_request(void)
@@ -6051,6 +6050,8 @@ static void get_work(struct work *work, struct thr_info *thr, const int thr_id)
 	}
 
 retry:
+	if (pool_strategy == POOL_BALANCE || pool_strategy == POOL_LOADBALANCE)
+		switch_pools(NULL);
 	pool = current_pool();
 
 	if (reuse_work(work, pool))
@@ -6320,10 +6321,7 @@ void *miner_thread(void *userdata)
 	gettimeofday(&getwork_start, NULL);
 
 	if (api->thread_init && !api->thread_init(mythr)) {
-		cgpu->device_last_not_well = time(NULL);
-		cgpu->device_not_well_reason = REASON_THREAD_FAIL_INIT;
-		cgpu->thread_fail_init_count++;
-
+		dev_error(cgpu, REASON_THREAD_FAIL_INIT);
 		goto out;
 	}
 
@@ -6394,9 +6392,7 @@ void *miner_thread(void *userdata)
 			if (unlikely(hashes == -1)) {
 				time_t now = time(NULL);
 				if (difftime(now, cgpu->device_last_not_well) > 1.) {
-					cgpu->device_last_not_well = time(NULL);
-					cgpu->device_not_well_reason = REASON_THREAD_ZERO_HASH;
-					cgpu->thread_zero_hash_count++;
+					dev_error(cgpu, REASON_THREAD_ZERO_HASH);
 				}
 
 				if (scanhash_working && opt_restart) {
@@ -6977,9 +6973,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 				       cgpu->api->name, cgpu->device_id);
 				*denable = DEV_RECOVER;
 
-				cgpu->device_last_not_well = time(NULL);
-				cgpu->device_not_well_reason = REASON_DEV_THERMAL_CUTOFF;
-				++cgpu->dev_thermal_cutoff_count;
+				dev_error(cgpu, REASON_DEV_THERMAL_CUTOFF);
 			}
 
 			if (thr->getwork) {
@@ -7015,9 +7009,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 				applog(LOG_ERR, "%s: Idle for more than 60 seconds, declaring SICK!", dev_str);
 				gettimeofday(&thr->sick, NULL);
 
-				cgpu->device_last_not_well = time(NULL);
-				cgpu->device_not_well_reason = REASON_DEV_SICK_IDLE_60;
-				cgpu->dev_sick_idle_60_count++;
+				dev_error(cgpu, REASON_DEV_SICK_IDLE_60);
 #ifdef HAVE_ADL
 				if (adl_active && cgpu->has_adl && gpu_activity(gpu) > 50) {
 					applog(LOG_ERR, "GPU still showing activity suggesting a hard hang.");
@@ -7033,9 +7025,7 @@ static void *watchdog_thread(void __maybe_unused *userdata)
 				applog(LOG_ERR, "%s: Not responded for more than 10 minutes, declaring DEAD!", dev_str);
 				gettimeofday(&thr->sick, NULL);
 
-				cgpu->device_last_not_well = time(NULL);
-				cgpu->device_not_well_reason = REASON_DEV_DEAD_IDLE_600;
-				cgpu->dev_dead_idle_600_count++;
+				dev_error(cgpu, REASON_DEV_DEAD_IDLE_600);
 			} else if (now.tv_sec - thr->sick.tv_sec > 60 &&
 				   (cgpu->status == LIFE_SICK || cgpu->status == LIFE_DEAD)) {
 				/* Attempt to restart a GPU that's sick or dead once every minute */

+ 52 - 2
util.c

@@ -812,7 +812,7 @@ double tdiff(struct timeval *end, struct timeval *start)
 
 bool extract_sockaddr(struct pool *pool, char *url)
 {
-	char *url_begin, *url_end, *port_start = NULL;
+	char *url_begin, *url_end, *ipv6_begin, *ipv6_end, *port_start = NULL;
 	char url_address[256], port[6];
 	int url_len, port_len = 0;
 
@@ -821,7 +821,14 @@ bool extract_sockaddr(struct pool *pool, char *url)
 		url_begin = url;
 	else
 		url_begin += 2;
-	url_end = strstr(url_begin, ":");
+
+	/* Look for numeric ipv6 entries */
+	ipv6_begin = strstr(url_begin, "[");
+	ipv6_end = strstr(url_begin, "]");
+	if (ipv6_begin && ipv6_end && ipv6_end > ipv6_begin)
+		url_end = strstr(ipv6_end, ":");
+	else
+		url_end = strstr(url_begin, ":");
 	if (url_end) {
 		url_len = url_end - url_begin;
 		port_len = strlen(url_begin) - url_len - 1;
@@ -1413,3 +1420,46 @@ out:
 
 	return ret;
 }
+
+
+void dev_error(struct cgpu_info *dev, enum dev_reason reason)
+{
+	dev->device_last_not_well = time(NULL);
+	dev->device_not_well_reason = reason;
+
+
+	switch (reason)
+	{
+		case REASON_THREAD_FAIL_INIT:
+			dev->thread_fail_init_count++;
+		break;
+		case REASON_THREAD_ZERO_HASH:
+			dev->thread_zero_hash_count++;
+		break;
+		case REASON_THREAD_FAIL_QUEUE:
+			dev->thread_fail_queue_count++;
+		break;
+		case REASON_DEV_SICK_IDLE_60:
+			dev->dev_sick_idle_60_count++;
+		break;
+		case REASON_DEV_DEAD_IDLE_600:
+			dev->dev_dead_idle_600_count++;
+		break;
+		case REASON_DEV_NOSTART:
+			dev->dev_nostart_count++;
+		break;
+		case REASON_DEV_OVER_HEAT:
+			dev->dev_over_heat_count++;
+		break;
+		case REASON_DEV_THERMAL_CUTOFF:
+			dev->dev_thermal_cutoff_count++;
+		break;
+		case REASON_DEV_COMMS_ERROR:
+			dev->dev_comms_error_count++;
+		break;
+		case REASON_DEV_THROTTLE:
+			dev->dev_throttle_count++;
+		break;
+	}
+
+}

+ 3 - 0
util.h

@@ -43,11 +43,14 @@
 #endif
 
 struct pool;
+enum dev_reason;
+struct cgpu_info;
 bool stratum_send(struct pool *pool, char *s, ssize_t len);
 char *recv_line(struct pool *pool);
 bool parse_method(struct pool *pool, char *s);
 bool extract_sockaddr(struct pool *pool, char *url);
 bool auth_stratum(struct pool *pool);
 bool initiate_stratum(struct pool *pool);
+void dev_error(struct cgpu_info *dev, enum dev_reason reason);
 
 #endif /* __UTIL_H__ */