Browse Source

Implement a utf8_decode function to produce wchar_t needed by curses

Luke Dashjr 12 years ago
parent
commit
e2baab933c
3 changed files with 104 additions and 0 deletions
  1. 1 0
      miner.c
  2. 99 0
      util.c
  3. 4 0
      util.h

+ 1 - 0
miner.c

@@ -9630,6 +9630,7 @@ int main(int argc, char *argv[])
 	}
 	
 	test_intrange();
+	utf8_test();
 
 #ifdef HAVE_CURSES
 	if (opt_realquiet || opt_display_devs)

+ 99 - 0
util.c

@@ -1346,6 +1346,105 @@ double tdiff(struct timeval *end, struct timeval *start)
 }
 
 
+/* Decode a single UTF-8 sequence starting at b.
+ *
+ * b:       first byte of the sequence. No length is passed in, so the
+ *          input is expected to end with a NUL byte; decoding never reads
+ *          beyond that terminator.
+ * out_len: receives the number of input bytes consumed (1 to 4).
+ *
+ * Returns the decoded code point. A NUL byte decodes as 0 with length 1.
+ * A malformed sequence yields REPLACEMENT_CHAR with *out_len set to 1, so
+ * the caller can resynchronise on the following byte.
+ *
+ * Without STRICT_UTF8 only the lead byte's length bits and premature
+ * termination are checked. With STRICT_UTF8, stray continuation bytes,
+ * bad continuation bytes, values above U+10FFFF, overlong encodings and
+ * UTF-16 surrogates are rejected as well.
+ */
+int32_t utf8_decode(const void *b, int *out_len)
+{
+	const unsigned char *s = b;
+	int32_t w;
+	int len;
+	
+	if (!(s[0] & 0x80))
+	{
+		// ASCII (this also covers the terminating NUL)
+		*out_len = 1;
+		return s[0];
+	}
+	
+#ifdef STRICT_UTF8
+	// 10xxxxxx is a continuation byte and cannot begin a sequence
+	if (unlikely(!(s[0] & 0x40)))
+		goto invalid;
+#endif
+	
+	// The run of leading 1 bits in the first byte selects the length
+	if (!(s[0] & 0x20))
+		len = 2;
+	else
+	if (!(s[0] & 0x10))
+		len = 3;
+	else
+	if (likely(!(s[0] & 8)))
+		len = 4;
+	else
+		goto invalid;
+	
+	// Keep only the payload bits of the lead byte (5, 4 or 3 of them)
+	w = s[0] & ((2 << (6 - len)) - 1);
+	for (int i = 1; i < len; ++i)
+	{
+		// A NUL here means the string ended mid-sequence; bail out
+		// before touching anything past the terminator
+		if (unlikely(!s[i]))
+			goto invalid;
+#ifdef STRICT_UTF8
+		if (unlikely((s[i] & 0xc0) != 0x80))
+			goto invalid;
+#endif
+		w = (w << 6) | (s[i] & 0x3f);
+	}
+	
+#if defined(STRICT_UTF8)
+	if (unlikely(w > 0x10FFFF))
+		goto invalid;
+	
+	// UTF-8 requires the shortest possible encoding for each value
+	if (unlikely(w < (len == 2 ? 0x80 : len == 3 ? 0x800 : 0x10000)))
+		goto invalid;
+	
+	// UTF-16 surrogate halves are not valid in UTF-8
+	if (unlikely(w >= 0xD800 && w <= 0xDFFF))
+		goto invalid;
+#endif
+	
+	*out_len = len;
+	return w;
+
+invalid:
+	*out_len = 1;
+	return REPLACEMENT_CHAR;
+}
+
+static
+void _utf8_test(const char *s, const wchar_t expected, int expectedlen)
+{
+	int len;
+	wchar_t r;
+	
+	r = utf8_decode(s, &len);
+	if (unlikely(r != expected || expectedlen != len))
+		applog(LOG_ERR, "UTF-8 test U+%06lX (len %d) failed: got U+%06lX (len %d)", (unsigned long)expected, expectedlen, (unsigned long)r, len);
+}
+#define _test_intrange(s, ...)  _test_intrange(s, (int[]){ __VA_ARGS__ })
+
+/* Self-test for utf8_decode: feeds it boundary values for each sequence
+ * length and lets _utf8_test log any mismatch. */
+void utf8_test()
+{
+	// One-byte sequences
+	_utf8_test("", 0, 1);
+	_utf8_test("\1", 1, 1);
+	_utf8_test("\x7f", 0x7f, 1);
+	
+	// Multi-byte cases are compiled in only when wchar_t can hold the
+	// expected value. The limits strictly increase, so independent
+	// guards select exactly the same cases as nesting them would.
+#if WCHAR_MAX >= 0x80
+	_utf8_test("\xc2\x80", 0x80, 2);
+#endif
+#if WCHAR_MAX >= 0xff
+	_utf8_test("\xc3\xbf", 0xff, 2);
+#endif
+#if WCHAR_MAX >= 0x7ff
+	_utf8_test("\xdf\xbf", 0x7ff, 2);
+#endif
+#if WCHAR_MAX >= 0x800
+	_utf8_test("\xe0\xa0\x80", 0x800, 3);
+#endif
+#if WCHAR_MAX >= 0xffff
+	_utf8_test("\xef\xbf\xbf", 0xffff, 3);
+#endif
+#if WCHAR_MAX >= 0x10000
+	_utf8_test("\xf0\x90\x80\x80", 0x10000, 4);
+#endif
+#if WCHAR_MAX >= 0x10ffff
+	_utf8_test("\xf4\x8f\xbf\xbf", 0x10ffff, 4);
+#endif
+	
+#ifdef STRICT_UTF8
+	// Bytes that can never start a sequence must yield the replacement character
+	_utf8_test("\x80", REPLACEMENT_CHAR, 1);
+	_utf8_test("\xbf", REPLACEMENT_CHAR, 1);
+	_utf8_test("\xfe", REPLACEMENT_CHAR, 1);
+	_utf8_test("\xff", REPLACEMENT_CHAR, 1);
+#endif
+}
+
+
 int format_temperature(char * const buf, const int pad, const bool highprecision, const bool unicode, const float temp)
 {
 	return

+ 4 - 0
util.h

@@ -346,6 +346,10 @@ enum h2bs_fmt {
 	H2B_SPACED,  // "xxx.x Mh/s"
 };
 
+#define REPLACEMENT_CHAR (0xFFFD)  /* value utf8_decode returns for a sequence it rejects */
+extern int32_t utf8_decode(const void *, int *out_len);  /* decodes one UTF-8 sequence; *out_len receives the bytes consumed */
+extern void utf8_test();  /* runs the utf8_decode self-checks; mismatches are logged, nothing is returned */
+
 extern char *format_unit(char *buf, bool floatprec, const char *measurement, enum h2bs_fmt fmt, float n, signed char unitin);
 extern void percentf3(char * const buf, double p, const double t);
 extern int format_temperature(char * const buf, const int pad, const bool highprecision, const bool unicode, const float temp);