From 16cc05d56bd9d75b857fe2c952b8381ce74d8913 Mon Sep 17 00:00:00 2001
From: crs <cschoeneman@gmail.com>
Date: Tue, 23 Jul 2002 09:33:50 +0000
Subject: [PATCH] checkpoint.  more UTF8 clipboard testing.

---
 base/CUnicode.cpp               | 210 +++++++++++++++++++++++---------
 base/CUnicode.h                 |   3 +-
 platform/CXWindowsClipboard.cpp |  51 ++++----
 3 files changed, 185 insertions(+), 79 deletions(-)
diff --git a/base/CUnicode.cpp b/base/CUnicode.cpp
index 8925752d..ccabd227 100644
--- a/base/CUnicode.cpp
+++ b/base/CUnicode.cpp
@@ -1,4 +1,5 @@
 #include "CUnicode.h"
+#include <limits.h>
 #include <string.h>
 
 //
@@ -39,7 +40,8 @@ decode32(const UInt8* n)
 // CUnicode
 //
 
-UInt32					CUnicode::s_invalid = 0x0000ffff;
+UInt32					CUnicode::s_invalid     = 0x0000ffff;
+UInt32					CUnicode::s_replacement = 0x0000fffd;
 
 CString
 CUnicode::UTF8ToUCS2(const CString& src)
@@ -54,10 +56,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c != s_invalid && c < 0x00010000) {
-			UInt16 ucs2 = static_cast<UInt16>(c);
-			dst.append(reinterpret_cast<const char*>(&ucs2), 2);
+		if (c == s_invalid || c >= 0x00010000) {
+			c = s_replacement;
 		}
+		UInt16 ucs2 = static_cast<UInt16>(c);
+		dst.append(reinterpret_cast<const char*>(&ucs2), 2);
 	}
 
 	return dst;
@@ -76,9 +79,10 @@ CUnicode::UTF8ToUCS4(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c != s_invalid) {
-			dst.append(reinterpret_cast<const char*>(&c), 4);
+		if (c == s_invalid) {
+			c = s_replacement;
 		}
+		dst.append(reinterpret_cast<const char*>(&c), 4);
 	}
 
 	return dst;
@@ -97,18 +101,19 @@ CUnicode::UTF8ToUTF16(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c != s_invalid && c < 0x0010ffff) {
-			if (c < 0x00010000) {
-				UInt16 ucs2 = static_cast<UInt16>(c);
-				dst.append(reinterpret_cast<const char*>(&ucs2), 2);
-			}
-			else {
-				c -= 0x00010000;
-				UInt16 utf16h = static_cast<UInt16>(c >> 10) + 0xd800;
-				UInt16 utf16l = (static_cast<UInt16>(c) & 0x03ff) + 0xdc00;
-				dst.append(reinterpret_cast<const char*>(&utf16h), 2);
-				dst.append(reinterpret_cast<const char*>(&utf16l), 2);
-			}
+		if (c == s_invalid || c >= 0x00110000) {
+			c = s_replacement;
+		}
+		if (c < 0x00010000) {
+			UInt16 ucs2 = static_cast<UInt16>(c);
+			dst.append(reinterpret_cast<const char*>(&ucs2), 2);
+		}
+		else {
+			c -= 0x00010000;
+			UInt16 utf16h = static_cast<UInt16>(c >> 10) + 0xd800;
+			UInt16 utf16l = (static_cast<UInt16>(c) & 0x03ff) + 0xdc00;
+			dst.append(reinterpret_cast<const char*>(&utf16h), 2);
+			dst.append(reinterpret_cast<const char*>(&utf16l), 2);
 		}
 	}
 
@@ -118,8 +123,23 @@ CUnicode::UTF8ToUTF16(const CString& src)
 CString
 CUnicode::UTF8ToUTF32(const CString& src)
 {
-	// FIXME -- should ensure dst has no characters over U-0010FFFF
-	return UTF8ToUCS4(src);
+	// count the input bytes and reserve four output bytes for each.
+	// the count includes the nul terminator supplied by c_str().
+	UInt32 n = src.size() + 1;
+	CString dst;
+	dst.reserve(4 * n);
+
+	// decode each character; invalid or >= 0x00110000 becomes s_replacement
+	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+	while (n > 0) {
+		UInt32 c = fromUTF8(data, n);
+		if (c == s_invalid || c >= 0x00110000) {
+			c = s_replacement;
+		}
+		dst.append(reinterpret_cast<const char*>(&c), 4);	// the 4 bytes of c as stored in memory
+	}
+
+	return dst;
 }
 
 CString
@@ -157,24 +177,48 @@ CUnicode::UTF8ToText(const CString& src)
 	wchar_t* tmp = UTF8ToWideChar(src);
 
 	// get length of multibyte string
+	size_t len = 0;
+	char mbc[MB_LEN_MAX];
 	mbstate_t state;
 	memset(&state, 0, sizeof(state));
-	const wchar_t* scratch = tmp;
-	size_t len = wcsrtombs(NULL, &scratch, 0, &state);
-	if (len == (size_t)-1) {
-		// invalid character in src
-		delete[] tmp;
-		return CString();
+	for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
+		size_t mblen = wcrtomb(mbc, *scan, &state);
+		if (mblen == (size_t)-1) {
+			// unconvertible character: budget one byte for the '?' placeholder
+			len += 1;
+		}
+		else {
+			len += mblen;
+		}
 	}
 
+	// check if state is in initial state.  if not then count the
+	// bytes for returning it to the initial state.
+	if (mbsinit(&state) == 0) {
+		len += wcrtomb(mbc, L'\0', &state) - 1;
+	}
+	assert(mbsinit(&state) != 0);
+
+	// allocate multibyte string
+	char* mbs = new char[len + 1];
+
 	// convert to multibyte
-	scratch = tmp;
-	char* dst = new char[len + 1];
-	wcsrtombs(dst, &scratch, len + 1, &state);
-	CString text(dst);
+	char* dst = mbs;
+	for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
+		size_t mblen = wcrtomb(dst, *scan, &state);
+		if (mblen == (size_t)-1) {
+			// unconvertible character: substitute a single '?'
+			*dst++ = '?';
+		}
+		else {
+			dst += mblen;	// advance by the bytes just written, not the total length
+		}
+	}
+	*dst = '\0';
+	CString text(mbs);
 
 	// clean up
-	delete[] dst;
+	delete[] mbs;
 	delete[] tmp;
 
 	return text;
@@ -297,6 +341,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 		}
 		else if (n == 1) {
 			// error -- missing second word
+			toUTF8(dst, s_replacement);
 		}
 		else if (c >= 0x0000d800 && c <= 0x0000dbff) {
 			UInt32 c2 = decode16(data);
@@ -304,6 +349,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 			--n;
 			if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
 				// error -- [d800,dbff] not followed by [dc00,dfff]
+				toUTF8(dst, s_replacement);
 			}
 			else {
 				c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
@@ -312,6 +358,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 		}
 		else {
 			// error -- [dc00,dfff] without leading [d800,dbff]
+			toUTF8(dst, s_replacement);
 		}
 	}
 
@@ -326,8 +373,25 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 CString
 CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
 {
-	// FIXME -- should check that src has no characters over U-0010FFFF
-	return doUCS4ToUTF8(data, n);
+	// reserve n bytes up front (n counts 4-byte input characters)
+	CString dst;
+	dst.reserve(n);
+
+	// convert each character; values >= 0x00110000 become s_replacement
+	for (; n > 0; data += 4, --n) {
+		UInt32 c = decode32(data);
+		if (c >= 0x00110000) {
+			c = s_replacement;
+		}
+		toUTF8(dst, c);
+	}
+
+	// drop one trailing nul byte from the result, if present
+	if (dst.size() > 0 && dst[dst.size() - 1] == '\0') {
+		dst.resize(dst.size() - 1);
+	}
+
+	return dst;
 }
 
 UInt32
@@ -433,10 +497,54 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 		assert(0 && "invalid size");
 	}
 
+	// check that all bytes after the first have the pattern 10xxxxxx.
+	// truncated sequences are treated as a single malformed character.
+	bool truncated = false;
+	switch (size) {
+	case 6:
+		if ((data[5] & 0xc0) != 0x80) {
+			truncated = true;
+			size = 5;
+		}
+		// fall through
+
+	case 5:
+		if ((data[4] & 0xc0) != 0x80) {
+			truncated = true;
+			size = 4;
+		}
+		// fall through
+
+	case 4:
+		if ((data[3] & 0xc0) != 0x80) {
+			truncated = true;
+			size = 3;
+		}
+		// fall through
+
+	case 3:
+		if ((data[2] & 0xc0) != 0x80) {
+			truncated = true;
+			size = 2;
+		}
+		// fall through
+
+	case 2:
+		if ((data[1] & 0xc0) != 0x80) {
+			truncated = true;
+			size = 1;
+		}
+	}
+
 	// update parameters
 	data += size;
 	n    -= size;
 
+	// invalid if sequence was truncated
+	if (truncated) {
+		return s_invalid;
+	}
+
 	// check for characters that didn't use the smallest possible encoding
 	static UInt32 s_minChar[] = {
 		0,
@@ -451,29 +559,11 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 		return s_invalid;
 	}
 
-	// check that all bytes after the first have the pattern 10xxxxxx.
-	UInt8 a = 0x80;
-	switch (size) {
-	case 6:
-		a |= data[5];
-		// fall through
-
-	case 5:
-		a |= data[4];
-		// fall through
-
-	case 4:
-		a |= data[3];
-		// fall through
-
-	case 3:
-		a |= data[2];
-		// fall through
-
-	case 2:
-		a |= data[1];
+	// check for characters not in ISO-10646
+	if (c >= 0x0000d800 && c <= 0x0000dfff) {
+		return s_invalid;
 	}
-	if ((a & 0xc0) != 0x80) {
+	if (c >= 0x0000fffe && c <= 0x0000ffff) {
 		return s_invalid;
 	}
 
@@ -481,10 +571,16 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 }
 
 void
-CUnicode::toUTF8(CString& dst, const UInt32 c)
+CUnicode::toUTF8(CString& dst, UInt32 c)
 {
 	UInt8 data[6];
 
+	// handle characters outside the valid range
+	if (c >= 0x80000000) {
+		c = s_replacement;
+	}
+
+	// convert to UTF-8
 	if (c < 0x00000080) {
 		data[0] = static_cast<UInt8>(c);
 		dst.append(reinterpret_cast<char*>(data), 1);
@@ -525,6 +621,6 @@ CUnicode::toUTF8(CString& dst, const UInt32 c)
 		dst.append(reinterpret_cast<char*>(data), 6);
 	}
 	else {
-		// invalid character
+		assert(0 && "character out of range");
 	}
 }
diff --git a/base/CUnicode.h b/base/CUnicode.h
index 4bcabe60..81ff95d7 100644
--- a/base/CUnicode.h
+++ b/base/CUnicode.h
@@ -39,10 +39,11 @@ private:
 
 	// convert characters to/from UTF8
 	static UInt32		fromUTF8(const UInt8*& src, UInt32& size);
-	static void			toUTF8(CString& dst, const UInt32 c);
+	static void			toUTF8(CString& dst, UInt32 c);
 
 private:
 	static UInt32		s_invalid;
+	static UInt32		s_replacement;
 };
 
 #endif
diff --git a/platform/CXWindowsClipboard.cpp b/platform/CXWindowsClipboard.cpp
index e1f19eff..76c766bf 100644
--- a/platform/CXWindowsClipboard.cpp
+++ b/platform/CXWindowsClipboard.cpp
@@ -161,11 +161,16 @@ log((CLOG_INFO "found converter"));
 log((CLOG_INFO "clipboard format: %d", clipboardFormat));
 			if (m_added[clipboardFormat]) {
 log((CLOG_INFO "added"));
-				type   = converter->getAtom();
-				format = converter->getDataSize();
-				data   = converter->fromIClipboard(m_data[clipboardFormat]);
+				try {
+					data   = converter->fromIClipboard(m_data[clipboardFormat]);
+					format = converter->getDataSize();
+					type   = converter->getAtom();
 log((CLOG_INFO "  src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
 log((CLOG_INFO "  dst: (%d) %s", data.size(), data.c_str()));
+				}
+				catch (...) {
+					// ignore -- cannot convert
+				}
 			}
 		}
 	}
@@ -529,15 +534,17 @@ CXWindowsClipboard::icccmFillCache()
 		}
 
 		// add to clipboard and note we've done it
-		m_data[converter->getFormat()]  = converter->toIClipboard(targetData);
-		m_added[converter->getFormat()] = true;
-// XXX
-char* name = XGetAtomName(m_display, target);
-log((CLOG_INFO "src atom: %d %s", target, name));
-XFree(name);
-log((CLOG_INFO "src data size: %d", targetData.size()));
-log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
-		log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
+		IClipboard::EFormat format = converter->getFormat();
+		try {
+			m_data[format] = converter->toIClipboard(targetData);
+			if (!m_data[format].empty()) {
+				m_added[format] = true;
+				log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
+			}
+		}
+		catch (...) {
+			// ignore -- could not convert data
+		}
 	}
 }
 
@@ -791,15 +798,17 @@ CXWindowsClipboard::motifFillCache()
 		targetData.erase(length);
 
 		// add to clipboard and note we've done it
-		m_data[converter->getFormat()]  = converter->toIClipboard(targetData);
-		m_added[converter->getFormat()] = true;
-// XXX
-char* name = XGetAtomName(m_display, target);
-log((CLOG_INFO "src atom: %d %s", target, name));
-XFree(name);
-log((CLOG_INFO "src data size: %d", targetData.size()));
-log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
-		log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
+		IClipboard::EFormat format = converter->getFormat();
+		try {
+			m_data[format] = converter->toIClipboard(targetData);
+			if (!m_data[format].empty()) {
+				m_added[format] = true;
+				log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
+			}
+		}
+		catch (...) {
+			// ignore -- could not convert data
+		}
 	}
 }