Checkpoint: more UTF-8 clipboard testing.

2002-07-23 09:33:50 +00:00 · 2002-07-23 09:33:50 +00:00 · 16cc05d56b
parent fcd99c9510
commit 16cc05d56b
3 changed files with 185 additions and 79 deletions
--- a/base/CUnicode.cpp
+++ b/base/CUnicode.cpp
@ -1,4 +1,5 @@
 #include "CUnicode.h"
 #include <limits.h>
 #include <string.h>
 //
@ -39,7 +40,8 @@ decode32(const UInt8* n)
 // CUnicode
 //
-UInt32					CUnicode::s_invalid = 0x0000ffff;
+UInt32					CUnicode::s_invalid     = 0x0000ffff;
 UInt32					CUnicode::s_replacement = 0x0000fffd;
 CString
 CUnicode::UTF8ToUCS2(const CString& src)
@ -54,10 +56,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c != s_invalid && c < 0x00010000) {
+		if (c == s_invalid || c >= 0x00010000) {
-			UInt16 ucs2 = static_cast<UInt16>(c);
+			c = s_replacement;
 			dst.append(reinterpret_cast<const char*>(&ucs2), 2);
 		}
 		UInt16 ucs2 = static_cast<UInt16>(c);
 		dst.append(reinterpret_cast<const char*>(&ucs2), 2);
 	}
 	return dst;
@ -76,9 +79,10 @@ CUnicode::UTF8ToUCS4(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c != s_invalid) {
+		if (c == s_invalid) {
-			dst.append(reinterpret_cast<const char*>(&c), 4);
+			c = s_replacement;
 		}
 		dst.append(reinterpret_cast<const char*>(&c), 4);
 	}
 	return dst;
@ -97,18 +101,19 @@ CUnicode::UTF8ToUTF16(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c != s_invalid && c < 0x0010ffff) {
+		if (c == s_invalid || c >= 0x00110000) {
-			if (c < 0x00010000) {
+			c = s_replacement;
-				UInt16 ucs2 = static_cast<UInt16>(c);
+		}
-				dst.append(reinterpret_cast<const char*>(&ucs2), 2);
+		if (c < 0x00010000) {
-			}
+			UInt16 ucs2 = static_cast<UInt16>(c);
-			else {
+			dst.append(reinterpret_cast<const char*>(&ucs2), 2);
-				c -= 0x00010000;
+		}
-				UInt16 utf16h = static_cast<UInt16>(c >> 10) + 0xd800;
+		else {
-				UInt16 utf16l = (static_cast<UInt16>(c) & 0x03ff) + 0xdc00;
+			c -= 0x00010000;
-				dst.append(reinterpret_cast<const char*>(&utf16h), 2);
+			UInt16 utf16h = static_cast<UInt16>(c >> 10) + 0xd800;
-				dst.append(reinterpret_cast<const char*>(&utf16l), 2);
+			UInt16 utf16l = (static_cast<UInt16>(c) & 0x03ff) + 0xdc00;
-			}
+			dst.append(reinterpret_cast<const char*>(&utf16h), 2);
 			dst.append(reinterpret_cast<const char*>(&utf16l), 2);
 		}
 	}
@ -118,8 +123,23 @@ CUnicode::UTF8ToUTF16(const CString& src)
 CString
 CUnicode::UTF8ToUTF32(const CString& src)
 {
-	// FIXME -- should ensure dst has no characters over U-0010FFFF
+	// get size of input string and reserve some space in output.
-	return UTF8ToUCS4(src);
+	// include UTF8's nul terminator.
 	UInt32 n = src.size() + 1;
 	CString dst;
 	dst.reserve(4 * n);
 	// convert each character
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
 		if (c == s_invalid || c >= 0x00110000) {
 			c = s_replacement;
 		}
 		dst.append(reinterpret_cast<const char*>(&c), 4);
 	}
 	return dst;
 }
 CString
@ -157,24 +177,48 @@ CUnicode::UTF8ToText(const CString& src)
 	wchar_t* tmp = UTF8ToWideChar(src);
 	// get length of multibyte string
 	size_t len = 0;
 	char mbc[MB_LEN_MAX];
 	mbstate_t state;
 	memset(&state, 0, sizeof(state));
-	const wchar_t* scratch = tmp;
+	for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
-	size_t len = wcsrtombs(NULL, &scratch, 0, &state);
+		size_t mblen = wcrtomb(mbc, *scan, &state);
-	if (len == (size_t)-1) {
+		if (mblen == -1) {
-		// invalid character in src
+			// unconvertable character
-		delete[] tmp;
+			len += 1;
-		return CString();
+		}
 		else {
 			len += mblen;
 		}
 	}
 	// check if state is in initial state.  if not then count the
 	// bytes for returning it to the initial state.
 	if (mbsinit(&state) == 0) {
 		len += wcrtomb(mbc, L'\0', &state) - 1;
 	}
 	assert(mbsinit(&state) != 0);
 	// allocate multibyte string
 	char* mbs = new char[len + 1];
 	// convert to multibyte
-	scratch = tmp;
+	char* dst = mbs;
-	char* dst = new char[len + 1];
+	for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
-	wcsrtombs(dst, &scratch, len + 1, &state);
+		size_t mblen = wcrtomb(dst, *scan, &state);
-	CString text(dst);
+		if (mblen == -1) {
 			// unconvertable character
 			*dst++ = '?';
 		}
 		else {
 			dst += len;
 		}
 	}
 	*dst = '\0';
 	CString text(mbs);
 	// clean up
-	delete[] dst;
+	delete[] mbs;
 	delete[] tmp;
 	return text;
@ -297,6 +341,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 		}
 		else if (n == 1) {
 			// error -- missing second word
 			toUTF8(dst, s_replacement);
 		}
 		else if (c >= 0x0000d800 && c <= 0x0000dbff) {
 			UInt32 c2 = decode16(data);
@ -304,6 +349,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 			--n;
 			if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
 				// error -- [d800,dbff] not followed by [dc00,dfff]
 				toUTF8(dst, s_replacement);
 			}
 			else {
 				c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
@ -312,6 +358,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 		}
 		else {
 			// error -- [dc00,dfff] without leading [d800,dbff]
 			toUTF8(dst, s_replacement);
 		}
 	}
@ -326,8 +373,25 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 CString
 CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
 {
-	// FIXME -- should check that src has no characters over U-0010FFFF
+	// make some space
-	return doUCS4ToUTF8(data, n);
+	CString dst;
 	dst.reserve(n);
 	// convert each character
 	for (; n > 0; data += 4, --n) {
 		UInt32 c = decode32(data);
 		if (c >= 0x00110000) {
 			c = s_replacement;
 		}
 		toUTF8(dst, c);
 	}
 	// remove extra trailing nul
 	if (dst.size() > 0 && dst[dst.size() - 1] == '\0') {
 		dst.resize(dst.size() - 1);
 	}
 	return dst;
 }
 UInt32
@ -433,10 +497,54 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 		assert(0 && "invalid size");
 	}
 	// check that all bytes after the first have the pattern 10xxxxxx.
 	// truncated sequences are treated as a single malformed character.
 	bool truncated = false;
 	switch (size) {
 	case 6:
 		if ((data[5] & 0xc0) != 0x80) {
 			truncated = true;
 			size = 5;
 		}
 		// fall through
 	case 5:
 		if ((data[4] & 0xc0) != 0x80) {
 			truncated = true;
 			size = 4;
 		}
 		// fall through
 	case 4:
 		if ((data[3] & 0xc0) != 0x80) {
 			truncated = true;
 			size = 3;
 		}
 		// fall through
 	case 3:
 		if ((data[2] & 0xc0) != 0x80) {
 			truncated = true;
 			size = 2;
 		}
 		// fall through
 	case 2:
 		if ((data[1] & 0xc0) != 0x80) {
 			truncated = true;
 			size = 1;
 		}
 	}
 	// update parameters
 	data += size;
 	n    -= size;
 	// invalid if sequence was truncated
 	if (truncated) {
 		return s_invalid;
 	}
 	// check for characters that didn't use the smallest possible encoding
 	static UInt32 s_minChar[] = {
 		0,
@ -451,29 +559,11 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 		return s_invalid;
 	}
-	// check that all bytes after the first have the pattern 10xxxxxx.
+	// check for characters not in ISO-10646
-	UInt8 a = 0x80;
+	if (c >= 0x0000d800 && c <= 0x0000dfff) {
-	switch (size) {
+		return s_invalid;
 	case 6:
 		a |= data[5];
 		// fall through
 	case 5:
 		a |= data[4];
 		// fall through
 	case 4:
 		a |= data[3];
 		// fall through
 	case 3:
 		a |= data[2];
 		// fall through
 	case 2:
 		a |= data[1];
 	}
-	if ((a & 0xc0) != 0x80) {
+	if (c >= 0x0000fffe && c <= 0x0000ffff) {
 		return s_invalid;
 	}
@ -481,10 +571,16 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 }
 void
-CUnicode::toUTF8(CString& dst, const UInt32 c)
+CUnicode::toUTF8(CString& dst, UInt32 c)
 {
 	UInt8 data[6];
 	// handle characters outside the valid range
 	if (c >= 0x80000000) {
 		c = s_replacement;
 	}
 	// convert to UTF-8
 	if (c < 0x00000080) {
 		data[0] = static_cast<UInt8>(c);
 		dst.append(reinterpret_cast<char*>(data), 1);
@ -525,6 +621,6 @@ CUnicode::toUTF8(CString& dst, const UInt32 c)
 		dst.append(reinterpret_cast<char*>(data), 6);
 	}
 	else {
-		// invalid character
+		assert(0 && "character out of range");
 	}
 }
--- a/base/CUnicode.h
+++ b/base/CUnicode.h
@ -39,10 +39,11 @@ private:
 	// convert characters to/from UTF8
 	static UInt32		fromUTF8(const UInt8*& src, UInt32& size);
-	static void			toUTF8(CString& dst, const UInt32 c);
+	static void			toUTF8(CString& dst, UInt32 c);
 private:
 	static UInt32		s_invalid;
 	static UInt32		s_replacement;
 };
 #endif
--- a/platform/CXWindowsClipboard.cpp
+++ b/platform/CXWindowsClipboard.cpp
@ -161,11 +161,16 @@ log((CLOG_INFO "found converter"));
 log((CLOG_INFO "clipboard format: %d", clipboardFormat));
 			if (m_added[clipboardFormat]) {
 log((CLOG_INFO "added"));
-				type   = converter->getAtom();
+				try {
-				format = converter->getDataSize();
+					data   = converter->fromIClipboard(m_data[clipboardFormat]);
-				data   = converter->fromIClipboard(m_data[clipboardFormat]);
+					format = converter->getDataSize();
 					type   = converter->getAtom();
 log((CLOG_INFO "  src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
 log((CLOG_INFO "  dst: (%d) %s", data.size(), data.c_str()));
 				}
 				catch (...) {
 					// ignore -- cannot convert
 				}
 			}
 		}
 	}
@ -529,15 +534,17 @@ CXWindowsClipboard::icccmFillCache()
 		}
 		// add to clipboard and note we've done it
-		m_data[converter->getFormat()]  = converter->toIClipboard(targetData);
+		IClipboard::EFormat format = converter->getFormat();
-		m_added[converter->getFormat()] = true;
+		try {
-// XXX
+			m_data[format] = converter->toIClipboard(targetData);
-char* name = XGetAtomName(m_display, target);
+			if (!m_data[format].empty()) {
-log((CLOG_INFO "src atom: %d %s", target, name));
+				m_added[format] = true;
-XFree(name);
+				log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
-log((CLOG_INFO "src data size: %d", targetData.size()));
+			}
-log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
+		}
-		log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
+		catch (...) {
 			// ignore -- could not convert data
 		}
 	}
 }
@ -791,15 +798,17 @@ CXWindowsClipboard::motifFillCache()
 		targetData.erase(length);
 		// add to clipboard and note we've done it
-		m_data[converter->getFormat()]  = converter->toIClipboard(targetData);
+		IClipboard::EFormat format = converter->getFormat();
-		m_added[converter->getFormat()] = true;
+		try {
-// XXX
+			m_data[format] = converter->toIClipboard(targetData);
-char* name = XGetAtomName(m_display, target);
+			if (!m_data[format].empty()) {
-log((CLOG_INFO "src atom: %d %s", target, name));
+				m_added[format] = true;
-XFree(name);
+				log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
-log((CLOG_INFO "src data size: %d", targetData.size()));
+			}
-log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
+		}
-		log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
+		catch (...) {
 			// ignore -- could not convert data
 		}
 	}
 }