checkpoint. more UTF8 clipboard stuff.

2002-07-23 11:36:18 +00:00 · 2002-07-23 11:36:18 +00:00 · 1f5cb6a96f
parent 16cc05d56b
commit 1f5cb6a96f
4 changed files with 249 additions and 114 deletions
--- a/base/CUnicode.cpp
+++ b/base/CUnicode.cpp
@ -36,6 +36,26 @@ decode32(const UInt8* n)
 	return c.n32;
 }

+inline
+static
+void
+resetError(bool* errors)
+{
+	if (errors != NULL) {
+		*errors = false;
+	}
+}
+
+inline
+static
+void
+setError(bool* errors)
+{
+	if (errors != NULL) {
+		*errors = true;
+	}
+}
+
 //
 // CUnicode
 //
@ -43,9 +63,25 @@ decode32(const UInt8* n)
 UInt32					CUnicode::s_invalid     = 0x0000ffff;
 UInt32					CUnicode::s_replacement = 0x0000fffd;

-CString
-CUnicode::UTF8ToUCS2(const CString& src)
+bool
+CUnicode::isUTF8(const CString& src)
 {
+	// convert and test each character
+	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
+	for (UInt32 n = src.size(); n > 0; ) {
+		if (fromUTF8(data, n) == s_invalid) {
+			return false;
+		}
+	}
+	return true;
+}
+
+CString
+CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
+{
+	// default to success
+	resetError(errors);
+
 	// get size of input string and reserve some space in output.
 	// include UTF8's nul terminator.
 	UInt32 n = src.size() + 1;
@ -56,7 +92,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c == s_invalid || c >= 0x00010000) {
+		if (c == s_invalid) {
+			c = s_replacement;
+		}
+		else if (c >= 0x00010000) {
+			setError(errors);
 			c = s_replacement;
 		}
 		UInt16 ucs2 = static_cast<UInt16>(c);
@ -67,8 +107,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
 }

 CString
-CUnicode::UTF8ToUCS4(const CString& src)
+CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// get size of input string and reserve some space in output.
 	// include UTF8's nul terminator.
 	UInt32 n = src.size() + 1;
@ -89,8 +132,11 @@ CUnicode::UTF8ToUCS4(const CString& src)
 }

 CString
-CUnicode::UTF8ToUTF16(const CString& src)
+CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// get size of input string and reserve some space in output.
 	// include UTF8's nul terminator.
 	UInt32 n = src.size() + 1;
@ -101,7 +147,11 @@ CUnicode::UTF8ToUTF16(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c == s_invalid || c >= 0x00110000) {
+		if (c == s_invalid) {
+			c = s_replacement;
+		}
+		else if (c >= 0x00110000) {
+			setError(errors);
 			c = s_replacement;
 		}
 		if (c < 0x00010000) {
@ -121,8 +171,11 @@ CUnicode::UTF8ToUTF16(const CString& src)
 }

 CString
-CUnicode::UTF8ToUTF32(const CString& src)
+CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// get size of input string and reserve some space in output.
 	// include UTF8's nul terminator.
 	UInt32 n = src.size() + 1;
@ -133,7 +186,11 @@ CUnicode::UTF8ToUTF32(const CString& src)
 	const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
 	while (n > 0) {
 		UInt32 c = fromUTF8(data, n);
-		if (c == s_invalid || c >= 0x00110000) {
+		if (c == s_invalid) {
+			c = s_replacement;
+		}
+		else if (c >= 0x00110000) {
+			setError(errors);
 			c = s_replacement;
 		}
 		dst.append(reinterpret_cast<const char*>(&c), 4);
@ -143,38 +200,13 @@ CUnicode::UTF8ToUTF32(const CString& src)
 }

 CString
-CUnicode::UCS2ToUTF8(const CString& src)
+CUnicode::UTF8ToText(const CString& src, bool* errors)
 {
-	UInt32 n = src.size() >> 1;
-	return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
-}
+	// default to success
+	resetError(errors);

-CString
-CUnicode::UCS4ToUTF8(const CString& src)
-{
-	UInt32 n = src.size() >> 2;
-	return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
-}
-
-CString
-CUnicode::UTF16ToUTF8(const CString& src)
-{
-	UInt32 n = src.size() >> 1;
-	return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
-}
-
-CString
-CUnicode::UTF32ToUTF8(const CString& src)
-{
-	UInt32 n = src.size() >> 2;
-	return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
-}
-
-CString
-CUnicode::UTF8ToText(const CString& src)
-{
 	// convert to wide char
-	wchar_t* tmp = UTF8ToWideChar(src);
+	wchar_t* tmp = UTF8ToWideChar(src, errors);

 	// get length of multibyte string
 	size_t len = 0;
@ -185,6 +217,7 @@ CUnicode::UTF8ToText(const CString& src)
 		size_t mblen = wcrtomb(mbc, *scan, &state);
 		if (mblen == -1) {
 			// unconvertable character
+			setError(errors);
 			len += 1;
 		}
 		else {
@ -225,34 +258,116 @@ CUnicode::UTF8ToText(const CString& src)
 }

 CString
-CUnicode::textToUTF8(const CString& src)
+CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
 {
-	// get length of wide char string
-	mbstate_t state;
-	memset(&state, 0, sizeof(state));
-	const char* scratch = src.c_str();
-	size_t len = mbsrtowcs(NULL, &scratch, 0, &state);
-	if (len == (size_t)-1) {
-		// invalid character in src
-		return CString();
+	UInt32 n = src.size() >> 1;
+	return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
 }

+CString
+CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
+{
+	UInt32 n = src.size() >> 2;
+	return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+CString
+CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
+{
+	UInt32 n = src.size() >> 1;
+	return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+CString
+CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
+{
+	UInt32 n = src.size() >> 2;
+	return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n, errors);
+}
+
+CString
+CUnicode::textToUTF8(const CString& src, bool* errors)
+{
+	// default to success
+	resetError(errors);
+
+	// get length of multibyte string
+	UInt32 n   = src.size();
+	size_t len = 0;
+	mbstate_t state;
+	memset(&state, 0, sizeof(state));
+	for (const char* scan = src.c_str(); n > 0 && *scan != 0; ) {
+		size_t mblen = mbrtowc(NULL, scan, n, &state);
+		switch (mblen) {
+		case (size_t)2:
+			// incomplete last character.  convert to unknown character.
+			setError(errors);
+			len += 1;
+			n    = 0;
+			break;
+
+		case (size_t)1:
+			// invalid character.  count one unknown character and
+			// start at the next byte.
+			setError(errors);
+			len  += 1;
+			scan += 1;
+			n    -= 1;
+			break;
+
+		default:
+			// normal character
+			len  += 1;
+			scan += mblen;
+			n    -= mblen;
+			break;
+		}
+	}
+	memset(&state, 0, sizeof(state));
+
+	// allocate wide character string
+	wchar_t* wcs = new wchar_t[len + 1];
+
 	// convert multibyte to wide char
-	scratch = src.c_str();
-	wchar_t* dst = new wchar_t[len + 1];
-	mbsrtowcs(dst, &scratch, len + 1, &state);
+	n = src.size();
+	wchar_t* dst = wcs;
+	for (const char* scan = src.c_str(); n > 0 && *scan != 0; ++dst) {
+		size_t mblen = mbrtowc(dst, scan, n, &state);
+		switch (mblen) {
+		case (size_t)2:
+			// incomplete character.  convert to unknown character.
+			*dst = (wchar_t)0xfffd;
+			n    = 0;
+			break;
+
+		case (size_t)1:
+			// invalid character.  count one unknown character and
+			// start at the next byte.
+			scan += 1;
+			n    -= 1;
+			*dst = (wchar_t)0xfffd;
+			break;
+
+		default:
+			// normal character
+			scan += mblen;
+			n    -= mblen;
+			break;
+		}
+	}
+	*dst = L'\0';

 	// convert to UTF8
-	CString utf8 = wideCharToUTF8(dst);
+	CString utf8 = wideCharToUTF8(wcs, errors);

 	// clean up
-	delete[] dst;
+	delete[] wcs;

 	return utf8;
 }

 wchar_t*
-CUnicode::UTF8ToWideChar(const CString& src)
+CUnicode::UTF8ToWideChar(const CString& src, bool* errors)
 {
 	// convert to platform's wide character encoding.
 	// note -- this must include a wide nul character (independent of
@ -272,21 +387,26 @@ CUnicode::UTF8ToWideChar(const CString& src)
 }

 CString
-CUnicode::wideCharToUTF8(const wchar_t* src)
+CUnicode::wideCharToUTF8(const wchar_t* src, bool* errors)
 {
 	// convert from platform's wide character encoding.
 	// note -- this must include a wide nul character (independent of
 	// the CString's nul character).
 #if WINDOWS_LIKE
-	return doUCS16ToUTF8(reinterpret_cast<const UInt8*>(src), wcslen(src));
+	return doUCS16ToUTF8(reinterpret_cast<const UInt8*>(src),
+								wcslen(src), errors);
 #elif UNIX_LIKE
-	return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src), wcslen(src));
+	return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src),
+								wcslen(src), errors);
 #endif
 }

 CString
-CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n)
+CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// make some space
 	CString dst;
 	dst.reserve(n);
@ -294,7 +414,7 @@ CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n)
 	// convert each character
 	for (; n > 0; data += 2, --n) {
 		UInt32 c = decode16(data);
-		toUTF8(dst, c);
+		toUTF8(dst, c, errors);
 	}

 	// remove extra trailing nul
@ -306,8 +426,11 @@ CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n)
 }

 CString
-CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n)
+CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// make some space
 	CString dst;
 	dst.reserve(n);
@ -315,7 +438,7 @@ CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n)
 	// convert each character
 	for (; n > 0; data += 4, --n) {
 		UInt32 c = decode32(data);
-		toUTF8(dst, c);
+		toUTF8(dst, c, errors);
 	}

 	// remove extra trailing nul
@ -327,8 +450,11 @@ CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n)
 }

 CString
-CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
+CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// make some space
 	CString dst;
 	dst.reserve(n);
@ -337,11 +463,12 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 	for (; n > 0; data += 2, --n) {
 		UInt32 c = decode16(data);
 		if (c < 0x0000d800 || c > 0x0000dfff) {
-			toUTF8(dst, c);
+			toUTF8(dst, c, errors);
 		}
 		else if (n == 1) {
 			// error -- missing second word
-			toUTF8(dst, s_replacement);
+			setError(errors);
+			toUTF8(dst, s_replacement, NULL);
 		}
 		else if (c >= 0x0000d800 && c <= 0x0000dbff) {
 			UInt32 c2 = decode16(data);
@ -349,16 +476,18 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 			--n;
 			if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
 				// error -- [d800,dbff] not followed by [dc00,dfff]
-				toUTF8(dst, s_replacement);
+				setError(errors);
+				toUTF8(dst, s_replacement, NULL);
 			}
 			else {
 				c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
-				toUTF8(dst, c);
+				toUTF8(dst, c, errors);
 			}
 		}
 		else {
 			// error -- [dc00,dfff] without leading [d800,dbff]
-			toUTF8(dst, s_replacement);
+			setError(errors);
+			toUTF8(dst, s_replacement, NULL);
 		}
 	}

@ -371,8 +500,11 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
 }

 CString
-CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
+CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
 {
+	// default to success
+	resetError(errors);
+
 	// make some space
 	CString dst;
 	dst.reserve(n);
@ -381,9 +513,10 @@ CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
 	for (; n > 0; data += 4, --n) {
 		UInt32 c = decode32(data);
 		if (c >= 0x00110000) {
+			setError(errors);
 			c = s_replacement;
 		}
-		toUTF8(dst, c);
+		toUTF8(dst, c, errors);
 	}

 	// remove extra trailing nul
@ -571,12 +704,13 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
 }

 void
-CUnicode::toUTF8(CString& dst, UInt32 c)
+CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
 {
 	UInt8 data[6];

 	// handle characters outside the valid range
-	if (c >= 0x80000000) {
+	if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
+		setError(errors);
 		c = s_replacement;
 	}

--- a/base/CUnicode.h
+++ b/base/CUnicode.h
@ -7,39 +7,50 @@

 class CUnicode {
 public:
-	static CString		UTF8ToUCS2(const CString&);
-	static CString		UTF8ToUCS4(const CString&);
-	static CString		UTF8ToUTF16(const CString&);
-	static CString		UTF8ToUTF32(const CString&);
+	// returns true iff the string contains a valid sequence of UTF-8
+	// encoded characters.
+	static bool			isUTF8(const CString&);

-	static CString		UCS2ToUTF8(const CString&);
-	static CString		UCS4ToUTF8(const CString&);
-	static CString		UTF16ToUTF8(const CString&);
-	static CString		UTF32ToUTF8(const CString&);
+	// convert from UTF-8 encoding to other encodings.  if errors is
+	// not NULL then it gets true if any characters could not be
+	// encoded in the target encoding and false otherwise.  note
+	// that decoding errors do not set errors to error.  UTF8ToText()
+	// converts to the current locale's (multibyte) encoding.
+	static CString		UTF8ToUCS2(const CString&, bool* errors = NULL);
+	static CString		UTF8ToUCS4(const CString&, bool* errors = NULL);
+	static CString		UTF8ToUTF16(const CString&, bool* errors = NULL);
+	static CString		UTF8ToUTF32(const CString&, bool* errors = NULL);
+	static CString		UTF8ToText(const CString&, bool* errors = NULL);

-	// convert UTF-8 to/from the current locale's encoding
-	static CString		UTF8ToText(const CString&);
-	static CString		textToUTF8(const CString&);
+	// convert from some encoding to UTF-8.  if errors is not NULL
+	// then it gets true if any characters could not be decoded and
+	// false otherwise.  textToUTF8() converts from the current
+	// locale's (multibyte) encoding.
+	static CString		UCS2ToUTF8(const CString&, bool* errors = NULL);
+	static CString		UCS4ToUTF8(const CString&, bool* errors = NULL);
+	static CString		UTF16ToUTF8(const CString&, bool* errors = NULL);
+	static CString		UTF32ToUTF8(const CString&, bool* errors = NULL);
+	static CString		textToUTF8(const CString&, bool* errors = NULL);

 private:
 	// convert UTF8 to nul terminated wchar_t string (using whatever
 	// encoding is native to the platform).  caller must delete[]
 	// the returned string.
-	static wchar_t*		UTF8ToWideChar(const CString&);
+	static wchar_t*		UTF8ToWideChar(const CString&, bool* errors);

 	// convert nul terminated wchar_t string (in platform's native
 	// encoding) to UTF8.
-	static CString		wideCharToUTF8(const wchar_t*);
+	static CString		wideCharToUTF8(const wchar_t*, bool* errors);

 	// internal conversion to UTF8
-	static CString		doUCS2ToUTF8(const UInt8* src, UInt32 n);
-	static CString		doUCS4ToUTF8(const UInt8* src, UInt32 n);
-	static CString		doUTF16ToUTF8(const UInt8* src, UInt32 n);
-	static CString		doUTF32ToUTF8(const UInt8* src, UInt32 n);
+	static CString		doUCS2ToUTF8(const UInt8* src, UInt32 n, bool* errors);
+	static CString		doUCS4ToUTF8(const UInt8* src, UInt32 n, bool* errors);
+	static CString		doUTF16ToUTF8(const UInt8* src, UInt32 n, bool* errors);
+	static CString		doUTF32ToUTF8(const UInt8* src, UInt32 n, bool* errors);

 	// convert characters to/from UTF8
 	static UInt32		fromUTF8(const UInt8*& src, UInt32& size);
-	static void			toUTF8(CString& dst, UInt32 c);
+	static void			toUTF8(CString& dst, UInt32 c, bool* errors);

 private:
 	static UInt32		s_invalid;
--- a/platform/CXWindowsClipboard.cpp
+++ b/platform/CXWindowsClipboard.cpp
@ -151,22 +151,14 @@ CXWindowsClipboard::addSimpleRequest(Window requestor,
 		type = getTimestampData(data, &format);
 	}
 	else {
-char* name = XGetAtomName(m_display, target);
-log((CLOG_INFO "request target: %d %s", target, name));
-XFree(name);
 		IXWindowsClipboardConverter* converter = getConverter(target);
 		if (converter != NULL) {
-log((CLOG_INFO "found converter"));
 			IClipboard::EFormat clipboardFormat = converter->getFormat();
-log((CLOG_INFO "clipboard format: %d", clipboardFormat));
 			if (m_added[clipboardFormat]) {
-log((CLOG_INFO "added"));
 				try {
 					data   = converter->fromIClipboard(m_data[clipboardFormat]);
 					format = converter->getDataSize();
 					type   = converter->getAtom();
-log((CLOG_INFO "  src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
-log((CLOG_INFO "  dst: (%d) %s", data.size(), data.c_str()));
 				}
 				catch (...) {
 					// ignore -- cannot convert
@ -535,18 +527,11 @@ CXWindowsClipboard::icccmFillCache()

 		// add to clipboard and note we've done it
 		IClipboard::EFormat format = converter->getFormat();
-		try {
 		m_data[format]  = converter->toIClipboard(targetData);
-			if (!m_data[format].empty()) {
 		m_added[format] = true;
 		log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
 	}
 }
-		catch (...) {
-			// ignore -- could not convert data
-		}
-	}
-}

 bool
 CXWindowsClipboard::icccmGetSelection(Atom target,
@ -799,18 +784,11 @@ CXWindowsClipboard::motifFillCache()

 		// add to clipboard and note we've done it
 		IClipboard::EFormat format = converter->getFormat();
-		try {
 		m_data[format]  = converter->toIClipboard(targetData);
-			if (!m_data[format].empty()) {
 		m_added[format] = true;
 		log((CLOG_DEBUG "  added format %d for target %d", converter->getFormat(), target));
 	}
 }
-		catch (...) {
-			// ignore -- could not convert data
-		}
-	}
-}

 IClipboard::Time
 CXWindowsClipboard::motifGetTime() const
--- a/platform/CXWindowsClipboardTextConverter.cpp
+++ b/platform/CXWindowsClipboardTextConverter.cpp
@ -44,5 +44,17 @@ CXWindowsClipboardTextConverter::fromIClipboard(const CString& data) const
 CString
 CXWindowsClipboardTextConverter::toIClipboard(const CString& data) const
 {
-	return CUnicode::textToUTF8(data);
+	// convert to UTF-8
+	bool errors;
+	CString utf8 = CUnicode::textToUTF8(data, &errors);
+
+	// if there were decoding errors then, to support old applications
+	// that don't understand UTF-8 but can report the exact binary
+	// UTF-8 representation, see if the data appears to be UTF-8.  if
+	// so then use it as is.
+	if (errors && CUnicode::isUTF8(data)) {
+		return data;
+	}
+
+	return utf8;
 }