From 1f5cb6a96faa685a26307130f7df694c524a8ccb Mon Sep 17 00:00:00 2001 From: crs Date: Tue, 23 Jul 2002 11:36:18 +0000 Subject: [PATCH] checkpoint. more UTF8 clipboard stuff. --- base/CUnicode.cpp | 268 ++++++++++++++----- base/CUnicode.h | 47 ++-- platform/CXWindowsClipboard.cpp | 34 +-- platform/CXWindowsClipboardTextConverter.cpp | 14 +- 4 files changed, 249 insertions(+), 114 deletions(-) diff --git a/base/CUnicode.cpp b/base/CUnicode.cpp index ccabd227..45a3c7e8 100644 --- a/base/CUnicode.cpp +++ b/base/CUnicode.cpp @@ -36,6 +36,26 @@ decode32(const UInt8* n) return c.n32; } +inline +static +void +resetError(bool* errors) +{ + if (errors != NULL) { + *errors = false; + } +} + +inline +static +void +setError(bool* errors) +{ + if (errors != NULL) { + *errors = true; + } +} + // // CUnicode // @@ -43,9 +63,25 @@ decode32(const UInt8* n) UInt32 CUnicode::s_invalid = 0x0000ffff; UInt32 CUnicode::s_replacement = 0x0000fffd; -CString -CUnicode::UTF8ToUCS2(const CString& src) +bool +CUnicode::isUTF8(const CString& src) { + // convert and test each character + const UInt8* data = reinterpret_cast(src.c_str()); + for (UInt32 n = src.size(); n > 0; ) { + if (fromUTF8(data, n) == s_invalid) { + return false; + } + } + return true; +} + +CString +CUnicode::UTF8ToUCS2(const CString& src, bool* errors) +{ + // default to success + resetError(errors); + // get size of input string and reserve some space in output. // include UTF8's nul terminator. UInt32 n = src.size() + 1; @@ -56,7 +92,11 @@ CUnicode::UTF8ToUCS2(const CString& src) const UInt8* data = reinterpret_cast(src.c_str()); while (n > 0) { UInt32 c = fromUTF8(data, n); - if (c == s_invalid || c >= 0x00010000) { + if (c == s_invalid) { + c = s_replacement; + } + else if (c >= 0x00010000) { + setError(errors); c = s_replacement; } UInt16 ucs2 = static_cast(c); @@ -67,8 +107,11 @@ CUnicode::UTF8ToUCS2(const CString& src) } CString -CUnicode::UTF8ToUCS4(const CString& src) +CUnicode::UTF8ToUCS4(const CString& src, bool* errors) { + // default to success + resetError(errors); + // get size of input string and reserve some space in output. // include UTF8's nul terminator. UInt32 n = src.size() + 1; @@ -89,8 +132,11 @@ CUnicode::UTF8ToUCS4(const CString& src) } CString -CUnicode::UTF8ToUTF16(const CString& src) +CUnicode::UTF8ToUTF16(const CString& src, bool* errors) { + // default to success + resetError(errors); + // get size of input string and reserve some space in output. // include UTF8's nul terminator. UInt32 n = src.size() + 1; @@ -101,7 +147,11 @@ CUnicode::UTF8ToUTF16(const CString& src) const UInt8* data = reinterpret_cast(src.c_str()); while (n > 0) { UInt32 c = fromUTF8(data, n); - if (c == s_invalid || c >= 0x00110000) { + if (c == s_invalid) { + c = s_replacement; + } + else if (c >= 0x00110000) { + setError(errors); c = s_replacement; } if (c < 0x00010000) { @@ -121,8 +171,11 @@ CUnicode::UTF8ToUTF16(const CString& src) } CString -CUnicode::UTF8ToUTF32(const CString& src) +CUnicode::UTF8ToUTF32(const CString& src, bool* errors) { + // default to success + resetError(errors); + // get size of input string and reserve some space in output. // include UTF8's nul terminator. UInt32 n = src.size() + 1; @@ -133,7 +186,11 @@ CUnicode::UTF8ToUTF32(const CString& src) const UInt8* data = reinterpret_cast(src.c_str()); while (n > 0) { UInt32 c = fromUTF8(data, n); - if (c == s_invalid || c >= 0x00110000) { + if (c == s_invalid) { + c = s_replacement; + } + else if (c >= 0x00110000) { + setError(errors); c = s_replacement; } dst.append(reinterpret_cast(&c), 4); @@ -143,38 +200,13 @@ CUnicode::UTF8ToUTF32(const CString& src) } CString -CUnicode::UCS2ToUTF8(const CString& src) +CUnicode::UTF8ToText(const CString& src, bool* errors) { - UInt32 n = src.size() >> 1; - return doUCS2ToUTF8(reinterpret_cast(src.data()), n); -} + // default to success + resetError(errors); -CString -CUnicode::UCS4ToUTF8(const CString& src) -{ - UInt32 n = src.size() >> 2; - return doUCS4ToUTF8(reinterpret_cast(src.data()), n); -} - -CString -CUnicode::UTF16ToUTF8(const CString& src) -{ - UInt32 n = src.size() >> 1; - return doUTF16ToUTF8(reinterpret_cast(src.data()), n); -} - -CString -CUnicode::UTF32ToUTF8(const CString& src) -{ - UInt32 n = src.size() >> 2; - return doUTF32ToUTF8(reinterpret_cast(src.data()), n); -} - -CString -CUnicode::UTF8ToText(const CString& src) -{ // convert to wide char - wchar_t* tmp = UTF8ToWideChar(src); + wchar_t* tmp = UTF8ToWideChar(src, errors); // get length of multibyte string size_t len = 0; @@ -185,6 +217,7 @@ CUnicode::UTF8ToText(const CString& src) size_t mblen = wcrtomb(mbc, *scan, &state); if (mblen == -1) { // unconvertable character + setError(errors); len += 1; } else { @@ -225,34 +258,116 @@ CUnicode::UTF8ToText(const CString& src) } CString -CUnicode::textToUTF8(const CString& src) +CUnicode::UCS2ToUTF8(const CString& src, bool* errors) { - // get length of wide char string + UInt32 n = src.size() >> 1; + return doUCS2ToUTF8(reinterpret_cast(src.data()), n, errors); +} + +CString +CUnicode::UCS4ToUTF8(const CString& src, bool* errors) +{ + UInt32 n = src.size() >> 2; + return doUCS4ToUTF8(reinterpret_cast(src.data()), n, errors); +} + +CString +CUnicode::UTF16ToUTF8(const CString& src, bool* errors) +{ + UInt32 n = src.size() >> 1; + return doUTF16ToUTF8(reinterpret_cast(src.data()), n, errors); +} + +CString +CUnicode::UTF32ToUTF8(const CString& src, bool* errors) +{ + UInt32 n = src.size() >> 2; + return doUTF32ToUTF8(reinterpret_cast(src.data()), n, errors); +} + +CString +CUnicode::textToUTF8(const CString& src, bool* errors) +{ + // default to success + resetError(errors); + + // get length of multibyte string + UInt32 n = src.size(); + size_t len = 0; mbstate_t state; memset(&state, 0, sizeof(state)); - const char* scratch = src.c_str(); - size_t len = mbsrtowcs(NULL, &scratch, 0, &state); - if (len == (size_t)-1) { - // invalid character in src - return CString(); + for (const char* scan = src.c_str(); n > 0 && *scan != 0; ) { + size_t mblen = mbrtowc(NULL, scan, n, &state); + switch (mblen) { + case (size_t)2: + // incomplete last character. convert to unknown character. + setError(errors); + len += 1; + n = 0; + break; + + case (size_t)1: + // invalid character. count one unknown character and + // start at the next byte. + setError(errors); + len += 1; + scan += 1; + n -= 1; + break; + + default: + // normal character + len += 1; + scan += mblen; + n -= mblen; + break; + } } + memset(&state, 0, sizeof(state)); + + // allocate wide character string + wchar_t* wcs = new wchar_t[len + 1]; // convert multibyte to wide char - scratch = src.c_str(); - wchar_t* dst = new wchar_t[len + 1]; - mbsrtowcs(dst, &scratch, len + 1, &state); + n = src.size(); + wchar_t* dst = wcs; + for (const char* scan = src.c_str(); n > 0 && *scan != 0; ++dst) { + size_t mblen = mbrtowc(dst, scan, n, &state); + switch (mblen) { + case (size_t)2: + // incomplete character. convert to unknown character. + *dst = (wchar_t)0xfffd; + n = 0; + break; + + case (size_t)1: + // invalid character. count one unknown character and + // start at the next byte. + scan += 1; + n -= 1; + *dst = (wchar_t)0xfffd; + break; + + default: + // normal character + scan += mblen; + n -= mblen; + break; + } + } + *dst = L'\0'; // convert to UTF8 - CString utf8 = wideCharToUTF8(dst); + CString utf8 = wideCharToUTF8(wcs, errors); // clean up - delete[] dst; + delete[] wcs; return utf8; } wchar_t* -CUnicode::UTF8ToWideChar(const CString& src) +CUnicode::UTF8ToWideChar(const CString& src, bool* errors) { // convert to platform's wide character encoding. // note -- this must include a wide nul character (independent of @@ -272,21 +387,26 @@ CUnicode::UTF8ToWideChar(const CString& src) } CString -CUnicode::wideCharToUTF8(const wchar_t* src) +CUnicode::wideCharToUTF8(const wchar_t* src, bool* errors) { // convert from platform's wide character encoding. // note -- this must include a wide nul character (independent of // the CString's nul character). #if WINDOWS_LIKE - return doUCS16ToUTF8(reinterpret_cast(src), wcslen(src)); + return doUCS16ToUTF8(reinterpret_cast(src), + wcslen(src), errors); #elif UNIX_LIKE - return doUCS4ToUTF8(reinterpret_cast(src), wcslen(src)); + return doUCS4ToUTF8(reinterpret_cast(src), + wcslen(src), errors); #endif } CString -CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n) +CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors) { + // default to success + resetError(errors); + // make some space CString dst; dst.reserve(n); @@ -294,7 +414,7 @@ CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n) // convert each character for (; n > 0; data += 2, --n) { UInt32 c = decode16(data); - toUTF8(dst, c); + toUTF8(dst, c, errors); } // remove extra trailing nul @@ -306,8 +426,11 @@ CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n) } CString -CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n) +CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors) { + // default to success + resetError(errors); + // make some space CString dst; dst.reserve(n); @@ -315,7 +438,7 @@ CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n) // convert each character for (; n > 0; data += 4, --n) { UInt32 c = decode32(data); - toUTF8(dst, c); + toUTF8(dst, c, errors); } // remove extra trailing nul @@ -327,8 +450,11 @@ CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n) } CString -CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) +CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors) { + // default to success + resetError(errors); + // make some space CString dst; dst.reserve(n); @@ -337,11 +463,12 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) for (; n > 0; data += 2, --n) { UInt32 c = decode16(data); if (c < 0x0000d800 || c > 0x0000dfff) { - toUTF8(dst, c); + toUTF8(dst, c, errors); } else if (n == 1) { // error -- missing second word - toUTF8(dst, s_replacement); + setError(errors); + toUTF8(dst, s_replacement, NULL); } else if (c >= 0x0000d800 && c <= 0x0000dbff) { UInt32 c2 = decode16(data); @@ -349,16 +476,18 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) --n; if (c2 < 0x0000dc00 || c2 > 0x0000dfff) { // error -- [d800,dbff] not followed by [dc00,dfff] - toUTF8(dst, s_replacement); + setError(errors); + toUTF8(dst, s_replacement, NULL); } else { c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000; - toUTF8(dst, c); + toUTF8(dst, c, errors); } } else { // error -- [dc00,dfff] without leading [d800,dbff] - toUTF8(dst, s_replacement); + setError(errors); + toUTF8(dst, s_replacement, NULL); } } @@ -371,8 +500,11 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) } CString -CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n) +CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors) { + // default to success + resetError(errors); + // make some space CString dst; dst.reserve(n); @@ -381,9 +513,10 @@ CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n) for (; n > 0; data += 4, --n) { UInt32 c = decode32(data); if (c >= 0x00110000) { + setError(errors); c = s_replacement; } - toUTF8(dst, c); + toUTF8(dst, c, errors); } // remove extra trailing nul @@ -571,12 +704,13 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n) } void -CUnicode::toUTF8(CString& dst, UInt32 c) +CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors) { UInt8 data[6]; // handle characters outside the valid range - if (c >= 0x80000000) { + if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) { + setError(errors); c = s_replacement; } diff --git a/base/CUnicode.h b/base/CUnicode.h index 81ff95d7..2676e086 100644 --- a/base/CUnicode.h +++ b/base/CUnicode.h @@ -7,39 +7,50 @@ class CUnicode { public: - static CString UTF8ToUCS2(const CString&); - static CString UTF8ToUCS4(const CString&); - static CString UTF8ToUTF16(const CString&); - static CString UTF8ToUTF32(const CString&); + // returns true iff the string contains a valid sequence of UTF-8 + // encoded characters. + static bool isUTF8(const CString&); - static CString UCS2ToUTF8(const CString&); - static CString UCS4ToUTF8(const CString&); - static CString UTF16ToUTF8(const CString&); - static CString UTF32ToUTF8(const CString&); + // convert from UTF-8 encoding to other encodings. if errors is + // not NULL then it gets true if any characters could not be + // encoded in the target encoding and false otherwise. note + // that decoding errors do not set errors to error. UTF8ToText() + // converts to the current locale's (multibyte) encoding. + static CString UTF8ToUCS2(const CString&, bool* errors = NULL); + static CString UTF8ToUCS4(const CString&, bool* errors = NULL); + static CString UTF8ToUTF16(const CString&, bool* errors = NULL); + static CString UTF8ToUTF32(const CString&, bool* errors = NULL); + static CString UTF8ToText(const CString&, bool* errors = NULL); - // convert UTF-8 to/from the current locale's encoding - static CString UTF8ToText(const CString&); - static CString textToUTF8(const CString&); + // convert from some encoding to UTF-8. if errors is not NULL + // then it gets true if any characters could not be decoded and + // false otherwise. textToUTF8() converts from the current + // locale's (multibyte) encoding. + static CString UCS2ToUTF8(const CString&, bool* errors = NULL); + static CString UCS4ToUTF8(const CString&, bool* errors = NULL); + static CString UTF16ToUTF8(const CString&, bool* errors = NULL); + static CString UTF32ToUTF8(const CString&, bool* errors = NULL); + static CString textToUTF8(const CString&, bool* errors = NULL); private: // convert UTF8 to nul terminated wchar_t string (using whatever // encoding is native to the platform). caller must delete[] // the returned string. - static wchar_t* UTF8ToWideChar(const CString&); + static wchar_t* UTF8ToWideChar(const CString&, bool* errors); // convert nul terminated wchar_t string (in platform's native // encoding) to UTF8. - static CString wideCharToUTF8(const wchar_t*); + static CString wideCharToUTF8(const wchar_t*, bool* errors); // internal conversion to UTF8 - static CString doUCS2ToUTF8(const UInt8* src, UInt32 n); - static CString doUCS4ToUTF8(const UInt8* src, UInt32 n); - static CString doUTF16ToUTF8(const UInt8* src, UInt32 n); - static CString doUTF32ToUTF8(const UInt8* src, UInt32 n); + static CString doUCS2ToUTF8(const UInt8* src, UInt32 n, bool* errors); + static CString doUCS4ToUTF8(const UInt8* src, UInt32 n, bool* errors); + static CString doUTF16ToUTF8(const UInt8* src, UInt32 n, bool* errors); + static CString doUTF32ToUTF8(const UInt8* src, UInt32 n, bool* errors); // convert characters to/from UTF8 static UInt32 fromUTF8(const UInt8*& src, UInt32& size); - static void toUTF8(CString& dst, UInt32 c); + static void toUTF8(CString& dst, UInt32 c, bool* errors); private: static UInt32 s_invalid; diff --git a/platform/CXWindowsClipboard.cpp b/platform/CXWindowsClipboard.cpp index 76c766bf..75efae08 100644 --- a/platform/CXWindowsClipboard.cpp +++ b/platform/CXWindowsClipboard.cpp @@ -151,22 +151,14 @@ CXWindowsClipboard::addSimpleRequest(Window requestor, type = getTimestampData(data, &format); } else { -char* name = XGetAtomName(m_display, target); -log((CLOG_INFO "request target: %d %s", target, name)); -XFree(name); IXWindowsClipboardConverter* converter = getConverter(target); if (converter != NULL) { -log((CLOG_INFO "found converter")); IClipboard::EFormat clipboardFormat = converter->getFormat(); -log((CLOG_INFO "clipboard format: %d", clipboardFormat)); if (m_added[clipboardFormat]) { -log((CLOG_INFO "added")); try { data = converter->fromIClipboard(m_data[clipboardFormat]); format = converter->getDataSize(); type = converter->getAtom(); -log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str())); -log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str())); } catch (...) { // ignore -- cannot convert @@ -535,16 +527,9 @@ CXWindowsClipboard::icccmFillCache() // add to clipboard and note we've done it IClipboard::EFormat format = converter->getFormat(); - try { - m_data[format] = converter->toIClipboard(targetData); - if (!m_data[format].empty()) { - m_added[format] = true; - log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); - } - } - catch (...) { - // ignore -- could not convert data - } + m_data[format] = converter->toIClipboard(targetData); + m_added[format] = true; + log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); } } @@ -799,16 +784,9 @@ CXWindowsClipboard::motifFillCache() // add to clipboard and note we've done it IClipboard::EFormat format = converter->getFormat(); - try { - m_data[format] = converter->toIClipboard(targetData); - if (!m_data[format].empty()) { - m_added[format] = true; - log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); - } - } - catch (...) { - // ignore -- could not convert data - } + m_data[format] = converter->toIClipboard(targetData); + m_added[format] = true; + log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); } } diff --git a/platform/CXWindowsClipboardTextConverter.cpp b/platform/CXWindowsClipboardTextConverter.cpp index be3b8c93..ae791730 100644 --- a/platform/CXWindowsClipboardTextConverter.cpp +++ b/platform/CXWindowsClipboardTextConverter.cpp @@ -44,5 +44,17 @@ CXWindowsClipboardTextConverter::fromIClipboard(const CString& data) const CString CXWindowsClipboardTextConverter::toIClipboard(const CString& data) const { - return CUnicode::textToUTF8(data); + // convert to UTF-8 + bool errors; + CString utf8 = CUnicode::textToUTF8(data, &errors); + + // if there were decoding errors then, to support old applications + // that don't understand UTF-8 but can report the exact binary + // UTF-8 representation, see if the data appears to be UTF-8. if + // so then use it as is. + if (errors && CUnicode::isUTF8(data)) { + return data; + } + + return utf8; }