From 16cc05d56bd9d75b857fe2c952b8381ce74d8913 Mon Sep 17 00:00:00 2001 From: crs Date: Tue, 23 Jul 2002 09:33:50 +0000 Subject: [PATCH] checkpoint. more UTF8 clipboard testing. --- base/CUnicode.cpp | 210 +++++++++++++++++++++++--------- base/CUnicode.h | 3 +- platform/CXWindowsClipboard.cpp | 51 ++++---- 3 files changed, 185 insertions(+), 79 deletions(-) diff --git a/base/CUnicode.cpp b/base/CUnicode.cpp index 8925752d..ccabd227 100644 --- a/base/CUnicode.cpp +++ b/base/CUnicode.cpp @@ -1,4 +1,5 @@ #include "CUnicode.h" +#include #include // @@ -39,7 +40,8 @@ decode32(const UInt8* n) // CUnicode // -UInt32 CUnicode::s_invalid = 0x0000ffff; +UInt32 CUnicode::s_invalid = 0x0000ffff; +UInt32 CUnicode::s_replacement = 0x0000fffd; CString CUnicode::UTF8ToUCS2(const CString& src) @@ -54,10 +56,11 @@ CUnicode::UTF8ToUCS2(const CString& src) const UInt8* data = reinterpret_cast(src.c_str()); while (n > 0) { UInt32 c = fromUTF8(data, n); - if (c != s_invalid && c < 0x00010000) { - UInt16 ucs2 = static_cast(c); - dst.append(reinterpret_cast(&ucs2), 2); + if (c == s_invalid || c >= 0x00010000) { + c = s_replacement; } + UInt16 ucs2 = static_cast(c); + dst.append(reinterpret_cast(&ucs2), 2); } return dst; @@ -76,9 +79,10 @@ CUnicode::UTF8ToUCS4(const CString& src) const UInt8* data = reinterpret_cast(src.c_str()); while (n > 0) { UInt32 c = fromUTF8(data, n); - if (c != s_invalid) { - dst.append(reinterpret_cast(&c), 4); + if (c == s_invalid) { + c = s_replacement; } + dst.append(reinterpret_cast(&c), 4); } return dst; @@ -97,18 +101,19 @@ CUnicode::UTF8ToUTF16(const CString& src) const UInt8* data = reinterpret_cast(src.c_str()); while (n > 0) { UInt32 c = fromUTF8(data, n); - if (c != s_invalid && c < 0x0010ffff) { - if (c < 0x00010000) { - UInt16 ucs2 = static_cast(c); - dst.append(reinterpret_cast(&ucs2), 2); - } - else { - c -= 0x00010000; - UInt16 utf16h = static_cast(c >> 10) + 0xd800; - UInt16 utf16l = (static_cast(c) & 0x03ff) + 0xdc00; - 
dst.append(reinterpret_cast(&utf16h), 2); - dst.append(reinterpret_cast(&utf16l), 2); - } + if (c == s_invalid || c >= 0x00110000) { + c = s_replacement; + } + if (c < 0x00010000) { + UInt16 ucs2 = static_cast(c); + dst.append(reinterpret_cast(&ucs2), 2); + } + else { + c -= 0x00010000; + UInt16 utf16h = static_cast(c >> 10) + 0xd800; + UInt16 utf16l = (static_cast(c) & 0x03ff) + 0xdc00; + dst.append(reinterpret_cast(&utf16h), 2); + dst.append(reinterpret_cast(&utf16l), 2); } } @@ -118,8 +123,23 @@ CUnicode::UTF8ToUTF16(const CString& src) CString CUnicode::UTF8ToUTF32(const CString& src) { - // FIXME -- should ensure dst has no characters over U-0010FFFF - return UTF8ToUCS4(src); + // get size of input string and reserve some space in output. + // include UTF8's nul terminator. + UInt32 n = src.size() + 1; + CString dst; + dst.reserve(4 * n); + + // convert each character + const UInt8* data = reinterpret_cast(src.c_str()); + while (n > 0) { + UInt32 c = fromUTF8(data, n); + if (c == s_invalid || c >= 0x00110000) { + c = s_replacement; + } + dst.append(reinterpret_cast(&c), 4); + } + + return dst; } CString @@ -157,24 +177,48 @@ CUnicode::UTF8ToText(const CString& src) wchar_t* tmp = UTF8ToWideChar(src); // get length of multibyte string + size_t len = 0; + char mbc[MB_LEN_MAX]; mbstate_t state; memset(&state, 0, sizeof(state)); - const wchar_t* scratch = tmp; - size_t len = wcsrtombs(NULL, &scratch, 0, &state); - if (len == (size_t)-1) { - // invalid character in src - delete[] tmp; - return CString(); + for (const wchar_t* scan = tmp; *scan != 0; ++scan) { + size_t mblen = wcrtomb(mbc, *scan, &state); + if (mblen == -1) { + // unconvertable character + len += 1; + } + else { + len += mblen; + } } + // check if state is in initial state. if not then count the + // bytes for returning it to the initial state. 
+ if (mbsinit(&state) == 0) { + len += wcrtomb(mbc, L'\0', &state) - 1; + } + assert(mbsinit(&state) != 0); + + // allocate multibyte string + char* mbs = new char[len + 1]; + // convert to multibyte - scratch = tmp; - char* dst = new char[len + 1]; - wcsrtombs(dst, &scratch, len + 1, &state); - CString text(dst); + char* dst = mbs; + for (const wchar_t* scan = tmp; *scan != 0; ++scan) { + size_t mblen = wcrtomb(dst, *scan, &state); + if (mblen == -1) { + // unconvertable character + *dst++ = '?'; + } + else { + dst += mblen; + } + } + *dst = '\0'; + CString text(mbs); // clean up - delete[] dst; + delete[] mbs; delete[] tmp; return text; @@ -297,6 +341,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) } else if (n == 1) { // error -- missing second word + toUTF8(dst, s_replacement); } else if (c >= 0x0000d800 && c <= 0x0000dbff) { UInt32 c2 = decode16(data); @@ -304,6 +349,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) --n; if (c2 < 0x0000dc00 || c2 > 0x0000dfff) { // error -- [d800,dbff] not followed by [dc00,dfff] + toUTF8(dst, s_replacement); } else { c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000; @@ -312,6 +358,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) } else { // error -- [dc00,dfff] without leading [d800,dbff] + toUTF8(dst, s_replacement); } } @@ -326,8 +373,25 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) CString CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n) { - // FIXME -- should check that src has no characters over U-0010FFFF - return doUCS4ToUTF8(data, n); + // make some space + CString dst; + dst.reserve(n); + + // convert each character + for (; n > 0; data += 4, --n) { + UInt32 c = decode32(data); + if (c >= 0x00110000) { + c = s_replacement; + } + toUTF8(dst, c); + } + + // remove extra trailing nul + if (dst.size() > 0 && dst[dst.size() - 1] == '\0') { + dst.resize(dst.size() - 1); + } + + return dst; } UInt32 @@ -433,10 +497,54 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n) 
assert(0 && "invalid size"); } + // check that all bytes after the first have the pattern 10xxxxxx. + // truncated sequences are treated as a single malformed character. + bool truncated = false; + switch (size) { + case 6: + if ((data[5] & 0xc0) != 0x80) { + truncated = true; + size = 5; + } + // fall through + + case 5: + if ((data[4] & 0xc0) != 0x80) { + truncated = true; + size = 4; + } + // fall through + + case 4: + if ((data[3] & 0xc0) != 0x80) { + truncated = true; + size = 3; + } + // fall through + + case 3: + if ((data[2] & 0xc0) != 0x80) { + truncated = true; + size = 2; + } + // fall through + + case 2: + if ((data[1] & 0xc0) != 0x80) { + truncated = true; + size = 1; + } + } + // update parameters data += size; n -= size; + // invalid if sequence was truncated + if (truncated) { + return s_invalid; + } + // check for characters that didn't use the smallest possible encoding static UInt32 s_minChar[] = { 0, @@ -451,29 +559,11 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n) return s_invalid; } - // check that all bytes after the first have the pattern 10xxxxxx. 
- UInt8 a = 0x80; - switch (size) { - case 6: - a |= data[5]; - // fall through - - case 5: - a |= data[4]; - // fall through - - case 4: - a |= data[3]; - // fall through - - case 3: - a |= data[2]; - // fall through - - case 2: - a |= data[1]; + // check for characters not in ISO-10646 + if (c >= 0x0000d800 && c <= 0x0000dfff) { + return s_invalid; } - if ((a & 0xc0) != 0x80) { + if (c >= 0x0000fffe && c <= 0x0000ffff) { return s_invalid; } @@ -481,10 +571,16 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n) } void -CUnicode::toUTF8(CString& dst, const UInt32 c) +CUnicode::toUTF8(CString& dst, UInt32 c) { UInt8 data[6]; + // handle characters outside the valid range + if (c >= 0x80000000) { + c = s_replacement; + } + + // convert to UTF-8 if (c < 0x00000080) { data[0] = static_cast(c); dst.append(reinterpret_cast(data), 1); @@ -525,6 +621,6 @@ CUnicode::toUTF8(CString& dst, const UInt32 c) dst.append(reinterpret_cast(data), 6); } else { - // invalid character + assert(0 && "character out of range"); } } diff --git a/base/CUnicode.h b/base/CUnicode.h index 4bcabe60..81ff95d7 100644 --- a/base/CUnicode.h +++ b/base/CUnicode.h @@ -39,10 +39,11 @@ private: // convert characters to/from UTF8 static UInt32 fromUTF8(const UInt8*& src, UInt32& size); - static void toUTF8(CString& dst, const UInt32 c); + static void toUTF8(CString& dst, UInt32 c); private: static UInt32 s_invalid; + static UInt32 s_replacement; }; #endif diff --git a/platform/CXWindowsClipboard.cpp b/platform/CXWindowsClipboard.cpp index e1f19eff..76c766bf 100644 --- a/platform/CXWindowsClipboard.cpp +++ b/platform/CXWindowsClipboard.cpp @@ -161,11 +161,16 @@ log((CLOG_INFO "found converter")); log((CLOG_INFO "clipboard format: %d", clipboardFormat)); if (m_added[clipboardFormat]) { log((CLOG_INFO "added")); - type = converter->getAtom(); - format = converter->getDataSize(); - data = converter->fromIClipboard(m_data[clipboardFormat]); + try { + data = converter->fromIClipboard(m_data[clipboardFormat]); 
+ format = converter->getDataSize(); + type = converter->getAtom(); log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str())); log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str())); + } + catch (...) { + // ignore -- cannot convert + } } } } @@ -529,15 +534,17 @@ CXWindowsClipboard::icccmFillCache() } // add to clipboard and note we've done it - m_data[converter->getFormat()] = converter->toIClipboard(targetData); - m_added[converter->getFormat()] = true; -// XXX -char* name = XGetAtomName(m_display, target); -log((CLOG_INFO "src atom: %d %s", target, name)); -XFree(name); -log((CLOG_INFO "src data size: %d", targetData.size())); -log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size())); - log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); + IClipboard::EFormat format = converter->getFormat(); + try { + m_data[format] = converter->toIClipboard(targetData); + if (!m_data[format].empty()) { + m_added[format] = true; + log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); + } + } + catch (...) 
{ + // ignore -- could not convert data + } } } @@ -791,15 +798,17 @@ CXWindowsClipboard::motifFillCache() targetData.erase(length); // add to clipboard and note we've done it - m_data[converter->getFormat()] = converter->toIClipboard(targetData); - m_added[converter->getFormat()] = true; -// XXX -char* name = XGetAtomName(m_display, target); -log((CLOG_INFO "src atom: %d %s", target, name)); -XFree(name); -log((CLOG_INFO "src data size: %d", targetData.size())); -log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size())); - log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); + IClipboard::EFormat format = converter->getFormat(); + try { + m_data[format] = converter->toIClipboard(targetData); + if (!m_data[format].empty()) { + m_added[format] = true; + log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); + } + } + catch (...) { + // ignore -- could not convert data + } } }