checkpoint. more UTF8 clipboard testing.

This commit is contained in:
crs 2002-07-23 09:33:50 +00:00
parent fcd99c9510
commit 16cc05d56b
3 changed files with 185 additions and 79 deletions

View File

@ -1,4 +1,5 @@
#include "CUnicode.h" #include "CUnicode.h"
#include <limits.h>
#include <string.h> #include <string.h>
// //
@ -40,6 +41,7 @@ decode32(const UInt8* n)
// //
UInt32 CUnicode::s_invalid = 0x0000ffff; UInt32 CUnicode::s_invalid = 0x0000ffff;
UInt32 CUnicode::s_replacement = 0x0000fffd;
CString CString
CUnicode::UTF8ToUCS2(const CString& src) CUnicode::UTF8ToUCS2(const CString& src)
@ -54,11 +56,12 @@ CUnicode::UTF8ToUCS2(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) { while (n > 0) {
UInt32 c = fromUTF8(data, n); UInt32 c = fromUTF8(data, n);
if (c != s_invalid && c < 0x00010000) { if (c == s_invalid || c >= 0x00010000) {
c = s_replacement;
}
UInt16 ucs2 = static_cast<UInt16>(c); UInt16 ucs2 = static_cast<UInt16>(c);
dst.append(reinterpret_cast<const char*>(&ucs2), 2); dst.append(reinterpret_cast<const char*>(&ucs2), 2);
} }
}
return dst; return dst;
} }
@ -76,9 +79,10 @@ CUnicode::UTF8ToUCS4(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) { while (n > 0) {
UInt32 c = fromUTF8(data, n); UInt32 c = fromUTF8(data, n);
if (c != s_invalid) { if (c == s_invalid) {
dst.append(reinterpret_cast<const char*>(&c), 4); c = s_replacement;
} }
dst.append(reinterpret_cast<const char*>(&c), 4);
} }
return dst; return dst;
@ -97,7 +101,9 @@ CUnicode::UTF8ToUTF16(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) { while (n > 0) {
UInt32 c = fromUTF8(data, n); UInt32 c = fromUTF8(data, n);
if (c != s_invalid && c < 0x0010ffff) { if (c == s_invalid || c >= 0x00110000) {
c = s_replacement;
}
if (c < 0x00010000) { if (c < 0x00010000) {
UInt16 ucs2 = static_cast<UInt16>(c); UInt16 ucs2 = static_cast<UInt16>(c);
dst.append(reinterpret_cast<const char*>(&ucs2), 2); dst.append(reinterpret_cast<const char*>(&ucs2), 2);
@ -110,7 +116,6 @@ CUnicode::UTF8ToUTF16(const CString& src)
dst.append(reinterpret_cast<const char*>(&utf16l), 2); dst.append(reinterpret_cast<const char*>(&utf16l), 2);
} }
} }
}
return dst; return dst;
} }
@ -118,8 +123,23 @@ CUnicode::UTF8ToUTF16(const CString& src)
CString CString
CUnicode::UTF8ToUTF32(const CString& src) CUnicode::UTF8ToUTF32(const CString& src)
{ {
// FIXME -- should ensure dst has no characters over U-0010FFFF // get size of input string and reserve some space in output.
return UTF8ToUCS4(src); // include UTF8's nul terminator.
UInt32 n = src.size() + 1;
CString dst;
dst.reserve(4 * n);
// convert each character
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) {
UInt32 c = fromUTF8(data, n);
if (c == s_invalid || c >= 0x00110000) {
c = s_replacement;
}
dst.append(reinterpret_cast<const char*>(&c), 4);
}
return dst;
} }
CString CString
@ -157,24 +177,48 @@ CUnicode::UTF8ToText(const CString& src)
wchar_t* tmp = UTF8ToWideChar(src); wchar_t* tmp = UTF8ToWideChar(src);
// get length of multibyte string // get length of multibyte string
size_t len = 0;
char mbc[MB_LEN_MAX];
mbstate_t state; mbstate_t state;
memset(&state, 0, sizeof(state)); memset(&state, 0, sizeof(state));
const wchar_t* scratch = tmp; for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
size_t len = wcsrtombs(NULL, &scratch, 0, &state); size_t mblen = wcrtomb(mbc, *scan, &state);
if (len == (size_t)-1) { if (mblen == -1) {
// invalid character in src // unconvertable character
delete[] tmp; len += 1;
return CString(); }
else {
len += mblen;
}
} }
// check if state is in initial state. if not then count the
// bytes for returning it to the initial state.
if (mbsinit(&state) == 0) {
len += wcrtomb(mbc, L'\0', &state) - 1;
}
assert(mbsinit(&state) != 0);
// allocate multibyte string
char* mbs = new char[len + 1];
// convert to multibyte // convert to multibyte
scratch = tmp; char* dst = mbs;
char* dst = new char[len + 1]; for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
wcsrtombs(dst, &scratch, len + 1, &state); size_t mblen = wcrtomb(dst, *scan, &state);
CString text(dst); if (mblen == -1) {
// unconvertable character
*dst++ = '?';
}
else {
dst += len;
}
}
*dst = '\0';
CString text(mbs);
// clean up // clean up
delete[] dst; delete[] mbs;
delete[] tmp; delete[] tmp;
return text; return text;
@ -297,6 +341,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
} }
else if (n == 1) { else if (n == 1) {
// error -- missing second word // error -- missing second word
toUTF8(dst, s_replacement);
} }
else if (c >= 0x0000d800 && c <= 0x0000dbff) { else if (c >= 0x0000d800 && c <= 0x0000dbff) {
UInt32 c2 = decode16(data); UInt32 c2 = decode16(data);
@ -304,6 +349,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
--n; --n;
if (c2 < 0x0000dc00 || c2 > 0x0000dfff) { if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
// error -- [d800,dbff] not followed by [dc00,dfff] // error -- [d800,dbff] not followed by [dc00,dfff]
toUTF8(dst, s_replacement);
} }
else { else {
c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000; c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
@ -312,6 +358,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
} }
else { else {
// error -- [dc00,dfff] without leading [d800,dbff] // error -- [dc00,dfff] without leading [d800,dbff]
toUTF8(dst, s_replacement);
} }
} }
@ -326,8 +373,25 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
CString CString
CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n) CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
{ {
// FIXME -- should check that src has no characters over U-0010FFFF // make some space
return doUCS4ToUTF8(data, n); CString dst;
dst.reserve(n);
// convert each character
for (; n > 0; data += 4, --n) {
UInt32 c = decode32(data);
if (c >= 0x00110000) {
c = s_replacement;
}
toUTF8(dst, c);
}
// remove extra trailing nul
if (dst.size() > 0 && dst[dst.size() - 1] == '\0') {
dst.resize(dst.size() - 1);
}
return dst;
} }
UInt32 UInt32
@ -433,10 +497,54 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
assert(0 && "invalid size"); assert(0 && "invalid size");
} }
// check that all bytes after the first have the pattern 10xxxxxx.
// truncated sequences are treated as a single malformed character.
bool truncated = false;
switch (size) {
case 6:
if ((data[5] & 0xc0) != 0x80) {
truncated = true;
size = 5;
}
// fall through
case 5:
if ((data[4] & 0xc0) != 0x80) {
truncated = true;
size = 4;
}
// fall through
case 4:
if ((data[3] & 0xc0) != 0x80) {
truncated = true;
size = 3;
}
// fall through
case 3:
if ((data[2] & 0xc0) != 0x80) {
truncated = true;
size = 2;
}
// fall through
case 2:
if ((data[1] & 0xc0) != 0x80) {
truncated = true;
size = 1;
}
}
// update parameters // update parameters
data += size; data += size;
n -= size; n -= size;
// invalid if sequence was truncated
if (truncated) {
return s_invalid;
}
// check for characters that didn't use the smallest possible encoding // check for characters that didn't use the smallest possible encoding
static UInt32 s_minChar[] = { static UInt32 s_minChar[] = {
0, 0,
@ -451,29 +559,11 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
return s_invalid; return s_invalid;
} }
// check that all bytes after the first have the pattern 10xxxxxx. // check for characters not in ISO-10646
UInt8 a = 0x80; if (c >= 0x0000d800 && c <= 0x0000dfff) {
switch (size) { return s_invalid;
case 6:
a |= data[5];
// fall through
case 5:
a |= data[4];
// fall through
case 4:
a |= data[3];
// fall through
case 3:
a |= data[2];
// fall through
case 2:
a |= data[1];
} }
if ((a & 0xc0) != 0x80) { if (c >= 0x0000fffe && c <= 0x0000ffff) {
return s_invalid; return s_invalid;
} }
@ -481,10 +571,16 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
} }
void void
CUnicode::toUTF8(CString& dst, const UInt32 c) CUnicode::toUTF8(CString& dst, UInt32 c)
{ {
UInt8 data[6]; UInt8 data[6];
// handle characters outside the valid range
if (c >= 0x80000000) {
c = s_replacement;
}
// convert to UTF-8
if (c < 0x00000080) { if (c < 0x00000080) {
data[0] = static_cast<UInt8>(c); data[0] = static_cast<UInt8>(c);
dst.append(reinterpret_cast<char*>(data), 1); dst.append(reinterpret_cast<char*>(data), 1);
@ -525,6 +621,6 @@ CUnicode::toUTF8(CString& dst, const UInt32 c)
dst.append(reinterpret_cast<char*>(data), 6); dst.append(reinterpret_cast<char*>(data), 6);
} }
else { else {
// invalid character assert(0 && "character out of range");
} }
} }

View File

@ -39,10 +39,11 @@ private:
// convert characters to/from UTF8 // convert characters to/from UTF8
static UInt32 fromUTF8(const UInt8*& src, UInt32& size); static UInt32 fromUTF8(const UInt8*& src, UInt32& size);
static void toUTF8(CString& dst, const UInt32 c); static void toUTF8(CString& dst, UInt32 c);
private: private:
static UInt32 s_invalid; static UInt32 s_invalid;
static UInt32 s_replacement;
}; };
#endif #endif

View File

@ -161,12 +161,17 @@ log((CLOG_INFO "found converter"));
log((CLOG_INFO "clipboard format: %d", clipboardFormat)); log((CLOG_INFO "clipboard format: %d", clipboardFormat));
if (m_added[clipboardFormat]) { if (m_added[clipboardFormat]) {
log((CLOG_INFO "added")); log((CLOG_INFO "added"));
type = converter->getAtom(); try {
format = converter->getDataSize();
data = converter->fromIClipboard(m_data[clipboardFormat]); data = converter->fromIClipboard(m_data[clipboardFormat]);
format = converter->getDataSize();
type = converter->getAtom();
log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str())); log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str())); log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str()));
} }
catch (...) {
// ignore -- cannot convert
}
}
} }
} }
@ -529,16 +534,18 @@ CXWindowsClipboard::icccmFillCache()
} }
// add to clipboard and note we've done it // add to clipboard and note we've done it
m_data[converter->getFormat()] = converter->toIClipboard(targetData); IClipboard::EFormat format = converter->getFormat();
m_added[converter->getFormat()] = true; try {
// XXX m_data[format] = converter->toIClipboard(targetData);
char* name = XGetAtomName(m_display, target); if (!m_data[format].empty()) {
log((CLOG_INFO "src atom: %d %s", target, name)); m_added[format] = true;
XFree(name);
log((CLOG_INFO "src data size: %d", targetData.size()));
log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
} }
}
catch (...) {
// ignore -- could not convert data
}
}
} }
bool bool
@ -791,16 +798,18 @@ CXWindowsClipboard::motifFillCache()
targetData.erase(length); targetData.erase(length);
// add to clipboard and note we've done it // add to clipboard and note we've done it
m_data[converter->getFormat()] = converter->toIClipboard(targetData); IClipboard::EFormat format = converter->getFormat();
m_added[converter->getFormat()] = true; try {
// XXX m_data[format] = converter->toIClipboard(targetData);
char* name = XGetAtomName(m_display, target); if (!m_data[format].empty()) {
log((CLOG_INFO "src atom: %d %s", target, name)); m_added[format] = true;
XFree(name);
log((CLOG_INFO "src data size: %d", targetData.size()));
log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
} }
}
catch (...) {
// ignore -- could not convert data
}
}
} }
IClipboard::Time IClipboard::Time