checkpoint. more UTF8 clipboard stuff.

This commit is contained in:
crs 2002-07-23 11:36:18 +00:00
parent 16cc05d56b
commit 1f5cb6a96f
4 changed files with 249 additions and 114 deletions

View File

@ -36,6 +36,26 @@ decode32(const UInt8* n)
return c.n32; return c.n32;
} }
// Clears the caller's optional error flag.  A NULL pointer means the
// caller is not interested in error reporting, so nothing is written.
static
inline
void
resetError(bool* errors)
{
    if (errors == NULL) {
        return;
    }
    *errors = false;
}
// Raises the caller's optional error flag.  A NULL pointer means the
// caller is not interested in error reporting, so nothing is written.
static
inline
void
setError(bool* errors)
{
    if (errors == NULL) {
        return;
    }
    *errors = true;
}
// //
// CUnicode // CUnicode
// //
@ -43,9 +63,25 @@ decode32(const UInt8* n)
UInt32 CUnicode::s_invalid = 0x0000ffff; UInt32 CUnicode::s_invalid = 0x0000ffff;
UInt32 CUnicode::s_replacement = 0x0000fffd; UInt32 CUnicode::s_replacement = 0x0000fffd;
CString bool
CUnicode::UTF8ToUCS2(const CString& src) CUnicode::isUTF8(const CString& src)
{ {
// convert and test each character
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
for (UInt32 n = src.size(); n > 0; ) {
if (fromUTF8(data, n) == s_invalid) {
return false;
}
}
return true;
}
CString
CUnicode::UTF8ToUCS2(const CString& src, bool* errors)
{
// default to success
resetError(errors);
// get size of input string and reserve some space in output. // get size of input string and reserve some space in output.
// include UTF8's nul terminator. // include UTF8's nul terminator.
UInt32 n = src.size() + 1; UInt32 n = src.size() + 1;
@ -56,7 +92,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) { while (n > 0) {
UInt32 c = fromUTF8(data, n); UInt32 c = fromUTF8(data, n);
if (c == s_invalid || c >= 0x00010000) { if (c == s_invalid) {
c = s_replacement;
}
else if (c >= 0x00010000) {
setError(errors);
c = s_replacement; c = s_replacement;
} }
UInt16 ucs2 = static_cast<UInt16>(c); UInt16 ucs2 = static_cast<UInt16>(c);
@ -67,8 +107,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
} }
CString CString
CUnicode::UTF8ToUCS4(const CString& src) CUnicode::UTF8ToUCS4(const CString& src, bool* errors)
{ {
// default to success
resetError(errors);
// get size of input string and reserve some space in output. // get size of input string and reserve some space in output.
// include UTF8's nul terminator. // include UTF8's nul terminator.
UInt32 n = src.size() + 1; UInt32 n = src.size() + 1;
@ -89,8 +132,11 @@ CUnicode::UTF8ToUCS4(const CString& src)
} }
CString CString
CUnicode::UTF8ToUTF16(const CString& src) CUnicode::UTF8ToUTF16(const CString& src, bool* errors)
{ {
// default to success
resetError(errors);
// get size of input string and reserve some space in output. // get size of input string and reserve some space in output.
// include UTF8's nul terminator. // include UTF8's nul terminator.
UInt32 n = src.size() + 1; UInt32 n = src.size() + 1;
@ -101,7 +147,11 @@ CUnicode::UTF8ToUTF16(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) { while (n > 0) {
UInt32 c = fromUTF8(data, n); UInt32 c = fromUTF8(data, n);
if (c == s_invalid || c >= 0x00110000) { if (c == s_invalid) {
c = s_replacement;
}
else if (c >= 0x00110000) {
setError(errors);
c = s_replacement; c = s_replacement;
} }
if (c < 0x00010000) { if (c < 0x00010000) {
@ -121,8 +171,11 @@ CUnicode::UTF8ToUTF16(const CString& src)
} }
CString CString
CUnicode::UTF8ToUTF32(const CString& src) CUnicode::UTF8ToUTF32(const CString& src, bool* errors)
{ {
// default to success
resetError(errors);
// get size of input string and reserve some space in output. // get size of input string and reserve some space in output.
// include UTF8's nul terminator. // include UTF8's nul terminator.
UInt32 n = src.size() + 1; UInt32 n = src.size() + 1;
@ -133,7 +186,11 @@ CUnicode::UTF8ToUTF32(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str()); const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) { while (n > 0) {
UInt32 c = fromUTF8(data, n); UInt32 c = fromUTF8(data, n);
if (c == s_invalid || c >= 0x00110000) { if (c == s_invalid) {
c = s_replacement;
}
else if (c >= 0x00110000) {
setError(errors);
c = s_replacement; c = s_replacement;
} }
dst.append(reinterpret_cast<const char*>(&c), 4); dst.append(reinterpret_cast<const char*>(&c), 4);
@ -143,38 +200,13 @@ CUnicode::UTF8ToUTF32(const CString& src)
} }
CString CString
CUnicode::UCS2ToUTF8(const CString& src) CUnicode::UTF8ToText(const CString& src, bool* errors)
{ {
UInt32 n = src.size() >> 1; // default to success
return doUCS2ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n); resetError(errors);
}
CString
CUnicode::UCS4ToUTF8(const CString& src)
{
UInt32 n = src.size() >> 2;
return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
}
CString
CUnicode::UTF16ToUTF8(const CString& src)
{
UInt32 n = src.size() >> 1;
return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
}
CString
CUnicode::UTF32ToUTF8(const CString& src)
{
UInt32 n = src.size() >> 2;
return doUTF32ToUTF8(reinterpret_cast<const UInt8*>(src.data()), n);
}
CString
CUnicode::UTF8ToText(const CString& src)
{
// convert to wide char // convert to wide char
wchar_t* tmp = UTF8ToWideChar(src); wchar_t* tmp = UTF8ToWideChar(src, errors);
// get length of multibyte string // get length of multibyte string
size_t len = 0; size_t len = 0;
@ -185,6 +217,7 @@ CUnicode::UTF8ToText(const CString& src)
size_t mblen = wcrtomb(mbc, *scan, &state); size_t mblen = wcrtomb(mbc, *scan, &state);
if (mblen == -1) { if (mblen == -1) {
// unconvertable character // unconvertable character
setError(errors);
len += 1; len += 1;
} }
else { else {
@ -225,34 +258,116 @@ CUnicode::UTF8ToText(const CString& src)
} }
// Converts a byte string holding UCS-2 characters to UTF-8.  The
// character count is the byte count divided by two, so a trailing odd
// byte is not passed on.  errors is forwarded to doUCS2ToUTF8().
CString
CUnicode::UCS2ToUTF8(const CString& src, bool* errors)
{
    const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
    UInt32 count       = src.size() / 2;
    return doUCS2ToUTF8(bytes, count, errors);
}
// Converts a byte string holding UCS-4 characters to UTF-8.  The
// character count is the byte count divided by four, so any trailing
// partial character is not passed on.  errors is forwarded to
// doUCS4ToUTF8().
CString
CUnicode::UCS4ToUTF8(const CString& src, bool* errors)
{
    const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
    UInt32 count       = src.size() / 4;
    return doUCS4ToUTF8(bytes, count, errors);
}
// Converts a byte string holding UTF-16 code units to UTF-8.  The
// unit count is the byte count divided by two, so a trailing odd byte
// is not passed on.  errors is forwarded to doUTF16ToUTF8().
CString
CUnicode::UTF16ToUTF8(const CString& src, bool* errors)
{
    const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
    UInt32 count       = src.size() / 2;
    return doUTF16ToUTF8(bytes, count, errors);
}
// Converts a byte string holding UTF-32 characters to UTF-8.  The
// character count is the byte count divided by four, so any trailing
// partial character is not passed on.  errors is forwarded to
// doUTF32ToUTF8().
CString
CUnicode::UTF32ToUTF8(const CString& src, bool* errors)
{
    const UInt8* bytes = reinterpret_cast<const UInt8*>(src.data());
    UInt32 count       = src.size() / 4;
    return doUTF32ToUTF8(bytes, count, errors);
}
// Converts src from the current locale's multibyte encoding to UTF-8.
//
// Bytes that cannot be decoded are replaced with U+FFFD: one
// replacement per invalid byte, and a single replacement for an
// incomplete sequence at the end of the input.  Conversion stops at
// the first nul byte in src.
//
// If errors is not NULL it receives true when any byte could not be
// decoded or when the wide-to-UTF-8 step reported a problem, and
// false otherwise.
CString
CUnicode::textToUTF8(const CString& src, bool* errors)
{
    // default to success
    resetError(errors);

    // decoding problems are tracked locally and merged at the end.
    // wideCharToUTF8() hands its errors argument to a converter that
    // resets the flag on entry, so writing decode errors straight to
    // *errors would lose them.
    bool decodeErrors = false;

    // every wide character produced below consumes at least one byte
    // of input, so the byte count (plus the terminator) bounds the
    // output length and a separate counting pass is unnecessary.
    UInt32 n     = src.size();
    wchar_t* wcs = new wchar_t[n + 1];
    wchar_t* dst = wcs;

    // convert multibyte to wide char
    mbstate_t state;
    memset(&state, 0, sizeof(state));
    const char* scan = src.c_str();
    while (n > 0 && *scan != 0) {
        const size_t mblen = mbrtowc(dst, scan, n, &state);
        if (mblen == static_cast<size_t>(-2)) {
            // incomplete last character.  convert to unknown character
            // and stop;  there is nothing left to decode.
            decodeErrors = true;
            *dst++       = static_cast<wchar_t>(0xfffd);
            n            = 0;
        }
        else if (mblen == static_cast<size_t>(-1)) {
            // invalid character.  emit one unknown character and
            // start again at the next byte.  the conversion state is
            // undefined after a failure so it must be reinitialized.
            decodeErrors = true;
            *dst++       = static_cast<wchar_t>(0xfffd);
            scan        += 1;
            n           -= 1;
            memset(&state, 0, sizeof(state));
        }
        else if (mblen == 0) {
            // decoded a wide nul.  treat it as the end of the string;
            // continuing would never advance scan.
            n = 0;
        }
        else {
            // normal character
            ++dst;
            scan += mblen;
            n    -= static_cast<UInt32>(mblen);
        }
    }
    *dst = L'\0';

    // convert to UTF8
    bool encodeErrors = false;
    CString utf8 = wideCharToUTF8(wcs, &encodeErrors);

    // clean up
    delete[] wcs;

    // report the union of decoding and encoding problems
    if (decodeErrors || encodeErrors) {
        setError(errors);
    }

    return utf8;
}
wchar_t* wchar_t*
CUnicode::UTF8ToWideChar(const CString& src) CUnicode::UTF8ToWideChar(const CString& src, bool* errors)
{ {
// convert to platform's wide character encoding. // convert to platform's wide character encoding.
// note -- this must include a wide nul character (independent of // note -- this must include a wide nul character (independent of
@ -272,21 +387,26 @@ CUnicode::UTF8ToWideChar(const CString& src)
} }
// Converts a nul terminated wchar_t string, in the platform's native
// wide character encoding, to UTF-8.  The length handed to the
// converter is wcslen(src), i.e. it excludes the terminating wide nul.
// errors is forwarded unchanged to the platform specific converter.
CString
CUnicode::wideCharToUTF8(const wchar_t* src, bool* errors)
{
    // convert from platform's wide character encoding.
#if WINDOWS_LIKE
    // the class declares doUTF16ToUTF8() but no doUCS16ToUTF8(), so
    // the UTF-16 converter is the one to call here.
    return doUTF16ToUTF8(reinterpret_cast<const UInt8*>(src),
                         static_cast<UInt32>(wcslen(src)), errors);
#elif UNIX_LIKE
    return doUCS4ToUTF8(reinterpret_cast<const UInt8*>(src),
                        static_cast<UInt32>(wcslen(src)), errors);
#endif
}
CString CString
CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n) CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{ {
// default to success
resetError(errors);
// make some space // make some space
CString dst; CString dst;
dst.reserve(n); dst.reserve(n);
@ -294,7 +414,7 @@ CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n)
// convert each character // convert each character
for (; n > 0; data += 2, --n) { for (; n > 0; data += 2, --n) {
UInt32 c = decode16(data); UInt32 c = decode16(data);
toUTF8(dst, c); toUTF8(dst, c, errors);
} }
// remove extra trailing nul // remove extra trailing nul
@ -306,8 +426,11 @@ CUnicode::doUCS2ToUTF8(const UInt8* data, UInt32 n)
} }
CString CString
CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n) CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{ {
// default to success
resetError(errors);
// make some space // make some space
CString dst; CString dst;
dst.reserve(n); dst.reserve(n);
@ -315,7 +438,7 @@ CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n)
// convert each character // convert each character
for (; n > 0; data += 4, --n) { for (; n > 0; data += 4, --n) {
UInt32 c = decode32(data); UInt32 c = decode32(data);
toUTF8(dst, c); toUTF8(dst, c, errors);
} }
// remove extra trailing nul // remove extra trailing nul
@ -327,8 +450,11 @@ CUnicode::doUCS4ToUTF8(const UInt8* data, UInt32 n)
} }
CString CString
CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n) CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{ {
// default to success
resetError(errors);
// make some space // make some space
CString dst; CString dst;
dst.reserve(n); dst.reserve(n);
@ -337,11 +463,12 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
for (; n > 0; data += 2, --n) { for (; n > 0; data += 2, --n) {
UInt32 c = decode16(data); UInt32 c = decode16(data);
if (c < 0x0000d800 || c > 0x0000dfff) { if (c < 0x0000d800 || c > 0x0000dfff) {
toUTF8(dst, c); toUTF8(dst, c, errors);
} }
else if (n == 1) { else if (n == 1) {
// error -- missing second word // error -- missing second word
toUTF8(dst, s_replacement); setError(errors);
toUTF8(dst, s_replacement, NULL);
} }
else if (c >= 0x0000d800 && c <= 0x0000dbff) { else if (c >= 0x0000d800 && c <= 0x0000dbff) {
UInt32 c2 = decode16(data); UInt32 c2 = decode16(data);
@ -349,16 +476,18 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
--n; --n;
if (c2 < 0x0000dc00 || c2 > 0x0000dfff) { if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
// error -- [d800,dbff] not followed by [dc00,dfff] // error -- [d800,dbff] not followed by [dc00,dfff]
toUTF8(dst, s_replacement); setError(errors);
toUTF8(dst, s_replacement, NULL);
} }
else { else {
c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000; c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
toUTF8(dst, c); toUTF8(dst, c, errors);
} }
} }
else { else {
// error -- [dc00,dfff] without leading [d800,dbff] // error -- [dc00,dfff] without leading [d800,dbff]
toUTF8(dst, s_replacement); setError(errors);
toUTF8(dst, s_replacement, NULL);
} }
} }
@ -371,8 +500,11 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
} }
CString CString
CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n) CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n, bool* errors)
{ {
// default to success
resetError(errors);
// make some space // make some space
CString dst; CString dst;
dst.reserve(n); dst.reserve(n);
@ -381,9 +513,10 @@ CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
for (; n > 0; data += 4, --n) { for (; n > 0; data += 4, --n) {
UInt32 c = decode32(data); UInt32 c = decode32(data);
if (c >= 0x00110000) { if (c >= 0x00110000) {
setError(errors);
c = s_replacement; c = s_replacement;
} }
toUTF8(dst, c); toUTF8(dst, c, errors);
} }
// remove extra trailing nul // remove extra trailing nul
@ -571,12 +704,13 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
} }
void void
CUnicode::toUTF8(CString& dst, UInt32 c) CUnicode::toUTF8(CString& dst, UInt32 c, bool* errors)
{ {
UInt8 data[6]; UInt8 data[6];
// handle characters outside the valid range // handle characters outside the valid range
if (c >= 0x80000000) { if ((c >= 0x0000d800 && c <= 0x0000dfff) || c >= 0x80000000) {
setError(errors);
c = s_replacement; c = s_replacement;
} }

View File

@ -7,39 +7,50 @@
class CUnicode { class CUnicode {
public: public:
static CString UTF8ToUCS2(const CString&); // returns true iff the string contains a valid sequence of UTF-8
static CString UTF8ToUCS4(const CString&); // encoded characters.
static CString UTF8ToUTF16(const CString&); static bool isUTF8(const CString&);
static CString UTF8ToUTF32(const CString&);
static CString UCS2ToUTF8(const CString&); // convert from UTF-8 encoding to other encodings. if errors is
static CString UCS4ToUTF8(const CString&); // not NULL then it gets true if any characters could not be
static CString UTF16ToUTF8(const CString&); // encoded in the target encoding and false otherwise. note
static CString UTF32ToUTF8(const CString&); // that decoding errors do not set errors to error. UTF8ToText()
// converts to the current locale's (multibyte) encoding.
static CString UTF8ToUCS2(const CString&, bool* errors = NULL);
static CString UTF8ToUCS4(const CString&, bool* errors = NULL);
static CString UTF8ToUTF16(const CString&, bool* errors = NULL);
static CString UTF8ToUTF32(const CString&, bool* errors = NULL);
static CString UTF8ToText(const CString&, bool* errors = NULL);
// convert UTF-8 to/from the current locale's encoding // convert from some encoding to UTF-8. if errors is not NULL
static CString UTF8ToText(const CString&); // then it gets true if any characters could not be decoded and
static CString textToUTF8(const CString&); // false otherwise. textToUTF8() converts from the current
// locale's (multibyte) encoding.
static CString UCS2ToUTF8(const CString&, bool* errors = NULL);
static CString UCS4ToUTF8(const CString&, bool* errors = NULL);
static CString UTF16ToUTF8(const CString&, bool* errors = NULL);
static CString UTF32ToUTF8(const CString&, bool* errors = NULL);
static CString textToUTF8(const CString&, bool* errors = NULL);
private: private:
// convert UTF8 to nul terminated wchar_t string (using whatever // convert UTF8 to nul terminated wchar_t string (using whatever
// encoding is native to the platform). caller must delete[] // encoding is native to the platform). caller must delete[]
// the returned string. // the returned string.
static wchar_t* UTF8ToWideChar(const CString&); static wchar_t* UTF8ToWideChar(const CString&, bool* errors);
// convert nul terminated wchar_t string (in platform's native // convert nul terminated wchar_t string (in platform's native
// encoding) to UTF8. // encoding) to UTF8.
static CString wideCharToUTF8(const wchar_t*); static CString wideCharToUTF8(const wchar_t*, bool* errors);
// internal conversion to UTF8 // internal conversion to UTF8
static CString doUCS2ToUTF8(const UInt8* src, UInt32 n); static CString doUCS2ToUTF8(const UInt8* src, UInt32 n, bool* errors);
static CString doUCS4ToUTF8(const UInt8* src, UInt32 n); static CString doUCS4ToUTF8(const UInt8* src, UInt32 n, bool* errors);
static CString doUTF16ToUTF8(const UInt8* src, UInt32 n); static CString doUTF16ToUTF8(const UInt8* src, UInt32 n, bool* errors);
static CString doUTF32ToUTF8(const UInt8* src, UInt32 n); static CString doUTF32ToUTF8(const UInt8* src, UInt32 n, bool* errors);
// convert characters to/from UTF8 // convert characters to/from UTF8
static UInt32 fromUTF8(const UInt8*& src, UInt32& size); static UInt32 fromUTF8(const UInt8*& src, UInt32& size);
static void toUTF8(CString& dst, UInt32 c); static void toUTF8(CString& dst, UInt32 c, bool* errors);
private: private:
static UInt32 s_invalid; static UInt32 s_invalid;

View File

@ -151,22 +151,14 @@ CXWindowsClipboard::addSimpleRequest(Window requestor,
type = getTimestampData(data, &format); type = getTimestampData(data, &format);
} }
else { else {
char* name = XGetAtomName(m_display, target);
log((CLOG_INFO "request target: %d %s", target, name));
XFree(name);
IXWindowsClipboardConverter* converter = getConverter(target); IXWindowsClipboardConverter* converter = getConverter(target);
if (converter != NULL) { if (converter != NULL) {
log((CLOG_INFO "found converter"));
IClipboard::EFormat clipboardFormat = converter->getFormat(); IClipboard::EFormat clipboardFormat = converter->getFormat();
log((CLOG_INFO "clipboard format: %d", clipboardFormat));
if (m_added[clipboardFormat]) { if (m_added[clipboardFormat]) {
log((CLOG_INFO "added"));
try { try {
data = converter->fromIClipboard(m_data[clipboardFormat]); data = converter->fromIClipboard(m_data[clipboardFormat]);
format = converter->getDataSize(); format = converter->getDataSize();
type = converter->getAtom(); type = converter->getAtom();
log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str()));
} }
catch (...) { catch (...) {
// ignore -- cannot convert // ignore -- cannot convert
@ -535,17 +527,10 @@ CXWindowsClipboard::icccmFillCache()
// add to clipboard and note we've done it // add to clipboard and note we've done it
IClipboard::EFormat format = converter->getFormat(); IClipboard::EFormat format = converter->getFormat();
try {
m_data[format] = converter->toIClipboard(targetData); m_data[format] = converter->toIClipboard(targetData);
if (!m_data[format].empty()) {
m_added[format] = true; m_added[format] = true;
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
} }
}
catch (...) {
// ignore -- could not convert data
}
}
} }
bool bool
@ -799,17 +784,10 @@ CXWindowsClipboard::motifFillCache()
// add to clipboard and note we've done it // add to clipboard and note we've done it
IClipboard::EFormat format = converter->getFormat(); IClipboard::EFormat format = converter->getFormat();
try {
m_data[format] = converter->toIClipboard(targetData); m_data[format] = converter->toIClipboard(targetData);
if (!m_data[format].empty()) {
m_added[format] = true; m_added[format] = true;
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target)); log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
} }
}
catch (...) {
// ignore -- could not convert data
}
}
} }
IClipboard::Time IClipboard::Time

View File

@ -44,5 +44,17 @@ CXWindowsClipboardTextConverter::fromIClipboard(const CString& data) const
// Converts clipboard text in the locale's encoding to UTF-8 for the
// internal clipboard.
CString
CXWindowsClipboardTextConverter::toIClipboard(const CString& data) const
{
    // convert to UTF-8, noting whether anything failed to decode
    bool hadErrors;
    CString converted = CUnicode::textToUTF8(data, &hadErrors);

    // a clean conversion, or data that is not itself valid UTF-8,
    // yields the converted text.
    if (!hadErrors || !CUnicode::isUTF8(data)) {
        return converted;
    }

    // there were decoding errors but the raw bytes look like UTF-8.
    // to support old applications that don't understand UTF-8 but can
    // report the exact binary UTF-8 representation, use the data as is.
    return data;
}