checkpoint. more UTF8 clipboard testing.

This commit is contained in:
crs 2002-07-23 09:33:50 +00:00
parent fcd99c9510
commit 16cc05d56b
3 changed files with 185 additions and 79 deletions

View File

@ -1,4 +1,5 @@
#include "CUnicode.h"
#include <limits.h>
#include <string.h>
//
@ -39,7 +40,8 @@ decode32(const UInt8* n)
// CUnicode
//
UInt32 CUnicode::s_invalid = 0x0000ffff;
UInt32 CUnicode::s_invalid = 0x0000ffff;
UInt32 CUnicode::s_replacement = 0x0000fffd;
CString
CUnicode::UTF8ToUCS2(const CString& src)
@ -54,10 +56,11 @@ CUnicode::UTF8ToUCS2(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) {
UInt32 c = fromUTF8(data, n);
if (c != s_invalid && c < 0x00010000) {
UInt16 ucs2 = static_cast<UInt16>(c);
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
if (c == s_invalid || c >= 0x00010000) {
c = s_replacement;
}
UInt16 ucs2 = static_cast<UInt16>(c);
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
}
return dst;
@ -76,9 +79,10 @@ CUnicode::UTF8ToUCS4(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) {
UInt32 c = fromUTF8(data, n);
if (c != s_invalid) {
dst.append(reinterpret_cast<const char*>(&c), 4);
if (c == s_invalid) {
c = s_replacement;
}
dst.append(reinterpret_cast<const char*>(&c), 4);
}
return dst;
@ -97,18 +101,19 @@ CUnicode::UTF8ToUTF16(const CString& src)
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) {
UInt32 c = fromUTF8(data, n);
if (c != s_invalid && c < 0x0010ffff) {
if (c < 0x00010000) {
UInt16 ucs2 = static_cast<UInt16>(c);
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
}
else {
c -= 0x00010000;
UInt16 utf16h = static_cast<UInt16>(c >> 10) + 0xd800;
UInt16 utf16l = (static_cast<UInt16>(c) & 0x03ff) + 0xdc00;
dst.append(reinterpret_cast<const char*>(&utf16h), 2);
dst.append(reinterpret_cast<const char*>(&utf16l), 2);
}
if (c == s_invalid || c >= 0x00110000) {
c = s_replacement;
}
if (c < 0x00010000) {
UInt16 ucs2 = static_cast<UInt16>(c);
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
}
else {
c -= 0x00010000;
UInt16 utf16h = static_cast<UInt16>(c >> 10) + 0xd800;
UInt16 utf16l = (static_cast<UInt16>(c) & 0x03ff) + 0xdc00;
dst.append(reinterpret_cast<const char*>(&utf16h), 2);
dst.append(reinterpret_cast<const char*>(&utf16l), 2);
}
}
@ -118,8 +123,23 @@ CUnicode::UTF8ToUTF16(const CString& src)
CString
CUnicode::UTF8ToUTF32(const CString& src)
{
// FIXME -- should ensure dst has no characters over U-0010FFFF
return UTF8ToUCS4(src);
// get size of input string and reserve some space in output.
// include UTF8's nul terminator.
UInt32 n = src.size() + 1;
CString dst;
dst.reserve(4 * n);
// convert each character
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
while (n > 0) {
UInt32 c = fromUTF8(data, n);
if (c == s_invalid || c >= 0x00110000) {
c = s_replacement;
}
dst.append(reinterpret_cast<const char*>(&c), 4);
}
return dst;
}
CString
@ -157,24 +177,48 @@ CUnicode::UTF8ToText(const CString& src)
wchar_t* tmp = UTF8ToWideChar(src);
// get length of multibyte string
size_t len = 0;
char mbc[MB_LEN_MAX];
mbstate_t state;
memset(&state, 0, sizeof(state));
const wchar_t* scratch = tmp;
size_t len = wcsrtombs(NULL, &scratch, 0, &state);
if (len == (size_t)-1) {
// invalid character in src
delete[] tmp;
return CString();
for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
size_t mblen = wcrtomb(mbc, *scan, &state);
if (mblen == -1) {
// unconvertable character
len += 1;
}
else {
len += mblen;
}
}
// check if state is in initial state. if not then count the
// bytes for returning it to the initial state.
if (mbsinit(&state) == 0) {
len += wcrtomb(mbc, L'\0', &state) - 1;
}
assert(mbsinit(&state) != 0);
// allocate multibyte string
char* mbs = new char[len + 1];
// convert to multibyte
scratch = tmp;
char* dst = new char[len + 1];
wcsrtombs(dst, &scratch, len + 1, &state);
CString text(dst);
char* dst = mbs;
for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
size_t mblen = wcrtomb(dst, *scan, &state);
if (mblen == -1) {
// unconvertable character
*dst++ = '?';
}
else {
dst += len;
}
}
*dst = '\0';
CString text(mbs);
// clean up
delete[] dst;
delete[] mbs;
delete[] tmp;
return text;
@ -297,6 +341,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
}
else if (n == 1) {
// error -- missing second word
toUTF8(dst, s_replacement);
}
else if (c >= 0x0000d800 && c <= 0x0000dbff) {
UInt32 c2 = decode16(data);
@ -304,6 +349,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
--n;
if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
// error -- [d800,dbff] not followed by [dc00,dfff]
toUTF8(dst, s_replacement);
}
else {
c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
@ -312,6 +358,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
}
else {
// error -- [dc00,dfff] without leading [d800,dbff]
toUTF8(dst, s_replacement);
}
}
@ -326,8 +373,25 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
CString
CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
{
// FIXME -- should check that src has no characters over U-0010FFFF
return doUCS4ToUTF8(data, n);
// make some space
CString dst;
dst.reserve(n);
// convert each character
for (; n > 0; data += 4, --n) {
UInt32 c = decode32(data);
if (c >= 0x00110000) {
c = s_replacement;
}
toUTF8(dst, c);
}
// remove extra trailing nul
if (dst.size() > 0 && dst[dst.size() - 1] == '\0') {
dst.resize(dst.size() - 1);
}
return dst;
}
UInt32
@ -433,10 +497,54 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
assert(0 && "invalid size");
}
// check that all bytes after the first have the pattern 10xxxxxx.
// truncated sequences are treated as a single malformed character.
bool truncated = false;
switch (size) {
case 6:
if ((data[5] & 0xc0) != 0x80) {
truncated = true;
size = 5;
}
// fall through
case 5:
if ((data[4] & 0xc0) != 0x80) {
truncated = true;
size = 4;
}
// fall through
case 4:
if ((data[3] & 0xc0) != 0x80) {
truncated = true;
size = 3;
}
// fall through
case 3:
if ((data[2] & 0xc0) != 0x80) {
truncated = true;
size = 2;
}
// fall through
case 2:
if ((data[1] & 0xc0) != 0x80) {
truncated = true;
size = 1;
}
}
// update parameters
data += size;
n -= size;
// invalid if sequence was truncated
if (truncated) {
return s_invalid;
}
// check for characters that didn't use the smallest possible encoding
static UInt32 s_minChar[] = {
0,
@ -451,29 +559,11 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
return s_invalid;
}
// check that all bytes after the first have the pattern 10xxxxxx.
UInt8 a = 0x80;
switch (size) {
case 6:
a |= data[5];
// fall through
case 5:
a |= data[4];
// fall through
case 4:
a |= data[3];
// fall through
case 3:
a |= data[2];
// fall through
case 2:
a |= data[1];
// check for characters not in ISO-10646
if (c >= 0x0000d800 && c <= 0x0000dfff) {
return s_invalid;
}
if ((a & 0xc0) != 0x80) {
if (c >= 0x0000fffe && c <= 0x0000ffff) {
return s_invalid;
}
@ -481,10 +571,16 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
}
void
CUnicode::toUTF8(CString& dst, const UInt32 c)
CUnicode::toUTF8(CString& dst, UInt32 c)
{
UInt8 data[6];
// handle characters outside the valid range
if (c >= 0x80000000) {
c = s_replacement;
}
// convert to UTF-8
if (c < 0x00000080) {
data[0] = static_cast<UInt8>(c);
dst.append(reinterpret_cast<char*>(data), 1);
@ -525,6 +621,6 @@ CUnicode::toUTF8(CString& dst, const UInt32 c)
dst.append(reinterpret_cast<char*>(data), 6);
}
else {
// invalid character
assert(0 && "character out of range");
}
}

View File

@ -39,10 +39,11 @@ private:
// convert characters to/from UTF8
static UInt32 fromUTF8(const UInt8*& src, UInt32& size);
static void toUTF8(CString& dst, const UInt32 c);
static void toUTF8(CString& dst, UInt32 c);
private:
static UInt32 s_invalid;
static UInt32 s_replacement;
};
#endif

View File

@ -161,11 +161,16 @@ log((CLOG_INFO "found converter"));
log((CLOG_INFO "clipboard format: %d", clipboardFormat));
if (m_added[clipboardFormat]) {
log((CLOG_INFO "added"));
type = converter->getAtom();
format = converter->getDataSize();
data = converter->fromIClipboard(m_data[clipboardFormat]);
try {
data = converter->fromIClipboard(m_data[clipboardFormat]);
format = converter->getDataSize();
type = converter->getAtom();
log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str()));
}
catch (...) {
// ignore -- cannot convert
}
}
}
}
@ -529,15 +534,17 @@ CXWindowsClipboard::icccmFillCache()
}
// add to clipboard and note we've done it
m_data[converter->getFormat()] = converter->toIClipboard(targetData);
m_added[converter->getFormat()] = true;
// XXX
char* name = XGetAtomName(m_display, target);
log((CLOG_INFO "src atom: %d %s", target, name));
XFree(name);
log((CLOG_INFO "src data size: %d", targetData.size()));
log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
IClipboard::EFormat format = converter->getFormat();
try {
m_data[format] = converter->toIClipboard(targetData);
if (!m_data[format].empty()) {
m_added[format] = true;
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
}
}
catch (...) {
// ignore -- could not convert data
}
}
}
@ -791,15 +798,17 @@ CXWindowsClipboard::motifFillCache()
targetData.erase(length);
// add to clipboard and note we've done it
m_data[converter->getFormat()] = converter->toIClipboard(targetData);
m_added[converter->getFormat()] = true;
// XXX
char* name = XGetAtomName(m_display, target);
log((CLOG_INFO "src atom: %d %s", target, name));
XFree(name);
log((CLOG_INFO "src data size: %d", targetData.size()));
log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
IClipboard::EFormat format = converter->getFormat();
try {
m_data[format] = converter->toIClipboard(targetData);
if (!m_data[format].empty()) {
m_added[format] = true;
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
}
}
catch (...) {
// ignore -- could not convert data
}
}
}