checkpoint. more UTF8 clipboard testing.
This commit is contained in:
parent
fcd99c9510
commit
16cc05d56b
|
@ -1,4 +1,5 @@
|
||||||
#include "CUnicode.h"
|
#include "CUnicode.h"
|
||||||
|
#include <limits.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -40,6 +41,7 @@ decode32(const UInt8* n)
|
||||||
//
|
//
|
||||||
|
|
||||||
UInt32 CUnicode::s_invalid = 0x0000ffff;
|
UInt32 CUnicode::s_invalid = 0x0000ffff;
|
||||||
|
UInt32 CUnicode::s_replacement = 0x0000fffd;
|
||||||
|
|
||||||
CString
|
CString
|
||||||
CUnicode::UTF8ToUCS2(const CString& src)
|
CUnicode::UTF8ToUCS2(const CString& src)
|
||||||
|
@ -54,11 +56,12 @@ CUnicode::UTF8ToUCS2(const CString& src)
|
||||||
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
||||||
while (n > 0) {
|
while (n > 0) {
|
||||||
UInt32 c = fromUTF8(data, n);
|
UInt32 c = fromUTF8(data, n);
|
||||||
if (c != s_invalid && c < 0x00010000) {
|
if (c == s_invalid || c >= 0x00010000) {
|
||||||
|
c = s_replacement;
|
||||||
|
}
|
||||||
UInt16 ucs2 = static_cast<UInt16>(c);
|
UInt16 ucs2 = static_cast<UInt16>(c);
|
||||||
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
@ -76,9 +79,10 @@ CUnicode::UTF8ToUCS4(const CString& src)
|
||||||
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
||||||
while (n > 0) {
|
while (n > 0) {
|
||||||
UInt32 c = fromUTF8(data, n);
|
UInt32 c = fromUTF8(data, n);
|
||||||
if (c != s_invalid) {
|
if (c == s_invalid) {
|
||||||
dst.append(reinterpret_cast<const char*>(&c), 4);
|
c = s_replacement;
|
||||||
}
|
}
|
||||||
|
dst.append(reinterpret_cast<const char*>(&c), 4);
|
||||||
}
|
}
|
||||||
|
|
||||||
return dst;
|
return dst;
|
||||||
|
@ -97,7 +101,9 @@ CUnicode::UTF8ToUTF16(const CString& src)
|
||||||
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
||||||
while (n > 0) {
|
while (n > 0) {
|
||||||
UInt32 c = fromUTF8(data, n);
|
UInt32 c = fromUTF8(data, n);
|
||||||
if (c != s_invalid && c < 0x0010ffff) {
|
if (c == s_invalid || c >= 0x00110000) {
|
||||||
|
c = s_replacement;
|
||||||
|
}
|
||||||
if (c < 0x00010000) {
|
if (c < 0x00010000) {
|
||||||
UInt16 ucs2 = static_cast<UInt16>(c);
|
UInt16 ucs2 = static_cast<UInt16>(c);
|
||||||
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
dst.append(reinterpret_cast<const char*>(&ucs2), 2);
|
||||||
|
@ -110,7 +116,6 @@ CUnicode::UTF8ToUTF16(const CString& src)
|
||||||
dst.append(reinterpret_cast<const char*>(&utf16l), 2);
|
dst.append(reinterpret_cast<const char*>(&utf16l), 2);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
return dst;
|
return dst;
|
||||||
}
|
}
|
||||||
|
@ -118,8 +123,23 @@ CUnicode::UTF8ToUTF16(const CString& src)
|
||||||
CString
|
CString
|
||||||
CUnicode::UTF8ToUTF32(const CString& src)
|
CUnicode::UTF8ToUTF32(const CString& src)
|
||||||
{
|
{
|
||||||
// FIXME -- should ensure dst has no characters over U-0010FFFF
|
// get size of input string and reserve some space in output.
|
||||||
return UTF8ToUCS4(src);
|
// include UTF8's nul terminator.
|
||||||
|
UInt32 n = src.size() + 1;
|
||||||
|
CString dst;
|
||||||
|
dst.reserve(4 * n);
|
||||||
|
|
||||||
|
// convert each character
|
||||||
|
const UInt8* data = reinterpret_cast<const UInt8*>(src.c_str());
|
||||||
|
while (n > 0) {
|
||||||
|
UInt32 c = fromUTF8(data, n);
|
||||||
|
if (c == s_invalid || c >= 0x00110000) {
|
||||||
|
c = s_replacement;
|
||||||
|
}
|
||||||
|
dst.append(reinterpret_cast<const char*>(&c), 4);
|
||||||
|
}
|
||||||
|
|
||||||
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
CString
|
CString
|
||||||
|
@ -157,24 +177,48 @@ CUnicode::UTF8ToText(const CString& src)
|
||||||
wchar_t* tmp = UTF8ToWideChar(src);
|
wchar_t* tmp = UTF8ToWideChar(src);
|
||||||
|
|
||||||
// get length of multibyte string
|
// get length of multibyte string
|
||||||
|
size_t len = 0;
|
||||||
|
char mbc[MB_LEN_MAX];
|
||||||
mbstate_t state;
|
mbstate_t state;
|
||||||
memset(&state, 0, sizeof(state));
|
memset(&state, 0, sizeof(state));
|
||||||
const wchar_t* scratch = tmp;
|
for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
|
||||||
size_t len = wcsrtombs(NULL, &scratch, 0, &state);
|
size_t mblen = wcrtomb(mbc, *scan, &state);
|
||||||
if (len == (size_t)-1) {
|
if (mblen == -1) {
|
||||||
// invalid character in src
|
// unconvertable character
|
||||||
delete[] tmp;
|
len += 1;
|
||||||
return CString();
|
}
|
||||||
|
else {
|
||||||
|
len += mblen;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check if state is in initial state. if not then count the
|
||||||
|
// bytes for returning it to the initial state.
|
||||||
|
if (mbsinit(&state) == 0) {
|
||||||
|
len += wcrtomb(mbc, L'\0', &state) - 1;
|
||||||
|
}
|
||||||
|
assert(mbsinit(&state) != 0);
|
||||||
|
|
||||||
|
// allocate multibyte string
|
||||||
|
char* mbs = new char[len + 1];
|
||||||
|
|
||||||
// convert to multibyte
|
// convert to multibyte
|
||||||
scratch = tmp;
|
char* dst = mbs;
|
||||||
char* dst = new char[len + 1];
|
for (const wchar_t* scan = tmp; *scan != 0; ++scan) {
|
||||||
wcsrtombs(dst, &scratch, len + 1, &state);
|
size_t mblen = wcrtomb(dst, *scan, &state);
|
||||||
CString text(dst);
|
if (mblen == -1) {
|
||||||
|
// unconvertable character
|
||||||
|
*dst++ = '?';
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
dst += len;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
*dst = '\0';
|
||||||
|
CString text(mbs);
|
||||||
|
|
||||||
// clean up
|
// clean up
|
||||||
delete[] dst;
|
delete[] mbs;
|
||||||
delete[] tmp;
|
delete[] tmp;
|
||||||
|
|
||||||
return text;
|
return text;
|
||||||
|
@ -297,6 +341,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
|
||||||
}
|
}
|
||||||
else if (n == 1) {
|
else if (n == 1) {
|
||||||
// error -- missing second word
|
// error -- missing second word
|
||||||
|
toUTF8(dst, s_replacement);
|
||||||
}
|
}
|
||||||
else if (c >= 0x0000d800 && c <= 0x0000dbff) {
|
else if (c >= 0x0000d800 && c <= 0x0000dbff) {
|
||||||
UInt32 c2 = decode16(data);
|
UInt32 c2 = decode16(data);
|
||||||
|
@ -304,6 +349,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
|
||||||
--n;
|
--n;
|
||||||
if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
|
if (c2 < 0x0000dc00 || c2 > 0x0000dfff) {
|
||||||
// error -- [d800,dbff] not followed by [dc00,dfff]
|
// error -- [d800,dbff] not followed by [dc00,dfff]
|
||||||
|
toUTF8(dst, s_replacement);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
|
c = (((c - 0x0000d800) << 10) | (c2 - 0x0000dc00)) + 0x00010000;
|
||||||
|
@ -312,6 +358,7 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// error -- [dc00,dfff] without leading [d800,dbff]
|
// error -- [dc00,dfff] without leading [d800,dbff]
|
||||||
|
toUTF8(dst, s_replacement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -326,8 +373,25 @@ CUnicode::doUTF16ToUTF8(const UInt8* data, UInt32 n)
|
||||||
CString
|
CString
|
||||||
CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
|
CUnicode::doUTF32ToUTF8(const UInt8* data, UInt32 n)
|
||||||
{
|
{
|
||||||
// FIXME -- should check that src has no characters over U-0010FFFF
|
// make some space
|
||||||
return doUCS4ToUTF8(data, n);
|
CString dst;
|
||||||
|
dst.reserve(n);
|
||||||
|
|
||||||
|
// convert each character
|
||||||
|
for (; n > 0; data += 4, --n) {
|
||||||
|
UInt32 c = decode32(data);
|
||||||
|
if (c >= 0x00110000) {
|
||||||
|
c = s_replacement;
|
||||||
|
}
|
||||||
|
toUTF8(dst, c);
|
||||||
|
}
|
||||||
|
|
||||||
|
// remove extra trailing nul
|
||||||
|
if (dst.size() > 0 && dst[dst.size() - 1] == '\0') {
|
||||||
|
dst.resize(dst.size() - 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return dst;
|
||||||
}
|
}
|
||||||
|
|
||||||
UInt32
|
UInt32
|
||||||
|
@ -433,10 +497,54 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
|
||||||
assert(0 && "invalid size");
|
assert(0 && "invalid size");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// check that all bytes after the first have the pattern 10xxxxxx.
|
||||||
|
// truncated sequences are treated as a single malformed character.
|
||||||
|
bool truncated = false;
|
||||||
|
switch (size) {
|
||||||
|
case 6:
|
||||||
|
if ((data[5] & 0xc0) != 0x80) {
|
||||||
|
truncated = true;
|
||||||
|
size = 5;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
|
||||||
|
case 5:
|
||||||
|
if ((data[4] & 0xc0) != 0x80) {
|
||||||
|
truncated = true;
|
||||||
|
size = 4;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
|
||||||
|
case 4:
|
||||||
|
if ((data[3] & 0xc0) != 0x80) {
|
||||||
|
truncated = true;
|
||||||
|
size = 3;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
|
||||||
|
case 3:
|
||||||
|
if ((data[2] & 0xc0) != 0x80) {
|
||||||
|
truncated = true;
|
||||||
|
size = 2;
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
|
|
||||||
|
case 2:
|
||||||
|
if ((data[1] & 0xc0) != 0x80) {
|
||||||
|
truncated = true;
|
||||||
|
size = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// update parameters
|
// update parameters
|
||||||
data += size;
|
data += size;
|
||||||
n -= size;
|
n -= size;
|
||||||
|
|
||||||
|
// invalid if sequence was truncated
|
||||||
|
if (truncated) {
|
||||||
|
return s_invalid;
|
||||||
|
}
|
||||||
|
|
||||||
// check for characters that didn't use the smallest possible encoding
|
// check for characters that didn't use the smallest possible encoding
|
||||||
static UInt32 s_minChar[] = {
|
static UInt32 s_minChar[] = {
|
||||||
0,
|
0,
|
||||||
|
@ -451,29 +559,11 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
|
||||||
return s_invalid;
|
return s_invalid;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check that all bytes after the first have the pattern 10xxxxxx.
|
// check for characters not in ISO-10646
|
||||||
UInt8 a = 0x80;
|
if (c >= 0x0000d800 && c <= 0x0000dfff) {
|
||||||
switch (size) {
|
return s_invalid;
|
||||||
case 6:
|
|
||||||
a |= data[5];
|
|
||||||
// fall through
|
|
||||||
|
|
||||||
case 5:
|
|
||||||
a |= data[4];
|
|
||||||
// fall through
|
|
||||||
|
|
||||||
case 4:
|
|
||||||
a |= data[3];
|
|
||||||
// fall through
|
|
||||||
|
|
||||||
case 3:
|
|
||||||
a |= data[2];
|
|
||||||
// fall through
|
|
||||||
|
|
||||||
case 2:
|
|
||||||
a |= data[1];
|
|
||||||
}
|
}
|
||||||
if ((a & 0xc0) != 0x80) {
|
if (c >= 0x0000fffe && c <= 0x0000ffff) {
|
||||||
return s_invalid;
|
return s_invalid;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -481,10 +571,16 @@ CUnicode::fromUTF8(const UInt8*& data, UInt32& n)
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
CUnicode::toUTF8(CString& dst, const UInt32 c)
|
CUnicode::toUTF8(CString& dst, UInt32 c)
|
||||||
{
|
{
|
||||||
UInt8 data[6];
|
UInt8 data[6];
|
||||||
|
|
||||||
|
// handle characters outside the valid range
|
||||||
|
if (c >= 0x80000000) {
|
||||||
|
c = s_replacement;
|
||||||
|
}
|
||||||
|
|
||||||
|
// convert to UTF-8
|
||||||
if (c < 0x00000080) {
|
if (c < 0x00000080) {
|
||||||
data[0] = static_cast<UInt8>(c);
|
data[0] = static_cast<UInt8>(c);
|
||||||
dst.append(reinterpret_cast<char*>(data), 1);
|
dst.append(reinterpret_cast<char*>(data), 1);
|
||||||
|
@ -525,6 +621,6 @@ CUnicode::toUTF8(CString& dst, const UInt32 c)
|
||||||
dst.append(reinterpret_cast<char*>(data), 6);
|
dst.append(reinterpret_cast<char*>(data), 6);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
// invalid character
|
assert(0 && "character out of range");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,10 +39,11 @@ private:
|
||||||
|
|
||||||
// convert characters to/from UTF8
|
// convert characters to/from UTF8
|
||||||
static UInt32 fromUTF8(const UInt8*& src, UInt32& size);
|
static UInt32 fromUTF8(const UInt8*& src, UInt32& size);
|
||||||
static void toUTF8(CString& dst, const UInt32 c);
|
static void toUTF8(CString& dst, UInt32 c);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static UInt32 s_invalid;
|
static UInt32 s_invalid;
|
||||||
|
static UInt32 s_replacement;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -161,12 +161,17 @@ log((CLOG_INFO "found converter"));
|
||||||
log((CLOG_INFO "clipboard format: %d", clipboardFormat));
|
log((CLOG_INFO "clipboard format: %d", clipboardFormat));
|
||||||
if (m_added[clipboardFormat]) {
|
if (m_added[clipboardFormat]) {
|
||||||
log((CLOG_INFO "added"));
|
log((CLOG_INFO "added"));
|
||||||
type = converter->getAtom();
|
try {
|
||||||
format = converter->getDataSize();
|
|
||||||
data = converter->fromIClipboard(m_data[clipboardFormat]);
|
data = converter->fromIClipboard(m_data[clipboardFormat]);
|
||||||
|
format = converter->getDataSize();
|
||||||
|
type = converter->getAtom();
|
||||||
log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
|
log((CLOG_INFO " src: (%d) %s", m_data[clipboardFormat].size(), m_data[clipboardFormat].c_str()));
|
||||||
log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str()));
|
log((CLOG_INFO " dst: (%d) %s", data.size(), data.c_str()));
|
||||||
}
|
}
|
||||||
|
catch (...) {
|
||||||
|
// ignore -- cannot convert
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -529,16 +534,18 @@ CXWindowsClipboard::icccmFillCache()
|
||||||
}
|
}
|
||||||
|
|
||||||
// add to clipboard and note we've done it
|
// add to clipboard and note we've done it
|
||||||
m_data[converter->getFormat()] = converter->toIClipboard(targetData);
|
IClipboard::EFormat format = converter->getFormat();
|
||||||
m_added[converter->getFormat()] = true;
|
try {
|
||||||
// XXX
|
m_data[format] = converter->toIClipboard(targetData);
|
||||||
char* name = XGetAtomName(m_display, target);
|
if (!m_data[format].empty()) {
|
||||||
log((CLOG_INFO "src atom: %d %s", target, name));
|
m_added[format] = true;
|
||||||
XFree(name);
|
|
||||||
log((CLOG_INFO "src data size: %d", targetData.size()));
|
|
||||||
log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
|
|
||||||
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
|
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
catch (...) {
|
||||||
|
// ignore -- could not convert data
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool
|
bool
|
||||||
|
@ -791,16 +798,18 @@ CXWindowsClipboard::motifFillCache()
|
||||||
targetData.erase(length);
|
targetData.erase(length);
|
||||||
|
|
||||||
// add to clipboard and note we've done it
|
// add to clipboard and note we've done it
|
||||||
m_data[converter->getFormat()] = converter->toIClipboard(targetData);
|
IClipboard::EFormat format = converter->getFormat();
|
||||||
m_added[converter->getFormat()] = true;
|
try {
|
||||||
// XXX
|
m_data[format] = converter->toIClipboard(targetData);
|
||||||
char* name = XGetAtomName(m_display, target);
|
if (!m_data[format].empty()) {
|
||||||
log((CLOG_INFO "src atom: %d %s", target, name));
|
m_added[format] = true;
|
||||||
XFree(name);
|
|
||||||
log((CLOG_INFO "src data size: %d", targetData.size()));
|
|
||||||
log((CLOG_INFO "utf8 data size: %d", m_data[converter->getFormat()].size()));
|
|
||||||
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
|
log((CLOG_DEBUG " added format %d for target %d", converter->getFormat(), target));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
catch (...) {
|
||||||
|
// ignore -- could not convert data
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
IClipboard::Time
|
IClipboard::Time
|
||||||
|
|
Loading…
Reference in New Issue