From ba300f42bdbf1e033616ee8a8054d84613b55aca Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Tue, 15 May 2012 20:48:20 +0300 Subject: QChar: add isSurrogate() and isNonCharacter() to the public API + QChar::LastValidCodePoint enum value that supercede the UNICODE_LAST_CODEPOINT macro replace uses of hardcoded values with the new API; remove leftovers Change-Id: I1395c9840b85fcb6b08e241b131794a98773c952 Reviewed-by: Thiago Macieira --- src/corelib/codecs/qutfcodec.cpp | 7 +- src/corelib/io/qurlrecode.cpp | 15 +--- src/corelib/json/qjsonparser.cpp | 5 +- src/corelib/json/qjsonwriter.cpp | 3 +- src/corelib/tools/qchar.cpp | 100 ++++++++++++++++++------ src/corelib/tools/qchar.h | 19 +++-- src/corelib/tools/qunicodetables_p.h | 14 ---- src/corelib/xml/qxmlstream.cpp | 4 +- tests/auto/corelib/tools/qchar/tst_qchar.cpp | 8 +- tests/benchmarks/corelib/tools/qstring/main.cpp | 25 ++---- util/unicode/main.cpp | 31 ++------ 11 files changed, 112 insertions(+), 119 deletions(-) diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index c3d9dbbd31..0fb4bb6761 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -43,7 +43,6 @@ #include "qlist.h" #include "qendian.h" #include "qchar.h" -#include QT_BEGIN_NAMESPACE @@ -108,7 +107,7 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve *cursor++ = 0xc0 | ((uchar) (u >> 6)); } else { // is it one of the Unicode non-characters? - if (QUnicodeTables::isNonCharacter(u)) { + if (QChar::isNonCharacter(u)) { *cursor++ = replacement; ++ch; ++invalid; @@ -184,12 +183,12 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte bool nonCharacter; if (!headerdone && uc == 0xfeff) { // don't do anything, just skip the BOM - } else if (!(nonCharacter = QUnicodeTables::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc < 0x110000) { + } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { // surrogate pair Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { + } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { // error: overlong sequence, UTF16 surrogate or non-character *qch++ = replacement; ++invalid; diff --git a/src/corelib/io/qurlrecode.cpp b/src/corelib/io/qurlrecode.cpp index de9c5aa7db..3d985dbdc0 100644 --- a/src/corelib/io/qurlrecode.cpp +++ b/src/corelib/io/qurlrecode.cpp @@ -285,19 +285,6 @@ static void ensureDetached(QString &result, ushort *&output, const ushort *begin } } -static inline bool isUnicodeNonCharacter(uint ucs4) -{ - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. - // - // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, - // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and - // U+FDEF (inclusive) - - return (ucs4 & 0xfffe) == 0xfffe - || (ucs4 - 0xfdd0U) < 16; -} - // returns true if we performed an UTF-8 decoding static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *begin, const ushort *&input, const ushort *end, ushort decoded) @@ -370,7 +357,7 @@ static bool encodedUtf8ToUtf16(QString &result, ushort *&output, const ushort *b // we've decoded something; safety-check it if (uc < min_uc) return false; - if (isUnicodeNonCharacter(uc) || (uc >= 0xD800 && uc <= 0xDFFF) || uc >= 0x110000) + if (QChar::isSurrogate(uc) || QChar::isNonCharacter(uc) || uc > QChar::LastValidCodePoint) return false; if (!QChar::requiresSurrogates(uc)) { diff --git a/src/corelib/json/qjsonparser.cpp b/src/corelib/json/qjsonparser.cpp index 5fecb8d4e7..6706f12b24 100644 --- a/src/corelib/json/qjsonparser.cpp +++ b/src/corelib/json/qjsonparser.cpp @@ -45,7 +45,6 @@ #include #include "qjsonparser_p.h" #include "qjson_p.h" -#include //#define PARSER_DEBUG #ifdef PARSER_DEBUG @@ -769,8 +768,8 @@ static inline bool scanUtf8Char(const char *&json, const char *end, uint *result uc = (uc << 6) | (ch & 0x3f); } - if (uc < min_uc || QUnicodeTables::isNonCharacter(uc) || - (uc >= 0xd800 && uc <= 0xdfff) || uc >= 0x110000) { + if (uc < min_uc || QChar::isNonCharacter(uc) || + QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) { return false; } diff --git a/src/corelib/json/qjsonwriter.cpp b/src/corelib/json/qjsonwriter.cpp index b086cbdea9..c591657f4c 100644 --- a/src/corelib/json/qjsonwriter.cpp +++ b/src/corelib/json/qjsonwriter.cpp @@ -41,7 +41,6 @@ #include "qjsonwriter_p.h" #include "qjson_p.h" -#include QT_BEGIN_NAMESPACE @@ -140,7 +139,7 @@ static QByteArray escapedString(const QString &s) *cursor++ = 0xc0 | ((uchar) (u >> 6)); } else { // is it one of the Unicode non-characters? - if (QUnicodeTables::isNonCharacter(u)) { + if (QChar::isNonCharacter(u)) { *cursor++ = replacement; ++ch; continue; diff --git a/src/corelib/tools/qchar.cpp b/src/corelib/tools/qchar.cpp index 138c3a69ee..9c5a515dd7 100644 --- a/src/corelib/tools/qchar.cpp +++ b/src/corelib/tools/qchar.cpp @@ -377,6 +377,7 @@ QT_BEGIN_NAMESPACE \value ByteOrderSwapped \value ParagraphSeparator \value LineSeparator + \value LastValidCodePoint */ /*! @@ -499,7 +500,7 @@ QT_BEGIN_NAMESPACE */ bool QChar::isPrint(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Other_Control) | FLAG(Other_Format) | @@ -532,7 +533,7 @@ bool QChar::isPrint(uint ucs4) */ bool QT_FASTCALL QChar::isSpace_helper(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Separator_Space) | FLAG(Separator_Line) | @@ -558,7 +559,7 @@ bool QT_FASTCALL QChar::isSpace_helper(uint ucs4) */ bool QChar::isMark(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Mark_NonSpacing) | FLAG(Mark_SpacingCombining) | @@ -582,7 +583,7 @@ bool QChar::isMark(uint ucs4) */ bool QChar::isPunct(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Punctuation_Connector) | FLAG(Punctuation_Dash) | @@ -610,7 +611,7 @@ bool QChar::isPunct(uint ucs4) */ bool QChar::isSymbol(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Symbol_Math) | FLAG(Symbol_Currency) | @@ -640,7 +641,7 @@ bool QChar::isSymbol(uint ucs4) */ bool QT_FASTCALL QChar::isLetter_helper(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Letter_Uppercase) | FLAG(Letter_Lowercase) | @@ -675,7 +676,7 @@ bool QT_FASTCALL QChar::isLetter_helper(uint ucs4) */ bool QT_FASTCALL QChar::isNumber_helper(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Number_DecimalDigit) | FLAG(Number_Letter) | @@ -704,7 +705,7 @@ bool QT_FASTCALL QChar::isNumber_helper(uint ucs4) */ bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; const int test = FLAG(Letter_Uppercase) | FLAG(Letter_Lowercase) | @@ -737,18 +738,55 @@ bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) \sa isNumber() */ +/*! + \fn bool QChar::isNonCharacter() const + \since 5.0 + + Returns true if the QChar is a non-character; false otherwise. + + Unicode has a certain number of code points that are classified + as "non-characters:" that is, they can be used for internal purposes + in applications but cannot be used for text interchange. + Those are the last two entries each Unicode Plane ([0xfffe..0xffff], + [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef]. +*/ + /*! \fn bool QChar::isHighSurrogate() const Returns true if the QChar is the high part of a UTF16 surrogate - (i.e. if it's code point in range [0xd800..0xdbff]). + (i.e. if its code point is in range [0xd800..0xdbff]); false otherwise. */ /*! \fn bool QChar::isLowSurrogate() const Returns true if the QChar is the low part of a UTF16 surrogate - (i.e. if it's code point in range [0xdc00..0xdfff]). + (i.e. if its code point is in range [0xdc00..0xdfff]); false otherwise. +*/ + +/*! + \fn bool QChar::isSurrogate() const + \since 5.0 + + Returns true if the QChar contains a code point that is in either + the high or the low part of the UTF-16 surrogate range + (i.e. if its code point is in range [0xd800..0xdfff]); false otherwise. +*/ + +/*! + \fn static bool isNonCharacter(uint ucs4) + \overload + \since 5.0 + + Returns true if the UCS-4-encoded character specified by \a ucs4 + is a non-character; false otherwise. + + Unicode has a certain number of code points that are classified + as "non-characters:" that is, they can be used for internal purposes + in applications but cannot be used for text interchange. + Those are the last two entries each Unicode Plane ([0xfffe..0xffff], + [0x1fffe..0x1ffff], etc.) as well as the entries in range [0xfdd0..0xfdef]. */ /*! @@ -757,7 +795,7 @@ bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) Returns true if the UCS-4-encoded character specified by \a ucs4 is the high part of a UTF16 surrogate - (i.e. if it's code point in range [0xd800..0xdbff]). + (i.e. if its code point is in range [0xd800..0xdbff]); false otherwise. */ /*! @@ -766,7 +804,18 @@ bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) Returns true if the UCS-4-encoded character specified by \a ucs4 is the low part of a UTF16 surrogate - (i.e. if it's code point in range [0xdc00..0xdfff]). + (i.e. if its code point is in range [0xdc00..0xdfff]); false otherwise. +*/ + +/*! + \fn static bool QChar::isSurrogate(uint ucs4) + \overload + \since 5.0 + + Returns true if the UCS-4-encoded character specified by \a ucs4 + contains a code point that is in either the high or the low part of the + UTF-16 surrogate range (i.e. if its code point is in range [0xd800..0xdfff]); + false otherwise. */ /*! @@ -774,7 +823,8 @@ bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) Returns true if the UCS-4-encoded character specified by \a ucs4 can be split into the high and low parts of a UTF16 surrogate - (i.e. if it's code point is greater than or equals to 0x10000). + (i.e. if its code point is greater than or equals to 0x10000); + false otherwise. */ /*! @@ -818,7 +868,7 @@ bool QT_FASTCALL QChar::isLetterOrNumber_helper(uint ucs4) */ int QChar::digitValue(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return -1; return qGetProp(ucs4)->digitValue; } @@ -835,7 +885,7 @@ int QChar::digitValue(uint ucs4) */ QChar::Category QChar::category(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return QChar::Other_NotAssigned; return (QChar::Category) qGetProp(ucs4)->category; } @@ -852,7 +902,7 @@ QChar::Category QChar::category(uint ucs4) */ QChar::Direction QChar::direction(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return QChar::DirL; return (QChar::Direction) qGetProp(ucs4)->direction; } @@ -871,7 +921,7 @@ QChar::Direction QChar::direction(uint ucs4) */ QChar::Joining QChar::joining(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return QChar::OtherJoining; return (QChar::Joining) qGetProp(ucs4)->joining; } @@ -900,7 +950,7 @@ QChar::Joining QChar::joining(uint ucs4) */ bool QChar::hasMirrored(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return false; return qGetProp(ucs4)->mirrorDiff != 0; } @@ -950,7 +1000,7 @@ bool QChar::hasMirrored(uint ucs4) */ uint QChar::mirroredChar(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return ucs4; return ucs4 + qGetProp(ucs4)->mirrorDiff; } @@ -1060,7 +1110,7 @@ QChar::Decomposition QChar::decompositionTag(uint ucs4) */ unsigned char QChar::combiningClass(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return 0; return (unsigned char) qGetProp(ucs4)->combiningClass; } @@ -1078,7 +1128,7 @@ unsigned char QChar::combiningClass(uint ucs4) */ QChar::UnicodeVersion QChar::unicodeVersion(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return QChar::Unicode_Unassigned; return (QChar::UnicodeVersion) qGetProp(ucs4)->unicodeVersion; } @@ -1155,7 +1205,7 @@ static inline T toCaseFolded_helper(T uc) */ uint QChar::toLower(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return ucs4; return toLowerCase_helper(ucs4); } @@ -1175,7 +1225,7 @@ uint QChar::toLower(uint ucs4) */ uint QChar::toUpper(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return ucs4; return toUpperCase_helper(ucs4); } @@ -1195,7 +1245,7 @@ uint QChar::toUpper(uint ucs4) */ uint QChar::toTitleCase(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return ucs4; return toTitleCase_helper(ucs4); } @@ -1236,7 +1286,7 @@ static inline ushort foldCase(ushort ch) */ uint QChar::toCaseFolded(uint ucs4) { - if (ucs4 > UNICODE_LAST_CODEPOINT) + if (ucs4 > LastValidCodePoint) return ucs4; return toCaseFolded_helper(ucs4); } diff --git a/src/corelib/tools/qchar.h b/src/corelib/tools/qchar.h index 07333c9535..6c423859ec 100644 --- a/src/corelib/tools/qchar.h +++ b/src/corelib/tools/qchar.h @@ -73,7 +73,8 @@ public: ByteOrderMark = 0xfeff, ByteOrderSwapped = 0xfffe, ParagraphSeparator = 0x2029, - LineSeparator = 0x2028 + LineSeparator = 0x2028, + LastValidCodePoint = 0x10ffff }; Q_DECL_CONSTEXPR QChar() : ucs(0) {} @@ -245,24 +246,28 @@ public: inline bool isUpper() const { return QChar::isUpper(ucs); } inline bool isTitleCase() const { return QChar::isTitleCase(ucs); } - inline bool isHighSurrogate() const { - return ((ucs & 0xfc00) == 0xd800); - } - inline bool isLowSurrogate() const { - return ((ucs & 0xfc00) == 0xdc00); - } + inline bool isNonCharacter() const { return QChar::isNonCharacter(ucs); } + inline bool isHighSurrogate() const { return QChar::isHighSurrogate(ucs); } + inline bool isLowSurrogate() const { return QChar::isLowSurrogate(ucs); } + inline bool isSurrogate() const { return QChar::isSurrogate(ucs); } inline uchar cell() const { return uchar(ucs & 0xff); } inline uchar row() const { return uchar((ucs>>8)&0xff); } inline void setCell(uchar cell); inline void setRow(uchar row); + static inline bool isNonCharacter(uint ucs4) { + return ucs4 >= 0xfdd0 && (ucs4 <= 0xfdef || (ucs4 & 0xfffe) == 0xfffe); + } static inline bool isHighSurrogate(uint ucs4) { return ((ucs4 & 0xfffffc00) == 0xd800); } static inline bool isLowSurrogate(uint ucs4) { return ((ucs4 & 0xfffffc00) == 0xdc00); } + static inline bool isSurrogate(uint ucs4) { + return (ucs4 - 0xd800u < 2048u); + } static inline bool requiresSurrogates(uint ucs4) { return (ucs4 >= 0x10000); } diff --git a/src/corelib/tools/qunicodetables_p.h b/src/corelib/tools/qunicodetables_p.h index 15d5415b0b..293f03b94f 100644 --- a/src/corelib/tools/qunicodetables_p.h +++ b/src/corelib/tools/qunicodetables_p.h @@ -61,8 +61,6 @@ QT_BEGIN_NAMESPACE #define UNICODE_DATA_VERSION QChar::Unicode_5_0 -#define UNICODE_LAST_CODEPOINT 0x10ffff - namespace QUnicodeTables { struct Properties { @@ -237,18 +235,6 @@ namespace QUnicodeTables { inline int script(QChar ch) { return script(ch.unicode()); } - - inline bool isNonCharacter(uint ucs4) - { - // Noncharacter_Code_Point: - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. - // Those are the last two entries each Unicode Plane (U+FFFE..U+FFFF, - // U+1FFFE..U+1FFFF, etc.) as well as the entries in range U+FDD0..U+FDEF - - return ucs4 >= 0xfdd0 && (ucs4 <= 0xfdef || (ucs4 & 0xfffe) == 0xfffe); - } - } // namespace QUnicodeTables QT_END_NAMESPACE diff --git a/src/corelib/xml/qxmlstream.cpp b/src/corelib/xml/qxmlstream.cpp index 769cdc9495..b37675fea9 100644 --- a/src/corelib/xml/qxmlstream.cpp +++ b/src/corelib/xml/qxmlstream.cpp @@ -986,7 +986,7 @@ bool QXmlStreamReaderPrivate::scanUntil(const char *str, short tokenToInject) textBuffer += QChar(c); continue; default: - if(c < 0x20 || (c > 0xFFFD && c < 0x10000) || c > 0x10FFFF ) { + if (c < 0x20 || (c > 0xFFFD && c < 0x10000) || c > QChar::LastValidCodePoint ) { raiseWellFormedError(QXmlStream::tr("Invalid XML character.")); lineNumber = oldLineNumber; return false; @@ -1718,7 +1718,7 @@ uint QXmlStreamReaderPrivate::resolveCharRef(int symbolIndex) s = symString(symbolIndex).toString().toUInt(&ok, 10); ok &= (s == 0x9 || s == 0xa || s == 0xd || (s >= 0x20 && s <= 0xd7ff) - || (s >= 0xe000 && s <= 0xfffd) || (s >= 0x10000 && s <= 0x10ffff)); + || (s >= 0xe000 && s <= 0xfffd) || (s >= 0x10000 && s <= QChar::LastValidCodePoint)); return ok ? s : 0; } diff --git a/tests/auto/corelib/tools/qchar/tst_qchar.cpp b/tests/auto/corelib/tools/qchar/tst_qchar.cpp index ae2622eb07..142b7fbefe 100644 --- a/tests/auto/corelib/tools/qchar/tst_qchar.cpp +++ b/tests/auto/corelib/tools/qchar/tst_qchar.cpp @@ -339,7 +339,7 @@ void tst_QChar::isUpper() QVERIFY(QChar(0xC2).isUpper()); // A with ^ QVERIFY(!QChar(0xE2).isUpper()); // a with ^ - for (uint codepoint = 0; codepoint <= UNICODE_LAST_CODEPOINT; ++codepoint) { + for (uint codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) { if (QChar::isUpper(codepoint)) QVERIFY(codepoint == QChar::toUpper(codepoint)); } @@ -355,7 +355,7 @@ void tst_QChar::isLower() QVERIFY(!QChar(0xC2).isLower()); // A with ^ QVERIFY(QChar(0xE2).isLower()); // a with ^ - for (uint codepoint = 0; codepoint <= UNICODE_LAST_CODEPOINT; ++codepoint) { + for (uint codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) { if (QChar::isLower(codepoint)) QVERIFY(codepoint == QChar::toLower(codepoint)); } @@ -363,7 +363,7 @@ void tst_QChar::isLower() void tst_QChar::isTitleCase() { - for (uint codepoint = 0; codepoint <= UNICODE_LAST_CODEPOINT; ++codepoint) { + for (uint codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) { if (QChar::isTitleCase(codepoint)) QVERIFY(codepoint == QChar::toTitleCase(codepoint)); } @@ -575,7 +575,7 @@ void tst_QChar::digitValue() QVERIFY(QChar::digitValue((uint)0x1040) == 0); QVERIFY(QChar::digitValue((ushort)0xd800) == -1); - QVERIFY(QChar::digitValue((uint)UNICODE_LAST_CODEPOINT + 1) == -1); + QVERIFY(QChar::digitValue((uint)0x110000u) == -1); } void tst_QChar::mirroredChar() diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp index 9f9e7e41d5..62a905f7b3 100644 --- a/tests/benchmarks/corelib/tools/qstring/main.cpp +++ b/tests/benchmarks/corelib/tools/qstring/main.cpp @@ -1934,19 +1934,6 @@ int fromUtf8_latin1_sse2_improved(ushort *dst, const char *chars, int len) } #endif -static inline bool isUnicodeNonCharacter(uint ucs4) -{ - // Unicode has a couple of "non-characters" that one can use internally, - // but are not allowed to be used for text interchange. - // - // Those are the last two entries each Unicode Plane (U+FFFE, U+FFFF, - // U+1FFFE, U+1FFFF, etc.) as well as the entries between U+FDD0 and - // U+FDEF (inclusive) - - return (ucs4 & 0xfffe) == 0xfffe - || (ucs4 - 0xfdd0U) < 32; -} - int fromUtf8_qt47(ushort *dst, const char *chars, int len) { // this is almost the code found in Qt 4.7's qutfcodec.cpp QUtf8Codec::convertToUnicode @@ -1996,12 +1983,12 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len) bool nonCharacter; if (!headerdone && uc == 0xfeff) { // don't do anything, just skip the BOM - } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc < 0x110000) { + } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { // surrogate pair //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { + } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { // error: overlong sequence, UTF16 surrogate or non-character *qch++ = replacement; ++invalid; @@ -2102,12 +2089,12 @@ int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len) bool nonCharacter; if (!headerdone && uc == 0xfeff) { // don't do anything, just skip the BOM - } else if (!(nonCharacter = isUnicodeNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc < 0x110000) { + } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) { // surrogate pair //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length()); *qch++ = QChar::highSurrogate(uc); *qch++ = QChar::lowSurrogate(uc); - } else if ((uc < min_uc) || (uc >= 0xd800 && uc <= 0xdfff) || nonCharacter || uc >= 0x110000) { + } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) { // error: overlong sequence, UTF16 surrogate or non-character *qch++ = replacement; ++invalid; @@ -2227,7 +2214,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr chars += 2; len -= 2; if (!trusted && - (ucs < 0x800 || isUnicodeNonCharacter(ucs) || (ucs >= 0xd800 && ucs <= 0xdfff))) + (ucs < 0x800 || QChar::isNonCharacter(ucs) || QChar::isSurrogate(ucs))) dst[counter] = QChar::ReplacementCharacter; else dst[counter] = ucs; @@ -2258,7 +2245,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr // dst[counter] will correspond to chars[counter..counter+2], so adjust chars += 3; len -= 3; - if (trusted || (QChar::requiresSurrogates(ucs) && ucs < 0x110000 && !isUnicodeNonCharacter(ucs))) { + if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint && !QChar::isNonCharacter(ucs))) { dst[counter + 0] = QChar::highSurrogate(ucs); dst[counter + 1] = QChar::lowSurrogate(ucs); counter += 2; diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 7480180901..4b3a52b088 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -54,9 +54,6 @@ #define DATA_VERSION_S "5.0" #define DATA_VERSION_STR "QChar::Unicode_5_0" -#define LAST_CODEPOINT 0x10ffff -#define LAST_CODEPOINT_STR "0x10ffff" - static QHash age_map; @@ -417,18 +414,6 @@ static const char *methods = " inline int script(QChar ch)\n" " { return script(ch.unicode()); }\n\n"; -static const char *generated_methods = - " inline bool isNonCharacter(uint ucs4)\n" - " {\n" - " // Noncharacter_Code_Point:\n" - " // Unicode has a couple of \"non-characters\" that one can use internally,\n" - " // but are not allowed to be used for text interchange.\n" - " // Those are the last two entries each Unicode Plane (U+FFFE..U+FFFF,\n" - " // U+1FFFE..U+1FFFF, etc.) as well as the entries in range U+FDD0..U+FDEF\n" - "\n" - " return ucs4 >= 0xfdd0 && (ucs4 <= 0xfdef || (ucs4 & 0xfffe) == 0xfffe);\n" - " }\n\n"; - static const int SizeOfPropertiesStruct = 20; struct PropertyFlags { @@ -592,8 +577,8 @@ UnicodeData &UnicodeData::valueRef(int codepoint) { static bool initialized = false; if (!initialized) { - unicodeData.reserve(LAST_CODEPOINT + 1); - for (int uc = 0; uc <= LAST_CODEPOINT; ++uc) + unicodeData.reserve(QChar::LastValidCodePoint + 1); + for (int uc = 0; uc <= QChar::LastValidCodePoint; ++uc) unicodeData.append(UnicodeData(uc)); initialized = true; } @@ -795,7 +780,7 @@ static void readUnicodeData() bool ok; int codepoint = properties[UD_Value].toInt(&ok, 16); Q_ASSERT(ok); - Q_ASSERT(codepoint <= LAST_CODEPOINT); + Q_ASSERT(codepoint <= QChar::LastValidCodePoint); int lastCodepoint = codepoint; QByteArray name = properties[UD_Name]; @@ -807,7 +792,7 @@ static void readUnicodeData() Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last")); lastCodepoint = properties[UD_Value].toInt(&ok, 16); Q_ASSERT(ok); - Q_ASSERT(lastCodepoint <= LAST_CODEPOINT); + Q_ASSERT(lastCodepoint <= QChar::LastValidCodePoint); } UnicodeData &data = UnicodeData::valueRef(codepoint); @@ -1106,7 +1091,7 @@ static void readDerivedNormalizationProps() } } - for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) { + for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) { UnicodeData &d = UnicodeData::valueRef(codepoint); if (!d.excludedComposition && d.decompositionType == QChar::Canonical @@ -1208,9 +1193,8 @@ static QList uniqueProperties; static void computeUniqueProperties() { qDebug("computeUniqueProperties:"); - for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) { + for (int codepoint = 0; codepoint <= QChar::LastValidCodePoint; ++codepoint) { UnicodeData &d = UnicodeData::valueRef(codepoint); - int index = uniqueProperties.indexOf(d.p); if (index == -1) { index = uniqueProperties.size(); @@ -2898,7 +2882,6 @@ int main(int, char **) "#include \n\n" "QT_BEGIN_NAMESPACE\n\n"); f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n"); - f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n"); f.write("namespace QUnicodeTables {\n\n"); f.write(property_string); f.write("\n"); @@ -2913,8 +2896,6 @@ int main(int, char **) f.write(line_break_class_string); f.write("\n"); f.write(methods); - f.write("\n"); - f.write(generated_methods); f.write("} // namespace QUnicodeTables\n\n" "QT_END_NAMESPACE\n\n" "#endif // QUNICODETABLES_P_H\n"); -- cgit v1.2.3