Allow non-character codes in utf8 strings

Changed the handling of non-character codes in the UTF-8 codec. Non-character codes are now accepted in QStrings, QUrls and QJson strings. Unit tests were adapted accordingly. For more info about non-character codes, see: http://www.unicode.org/versions/corrigendum9.html [ChangeLog][QtCore][QUtf8] UTF-8 now accepts non-character Unicode code points; these are no longer replaced by the replacement character. [ChangeLog][QtCore][QUrl] QUrl now fully accepts non-character Unicode code points; they are percent-encoded, and they can also be pretty-decoded. [ChangeLog][QtCore][QJson] The Writer and the Parser now fully accept non-character Unicode code points. Change-Id: I77cf4f0e6210741eac8082912a0b6118eced4f77 Task-number: QTBUG-33229 Reviewed-by: Lars Knoll <lars.knoll@digia.com> Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Kurt Pattyn <pattyn.kurt@gmail.com> 2013-10-06 11:40:47 +0200
committer: The Qt Project <gerrit-noreply@qt-project.org> 2013-10-17 09:50:58 +0200
commit: add2bf739ae96603cb919b908cbb53c00d0628cc (patch)
tree: 9702a95d145fc9f429aa6f2ec104cfab75cae753 /tests/benchmarks/corelib/tools/qstring/main.cpp
parent: e8853506bf82e569009e68a23437d6a134176f63 (diff)
1 files changed, 8 insertions, 10 deletions
diff --git a/tests/benchmarks/corelib/tools/qstring/main.cpp b/tests/benchmarks/corelib/tools/qstring/main.cpp
index 67ed4c32b9..6101cfe8fb 100644
--- a/tests/benchmarks/corelib/tools/qstring/main.cpp
+++ b/tests/benchmarks/corelib/tools/qstring/main.cpp
@@ -1980,16 +1980,15 @@ int fromUtf8_qt47(ushort *dst, const char *chars, int len)
                 --need;
                 if (!need) {
                     // utf-8 bom composes into 0xfeff code point
-                    bool nonCharacter;
                     if (!headerdone && uc == 0xfeff) {
                         // don't do anything, just skip the BOM
-                    } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
+                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
                         // surrogate pair
                         //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
                         *qch++ = QChar::highSurrogate(uc);
                         *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
-                        // error: overlong sequence, UTF16 surrogate or non-character
+                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
+                        // error: overlong sequence or UTF16 surrogate
                         *qch++ = replacement;
                         ++invalid;
                     } else {
@@ -2086,16 +2085,15 @@ int fromUtf8_qt47_stateless(ushort *dst, const char *chars, int len)
                 --need;
                 if (!need) {
                     // utf-8 bom composes into 0xfeff code point
-                    bool nonCharacter;
                     if (!headerdone && uc == 0xfeff) {
                         // don't do anything, just skip the BOM
-                    } else if (!(nonCharacter = QChar::isNonCharacter(uc)) && QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
+                    } else if (QChar::requiresSurrogates(uc) && uc <= QChar::LastValidCodePoint) {
                         // surrogate pair
                         //Q_ASSERT((qch - (ushort*)result.unicode()) + 2 < result.length());
                         *qch++ = QChar::highSurrogate(uc);
                         *qch++ = QChar::lowSurrogate(uc);
-                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || nonCharacter || uc > QChar::LastValidCodePoint) {
-                        // error: overlong sequence, UTF16 surrogate or non-character
+                    } else if ((uc < min_uc) || QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint) {
+                        // error: overlong sequence or UTF16 surrogate
                         *qch++ = replacement;
                         ++invalid;
                     } else {
@@ -2214,7 +2212,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
         chars += 2;
         len -= 2;
         if (!trusted &&
-            (ucs < 0x800 || QChar::isNonCharacter(ucs) || QChar::isSurrogate(ucs)))
+            (ucs < 0x800 || QChar::isSurrogate(ucs)))
             dst[counter] = QChar::ReplacementCharacter;
         else
             dst[counter] = ucs;
@@ -2245,7 +2243,7 @@ static inline void extract_utf8_multibyte(ushort *&dst, const char *&chars, qptr
         // dst[counter] will correspond to chars[counter..counter+2], so adjust
         chars += 3;
         len -= 3;
-        if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint && !QChar::isNonCharacter(ucs))) {
+        if (trusted || (QChar::requiresSurrogates(ucs) && ucs <= QChar::LastValidCodePoint)) {
             dst[counter + 0] = QChar::highSurrogate(ucs);
             dst[counter + 1] = QChar::lowSurrogate(ucs);
             counter += 2;
author	Kurt Pattyn <pattyn.kurt@gmail.com>	2013-10-06 11:40:47 +0200
committer	The Qt Project <gerrit-noreply@qt-project.org>	2013-10-17 09:50:58 +0200
commit	add2bf739ae96603cb919b908cbb53c00d0628cc (patch)
tree	9702a95d145fc9f429aa6f2ec104cfab75cae753 /tests/benchmarks/corelib/tools/qstring/main.cpp
parent	e8853506bf82e569009e68a23437d6a134176f63 (diff)