1 files changed, 88 insertions, 153 deletions
diff --git a/src/corelib/text/qbytearray.cpp b/src/corelib/text/qbytearray.cpp
index 16635c4dd9..ae73c5977d 100644
--- a/src/corelib/text/qbytearray.cpp
+++ b/src/corelib/text/qbytearray.cpp
@@ -1,6 +1,6 @@
 /****************************************************************************
 **
-** Copyright (C) 2019 The Qt Company Ltd.
+** Copyright (C) 2020 The Qt Company Ltd.
 ** Copyright (C) 2016 Intel Corporation.
 ** Copyright (C) 2019 Klarälvdalens Datakonsult AB, a KDAB Group company, info@kdab.com, author Giuseppe D'Angelo <giuseppe.dangelo@kdab.com>
 ** Contact: https://www.qt.io/licensing/
@@ -69,64 +69,16 @@
 
 QT_BEGIN_NAMESPACE
 
-// Latin 1 case system, used by QByteArray::to{Upper,Lower}() and qstr(n)icmp():
-/*
-#!/usr/bin/perl -l
-use feature "unicode_strings";
-for (0..255) {
-    $up = uc(chr($_));
-    $up = chr($_) if ord($up) > 0x100 || length $up > 1;
-    printf "0x%02x,", ord($up);
-    print "" if ($_ & 0xf) == 0xf;
-}
-*/
-static const uchar latin1_uppercased[256] = {
-    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
-    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
-    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
-    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
-    0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
-    0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x5b,0x5c,0x5d,0x5e,0x5f,
-    0x60,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4a,0x4b,0x4c,0x4d,0x4e,0x4f,
-    0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5a,0x7b,0x7c,0x7d,0x7e,0x7f,
-    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
-    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
-    0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
-    0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
-    0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
-    0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xd7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xdf,
-    0xc0,0xc1,0xc2,0xc3,0xc4,0xc5,0xc6,0xc7,0xc8,0xc9,0xca,0xcb,0xcc,0xcd,0xce,0xcf,
-    0xd0,0xd1,0xd2,0xd3,0xd4,0xd5,0xd6,0xf7,0xd8,0xd9,0xda,0xdb,0xdc,0xdd,0xde,0xff
-};
+// ASCII case system, used by QByteArray::to{Upper,Lower}() and qstr(n)icmp():
+static constexpr inline uchar asciiUpper(uchar c)
+{
+    return c >= 'a' && c <= 'z' ? c & ~0x20 : c;
+}
 
-/*
-#!/usr/bin/perl -l
-use feature "unicode_strings";
-for (0..255) {
-    $up = lc(chr($_));
-    $up = chr($_) if ord($up) > 0x100 || length $up > 1;
-    printf "0x%02x,", ord($up);
-    print "" if ($_ & 0xf) == 0xf;
-}
-*/
-static const uchar latin1_lowercased[256] = {
-    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
-    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f,
-    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2a,0x2b,0x2c,0x2d,0x2e,0x2f,
-    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3a,0x3b,0x3c,0x3d,0x3e,0x3f,
-    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
-    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x5b,0x5c,0x5d,0x5e,0x5f,
-    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6a,0x6b,0x6c,0x6d,0x6e,0x6f,
-    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7a,0x7b,0x7c,0x7d,0x7e,0x7f,
-    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x8a,0x8b,0x8c,0x8d,0x8e,0x8f,
-    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,0x98,0x99,0x9a,0x9b,0x9c,0x9d,0x9e,0x9f,
-    0xa0,0xa1,0xa2,0xa3,0xa4,0xa5,0xa6,0xa7,0xa8,0xa9,0xaa,0xab,0xac,0xad,0xae,0xaf,
-    0xb0,0xb1,0xb2,0xb3,0xb4,0xb5,0xb6,0xb7,0xb8,0xb9,0xba,0xbb,0xbc,0xbd,0xbe,0xbf,
-    0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
-    0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xd7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xdf,
-    0xe0,0xe1,0xe2,0xe3,0xe4,0xe5,0xe6,0xe7,0xe8,0xe9,0xea,0xeb,0xec,0xed,0xee,0xef,
-    0xf0,0xf1,0xf2,0xf3,0xf4,0xf5,0xf6,0xf7,0xf8,0xf9,0xfa,0xfb,0xfc,0xfd,0xfe,0xff
-};
+static constexpr inline uchar asciiLower(uchar c)
+{
+    return c >= 'A' && c <= 'Z' ? c | 0x20 : c;
+}
 
 int qFindByteArray(
     const char *haystack0, int haystackLen, int from,
@@ -293,8 +245,8 @@ int qstrcmp(const char *str1, const char *str2)
 
     A safe \c stricmp() function.
 
-    Compares \a str1 and \a str2 ignoring the case of the
-    characters. The encoding of the strings is assumed to be Latin-1.
+    Compares \a str1 and \a str2, ignoring differences in the case of any ASCII
+    characters.
 
     Returns a negative value if \a str1 is less than \a str2, 0 if \a
     str1 is equal to \a str2 or a positive value if \a str1 is greater
@@ -323,11 +275,10 @@ int qstricmp(const char *str1, const char *str2)
     auto innerCompare = [=, &offset](qptrdiff max, bool unlimited) {
         max += offset;
         do {
-            uchar c = latin1_lowercased[s1[offset]];
-            int res = c - latin1_lowercased[s2[offset]];
-            if (Q_UNLIKELY(res))
+            uchar c = s1[offset];
+            if (int res = asciiLower(c) - asciiLower(s2[offset]))
                 return res;
-            if (Q_UNLIKELY(!c))
+            if (!c)
                 return 0;
             ++offset;
         } while (unlimited || offset < max);
@@ -385,9 +336,8 @@ int qstricmp(const char *str1, const char *str2)
 
     A safe \c strnicmp() function.
 
-    Compares at most \a len bytes of \a str1 and \a str2 ignoring the
-    case of the characters. The encoding of the strings is assumed to
-    be Latin-1.
+    Compares at most \a len bytes of \a str1 and \a str2, ignoring differences
+    in the case of any ASCII characters.
 
     Returns a negative value if \a str1 is less than \a str2, 0 if \a str1
     is equal to \a str2 or a positive value if \a str1 is greater than \a
@@ -406,12 +356,11 @@ int qstrnicmp(const char *str1, const char *str2, uint len)
 {
     const uchar *s1 = reinterpret_cast<const uchar *>(str1);
     const uchar *s2 = reinterpret_cast<const uchar *>(str2);
-    int res;
-    uchar c;
     if (!s1 || !s2)
         return s1 ? 1 : (s2 ? -1 : 0);
-    for (; len--; s1++, s2++) {
-        if ((res = (c = latin1_lowercased[*s1]) - latin1_lowercased[*s2]))
+    for (; len--; ++s1, ++s2) {
+        const uchar c = *s1;
+        if (int res = asciiLower(c) - asciiLower(*s2))
             return res;
         if (!c)                                // strings are equal
             break;
@@ -437,28 +386,23 @@ int qstrnicmp(const char *str1, qsizetype len1, const char *str2, qsizetype len2
     if (!s2)
         return len1 == 0 ? 0 : 1;
 
-    int res;
-    uchar c;
     if (len2 == -1) {
         // null-terminated str2
         qsizetype i;
         for (i = 0; i < len1; ++i) {
-            c = latin1_lowercased[s2[i]];
+            const uchar c = s2[i];
             if (!c)
                 return 1;
 
-            res = latin1_lowercased[s1[i]] - c;
-            if (res)
+            if (int res = asciiLower(s1[i]) - asciiLower(c))
                 return res;
         }
-        c = latin1_lowercased[s2[i]];
-        return c ? -1 : 0;
+        return s2[i] ? -1 : 0;
     } else {
         // not null-terminated
-        for (qsizetype i = 0; i < qMin(len1, len2); ++i) {
-            c = latin1_lowercased[s2[i]];
-            res = latin1_lowercased[s1[i]] - c;
-            if (res)
+        const qsizetype len = qMin(len1, len2);
+        for (qsizetype i = 0; i < len; ++i) {
+            if (int res = asciiLower(s1[i]) - asciiLower(s2[i]))
                 return res;
         }
         if (len1 == len2)
@@ -786,14 +730,14 @@ QByteArray qUncompress(const uchar* data, int nbytes)
     terminator, and uses \l{implicit sharing} (copy-on-write) to
     reduce memory usage and avoid needless copying of data.
 
-    In addition to QByteArray, Qt also provides the QString class to
-    store string data. For most purposes, QString is the class you
-    want to use. It stores 16-bit Unicode characters, making it easy
-    to store non-ASCII/non-Latin-1 characters in your application.
-    Furthermore, QString is used throughout in the Qt API. The two
-    main cases where QByteArray is appropriate are when you need to
-    store raw binary data, and when memory conservation is critical
-    (e.g., with Qt for Embedded Linux).
+    In addition to QByteArray, Qt also provides the QString class to store
+    string data. For most purposes, QString is the class you want to use. It
+    understands its content as Unicode text (encoded using UTF-16) whereas
+    QByteArray aims to avoid assumptions about the encoding or semantics of the
+    bytes it stores (aside from a few legacy cases where it uses ASCII).
+    Furthermore, QString is used throughout in the Qt API. The two main cases
+    where QByteArray is appropriate are when you need to store raw binary data,
+    and when memory conservation is critical (e.g., with Qt for Embedded Linux).
 
     One way to initialize a QByteArray is simply to pass a \c{const
     char *} to its constructor. For example, the following code
@@ -868,13 +812,6 @@ QByteArray qUncompress(const uchar* data, int nbytes)
     memory QByteArray actually allocated. Data appended to an empty
     array is not copied.
 
-    A frequent requirement is to remove whitespace characters from a
-    byte array ('\\n', '\\t', ' ', etc.). If you want to remove
-    whitespace from both ends of a QByteArray, use trimmed(). If you
-    want to remove whitespace from both ends and replace multiple
-    consecutive whitespaces with a single space character within the
-    byte array, use simplified().
-
     If you want to find all occurrences of a particular character or
     substring in a QByteArray, use indexOf() or lastIndexOf(). The
     former searches forward starting from a given index position, the
@@ -932,29 +869,40 @@ QByteArray qUncompress(const uchar* data, int nbytes)
     Such considerations, the configuration of such behavior or any mitigation
     are outside the scope of the QByteArray API.
 
-    \section1 Notes on Locale
+    \section1 C locale and ASCII functions
+
+    QByteArray generally handles data as bytes, without presuming any semantics;
+    where it does presume semantics, it uses the C locale and ASCII encoding.
+    Standard Unicode encodings are supported by QString; other encodings may be
+    supported using QStringEncoder and QStringDecoder to convert to Unicode. For
+    locale-specific interpretation of text, use QLocale or QString.
+
+    \section2 Spacing Characters
+
+    A frequent requirement is to remove spacing characters from a byte array
+    ('\\n', '\\t', ' ', etc.). If you want to remove spacing from both ends of a
+    QByteArray, use trimmed(). If you want to remove spacing from both ends and
+    replace each run of spacing characters with a single space character within
+    the byte array, use simplified(). Only ASCII spacing characters are
+    recognized for these purposes.
 
     \section2 Number-String Conversions
 
-    Functions that perform conversions between numeric data types and
-    strings are performed in the C locale, irrespective of the user's
-    locale settings. Use QString to perform locale-aware conversions
-    between numbers and strings.
+    Conversions between numeric data types and strings are performed in the C
+    locale, regardless of the user's locale settings. Use QLocale to perform
+    locale-aware conversions between numbers and strings.
 
-    \section2 8-bit Character Comparisons
+    \section2 Character Case
 
-    In QByteArray, the notion of uppercase and lowercase and of which
-    character is greater than or less than another character is done
-    in the Latin-1 locale. This affects functions that support a case
-    insensitive option or that compare or lowercase or uppercase
-    their arguments. Case insensitive operations and comparisons will
-    be accurate if both strings contain only Latin-1 characters.
-    Functions that this affects include contains(), indexOf(),
-    lastIndexOf(), operator<(), operator<=(), operator>(),
-    operator>=(), isLower(), isUpper(), toLower() and toUpper().
+    In QByteArray, the notion of uppercase and lowercase and of case-independent
+    comparison is limited to ASCII. Non-ASCII characters are treated as
+    caseless, since their case depends on encoding. This affects functions that
+    support a case-insensitive option or that change the case of their
+    arguments. Functions that this affects include contains(), indexOf(),
+    lastIndexOf(), isLower(), isUpper(), toLower() and toUpper().
 
-    This issue does not apply to \l{QString}s since they represent
-    characters using Unicode.
+    This issue does not apply to \l{QString}s since they represent characters
+    using Unicode.
 
     \sa QString, QBitArray
 */
@@ -2899,22 +2847,16 @@ bool QByteArray::endsWith(const char *str) const
 }
 
 /*
-    Returns true if \a c is an uppercase Latin1 letter.
-    \note The multiplication sign 0xD7 and the sz ligature 0xDF are not
-    treated as uppercase Latin1.
+    Returns true if \a c is an uppercase ASCII letter.
  */
-static inline bool isUpperCaseLatin1(char c)
+static constexpr inline bool isUpperCaseAscii(char c)
 {
-    if (c >= 'A' && c <= 'Z')
-        return true;
-
-    return (uchar(c) >= 0xC0 && uchar(c) <= 0xDE && uchar(c) != 0xD7);
+    return c >= 'A' && c <= 'Z';
 }
 
 /*!
-    Returns \c true if this byte array contains only uppercase letters,
-    otherwise returns \c false. The byte array is interpreted as a Latin-1
-    encoded string.
+    Returns \c true if this byte array contains only ASCII uppercase letters,
+    otherwise returns \c false.
     \since 5.12
 
     \sa isLower(), toUpper()
@@ -2927,7 +2869,7 @@ bool QByteArray::isUpper() const
     const char *d = data();
 
     for (int i = 0, max = size(); i < max; ++i) {
-        if (!isUpperCaseLatin1(d[i]))
+        if (!isUpperCaseAscii(d[i]))
             return false;
     }
 
@@ -2935,22 +2877,16 @@ bool QByteArray::isUpper() const
 }
 
 /*
-    Returns true if \a c is an lowercase Latin1 letter.
-    \note The division sign 0xF7 is not treated as lowercase Latin1,
-    but the small y dieresis 0xFF is.
+    Returns true if \a c is a lowercase ASCII letter.
  */
-static inline bool isLowerCaseLatin1(char c)
+static constexpr inline bool isLowerCaseAscii(char c)
 {
-    if (c >= 'a' && c <= 'z')
-        return true;
-
-    return (uchar(c) >= 0xD0 && uchar(c) != 0xF7);
+    return c >= 'a' && c <= 'z';
 }
 
 /*!
-    Returns \c true if this byte array contains only lowercase letters,
-    otherwise returns \c false. The byte array is interpreted as a Latin-1
-    encoded string.
+    Returns \c true if this byte array contains only ASCII lowercase letters,
+    otherwise returns \c false.
     \since 5.12
 
     \sa isUpper(), toLower()
@@ -2963,7 +2899,7 @@ bool QByteArray::isLower() const
     const char *d = data();
 
     for (int i = 0, max = size(); i < max; ++i) {
-        if (!isLowerCaseLatin1(d[i]))
+        if (!isLowerCaseAscii(d[i]))
             return false;
     }
 
@@ -3076,8 +3012,8 @@ QByteArray QByteArray::mid(int pos, int len) const
 /*!
     \fn QByteArray QByteArray::toLower() const
 
-    Returns a lowercase copy of the byte array. The bytearray is
-    interpreted as a Latin-1 encoded string.
+    Returns a copy of the byte array in which each ASCII uppercase letter is
+    converted to lowercase.
 
     Example:
     \snippet code/src_corelib_text_qbytearray.cpp 30
@@ -3090,7 +3026,7 @@ QByteArray QByteArray::mid(int pos, int len) const
 // (even with constant propagation, there's no gain in performance).
 template <typename T>
 Q_NEVER_INLINE
-static QByteArray toCase_template(T &input, const uchar * table)
+static QByteArray toCase_template(T &input, uchar (*lookup)(uchar))
 {
     // find the first bad character in input
     const char *orig_begin = input.constBegin();
@@ -3098,7 +3034,7 @@ static QByteArray toCase_template(T &input, const uchar * table)
     const char *e = input.constEnd();
     for ( ; firstBad != e ; ++firstBad) {
         uchar ch = uchar(*firstBad);
-        uchar converted = table[ch];
+        uchar converted = lookup(ch);
         if (ch != converted)
             break;
     }
@@ -3111,27 +3047,26 @@ static QByteArray toCase_template(T &input, const uchar * table)
     char *b = s.begin();            // will detach if necessary
     char *p = b + (firstBad - orig_begin);
     e = b + s.size();
-    for ( ; p != e; ++p) {
-        *p = char(uchar(table[uchar(*p)]));
-    }
+    for ( ; p != e; ++p)
+        *p = char(lookup(uchar(*p)));
     return s;
 }
 
 QByteArray QByteArray::toLower_helper(const QByteArray &a)
 {
-    return toCase_template(a, latin1_lowercased);
+    return toCase_template(a, asciiLower);
 }
 
 QByteArray QByteArray::toLower_helper(QByteArray &a)
 {
-    return toCase_template(a, latin1_lowercased);
+    return toCase_template(a, asciiLower);
 }
 
 /*!
     \fn QByteArray QByteArray::toUpper() const
 
-    Returns an uppercase copy of the byte array. The bytearray is
-    interpreted as a Latin-1 encoded string.
+    Returns a copy of the byte array in which each ASCII lowercase letter is
+    converted to uppercase.
 
     Example:
     \snippet code/src_corelib_text_qbytearray.cpp 31
@@ -3141,12 +3076,12 @@ QByteArray QByteArray::toLower_helper(QByteArray &a)
 
 QByteArray QByteArray::toUpper_helper(const QByteArray &a)
 {
-    return toCase_template(a, latin1_uppercased);
+    return toCase_template(a, asciiUpper);
 }
 
 QByteArray QByteArray::toUpper_helper(QByteArray &a)
 {
-    return toCase_template(a, latin1_uppercased);
+    return toCase_template(a, asciiUpper);
 }
 
 /*! \fn void QByteArray::clear()
@@ -4226,7 +4161,7 @@ QByteArray &QByteArray::setNum(double n, char f, int prec)
     QLocaleData::DoubleForm form = QLocaleData::DFDecimal;
     uint flags = QLocaleData::ZeroPadExponent;
 
-    char lower = latin1_lowercased[uchar(f)];
+    char lower = asciiLower(uchar(f));
     if (f != lower)
         flags |= QLocaleData::CapitalEorX;
     f = lower;
@@ -4248,7 +4183,7 @@ QByteArray &QByteArray::setNum(double n, char f, int prec)
             break;
     }
 
-    *this = QLocaleData::c()->doubleToString(n, prec, form, -1, flags).toLatin1();
+    *this = QLocaleData::c()->doubleToString(n, prec, form, -1, flags).toUtf8();
     return *this;
 }