summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMarc Mutz <marc.mutz@kdab.com>2020-04-24 12:54:20 +0200
committerMarc Mutz <marc.mutz@kdab.com>2020-05-09 06:25:05 +0000
commit19e7c0d2b5a807e194be1d65b16041f48136c9be (patch)
tree4d6df4cf5e84e4fc6bcad63c4aa07222fc0c36bc
parentf6b96bc34749e4478e75c081bbd0af406cd737b5 (diff)
QChar/QString: centralize case folding in qchar.cpp
There are (at least) two implementations of the low-level case-folding algorithm, one of which (for QChar::toLower()) seems to be wrong (it doesn't deal with special cases which expand to more than one code point). The algorithm hidden in QString and entangled with the QString detaching code makes reusing the code much harder. At the same time, the dependency of the algorithm on the Unicode tables makes exposing a non-allocating result type in the public API hard. std::u16string would be an alternative if we can ensure that all implementations use SSO with at least four characters. So, for the time being, leave this as internal API for use in an upcoming QStringView::toLower() as well as case-insensitive hashing. Change-Id: Iabb2611846f6176776aa20e634f44d8464f3305c Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
-rw-r--r--src/corelib/text/qchar.cpp23
-rw-r--r--src/corelib/text/qstring.cpp34
-rw-r--r--src/corelib/text/qunicodetables.cpp2
-rw-r--r--util/unicode/main.cpp6
4 files changed, 45 insertions, 20 deletions
diff --git a/src/corelib/text/qchar.cpp b/src/corelib/text/qchar.cpp
index 6b1bb33949..1b8cfb642e 100644
--- a/src/corelib/text/qchar.cpp
+++ b/src/corelib/text/qchar.cpp
@@ -1550,6 +1550,29 @@ QChar::UnicodeVersion QChar::currentUnicodeVersion() noexcept
return UNICODE_DATA_VERSION;
}
+using FullConvertCaseResult = std::array<char16_t, MaxSpecialCaseLength + 1>;
+static FullConvertCaseResult fullConvertCase(char32_t uc, QUnicodeTables::Case which) noexcept
+{
+ FullConvertCaseResult result = {};
+ auto pp = result.begin();
+
+ const auto fold = qGetProp(uc)->cases[which];
+ const auto caseDiff = fold.diff;
+
+ if (Q_UNLIKELY(fold.special)) {
+ const auto *specialCase = specialCaseMap + caseDiff;
+ auto length = *specialCase++;
+ while (length--)
+ *pp++ = *specialCase++;
+ } else if (Q_UNLIKELY(QChar::requiresSurrogates(uc))) {
+ // so far, case convertion never changes planes (guaranteed by the qunicodetables generator)
+ *pp++ = QChar::highSurrogate(uc);
+ *pp++ = QChar::lowSurrogate(uc + caseDiff);
+ } else {
+ *pp++ = uc + caseDiff;
+ }
+ return result;
+}
template <typename T>
Q_DECL_CONST_FUNCTION static inline T convertCase_helper(T uc, QUnicodeTables::Case which) noexcept
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 49ddb9279b..d8c74aac62 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -6533,36 +6533,32 @@ static QString detachAndConvertCase(T &str, QStringIterator it, QUnicodeTables::
QChar *pp = s.begin() + it.index(); // will detach if necessary
do {
- auto uc = it.nextUnchecked();
-
- const auto fold = qGetProp(uc)->cases[which];
- signed short caseDiff = fold.diff;
-
- if (Q_UNLIKELY(fold.special)) {
- const ushort *specialCase = specialCaseMap + caseDiff;
- ushort length = *specialCase++;
-
- if (Q_LIKELY(length == 1)) {
- *pp++ = QChar(*specialCase);
+ const auto folded = fullConvertCase(it.nextUnchecked(), which);
+ if (Q_UNLIKELY(folded[1])) {
+ if (folded[0] == *pp && !folded[2]) {
+ // special case: only second actually changed (e.g. surrogate pairs),
+ // avoid slow case
+ ++pp;
+ *pp++ = folded[1];
} else {
// slow path: the string is growing
int inpos = it.index() - 1;
int outpos = pp - s.constBegin();
- s.replace(outpos, 1, reinterpret_cast<const QChar *>(specialCase), length);
- pp = const_cast<QChar *>(s.constBegin()) + outpos + length;
+ int foldedSize = 2; // must be at least 2, b/c folded[1] != NUL
+ while (folded[foldedSize])
+ ++foldedSize;
+
+ s.replace(outpos, 1, reinterpret_cast<const QChar *>(folded.data()), foldedSize);
+ pp = const_cast<QChar *>(s.constBegin()) + outpos + foldedSize;
// do we need to adjust the input iterator too?
// if it is pointing to s's data, str is empty
if (str.isEmpty())
- it = QStringIterator(s.constBegin(), inpos + length, s.constEnd());
+ it = QStringIterator(s.constBegin(), inpos + foldedSize, s.constEnd());
}
- } else if (Q_UNLIKELY(QChar::requiresSurrogates(uc))) {
- // so far, case convertion never changes planes (guaranteed by the qunicodetables generator)
- pp++;
- *pp++ = QChar(QChar::lowSurrogate(uc + caseDiff));
} else {
- *pp++ = QChar(uc + caseDiff);
+ *pp++ = folded[0];
}
} while (it.hasNext());
diff --git a/src/corelib/text/qunicodetables.cpp b/src/corelib/text/qunicodetables.cpp
index e6f6487126..865f93ba52 100644
--- a/src/corelib/text/qunicodetables.cpp
+++ b/src/corelib/text/qunicodetables.cpp
@@ -9946,6 +9946,8 @@ static const unsigned short specialCaseMap[] = {
0x1, 0xa64b
};
+const unsigned int MaxSpecialCaseLength = 3;
+
static const unsigned short uc_decomposition_trie[] = {
// 0 - 0x3400
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp
index e89d4e4706..3d66af911c 100644
--- a/util/unicode/main.cpp
+++ b/util/unicode/main.cpp
@@ -2581,6 +2581,7 @@ static QByteArray createSpecialCaseMap()
out += "static const unsigned short specialCaseMap[] = {\n"
" 0x0, // placeholder";
int i = 1;
+ int maxN = 0;
while (i < specialCaseMap.size()) {
out += "\n ";
int n = specialCaseMap.at(i);
@@ -2589,9 +2590,12 @@ static QByteArray createSpecialCaseMap()
out += ",";
}
i += n + 1;
+ maxN = std::max(maxN, n);
}
out.chop(1);
- out += "\n};\n\n";
+ out += "\n};\n\nconst unsigned int MaxSpecialCaseLength = ";
+ out += QByteArray::number(maxN);
+ out += ";\n\n\n";
qDebug(" memory usage: %ld bytes", specialCaseMap.size()*sizeof(unsigned short));