summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools/qchar.cpp
diff options
context:
space:
mode:
authorKonstantin Ritt <ritt.ks@gmail.com>2013-08-13 05:02:34 +0300
committerThe Qt Project <gerrit-noreply@qt-project.org>2013-08-14 22:46:58 +0200
commit2e0a4b13addf1f56112bac38448be96fb02f650d (patch)
tree6fbd02132bdba115dd762aba7b0502a5d983d982 /src/corelib/tools/qchar.cpp
parent252bad7c589e03d3e12df02354b00a84d8e3159a (diff)
[2/2] Implement Unicode Normalization Form Quick Check (NF QC)
Use QuickCheck data from DerivedNormalizationProps.txt to check whether the input text is already in the desired Normalization Form. \sa http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms Using NF QC gives a significant boost to most operations that rely on normalized input data, e.g. file path conversions on Mac, where the "native" form is a decomposed Unicode string. Change-Id: I292a9da479c6beed730528fc7000c45bf1befc34 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/tools/qchar.cpp')
-rw-r--r--src/corelib/tools/qchar.cpp61
1 files changed, 61 insertions, 0 deletions
diff --git a/src/corelib/tools/qchar.cpp b/src/corelib/tools/qchar.cpp
index 9ab7155c2d..dbd8a81b92 100644
--- a/src/corelib/tools/qchar.cpp
+++ b/src/corelib/tools/qchar.cpp
@@ -1884,4 +1884,65 @@ static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, in
}
}
+// Quick check of str from index 'from' against Normalization Form 'mode': returns true only if every code point scanned has quick-check value NFQC_YES and no non-zero combining class is lower than its predecessor's; false otherwise (NFQC_MAYBE is reported as false).
+// Sets *lastStable to the index of the last position treated as stable before returning (ASCII, unpaired high surrogate, combining class 0, or the final code unit when trailing high surrogates were trimmed); *lastStable is left untouched if no such position was seen.
+static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, int from, int *lastStable)
+{
+ Q_STATIC_ASSERT(QString::NormalizationForm_D == 0);
+ Q_STATIC_ASSERT(QString::NormalizationForm_C == 1);
+ Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2);
+ Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3); // the enum values are relied upon below, where mode is used as a shift index into nfQuickCheck
+
+ enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 }; // 2-bit quick-check results; presumably matches the encoding of the generated Unicode tables (only NFQC_YES is tested here)
+
+ const ushort *string = reinterpret_cast<const ushort *>(str->constData());
+ int length = str->length();
+
+ // exclude trailing high surrogates from the scan so that reading string[i + 1] in the loop needs no bounds check
+ while (length > from && QChar::isHighSurrogate(string[length - 1]))
+ --length;
+
+ uchar lastCombining = 0; // combining class of the previously scanned code point
+ for (int i = from; i < length; ++i) {
+ int pos = i; // index where this code point starts; i itself may advance over a low surrogate below
+ uint uc = string[i];
+ if (uc < 0x80) {
+ // ASCII characters are stable code points
+ lastCombining = 0;
+ *lastStable = pos;
+ continue;
+ }
+
+ if (QChar::isHighSurrogate(uc)) {
+ ushort low = string[i + 1]; // in bounds: the last scanned code unit is never a high surrogate
+ if (!QChar::isLowSurrogate(low)) {
+ // treat surrogate like stable code point
+ lastCombining = 0;
+ *lastStable = pos;
+ continue;
+ }
+ ++i; // consume the low surrogate as part of this code point
+ uc = QChar::surrogateToUcs4(uc, low);
+ }
+
+ const QUnicodeTables::Properties *p = qGetProp(uc);
+
+ if (p->combiningClass < lastCombining && p->combiningClass > 0) // non-zero combining class lower than the previous one: marks are out of order
+ return false;
+
+ const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03; // nfQuickCheck holds one 2-bit value per form; mode selects the pair
+ if (check != NFQC_YES)
+ return false; // ### can we quick check NFQC_MAYBE ?
+
+ lastCombining = p->combiningClass;
+ if (lastCombining == 0)
+ *lastStable = pos;
+ }
+
+ if (length != str->length()) // trailing high surrogates were trimmed from the scan above
+ *lastStable = str->length() - 1;
+
+ return true;
+}
+
QT_END_NAMESPACE