From 2e0a4b13addf1f56112bac38448be96fb02f650d Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Tue, 13 Aug 2013 05:02:34 +0300 Subject: [2/2] Implement Unicode Normalization Form Quick Check (NF QC) Use QuickCheck data from DerivedNormalizationProps.txt to check if the input text is already in the desired Normalization Form. \sa http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms Using NF QC makes a significant boost to most operations that rely on normalized input data, i.e. file path conversions on Mac, where "native" form is a decomposed Unicode string. Change-Id: I292a9da479c6beed730528fc7000c45bf1befc34 Reviewed-by: Thiago Macieira --- src/corelib/tools/qchar.cpp | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'src/corelib/tools/qchar.cpp') diff --git a/src/corelib/tools/qchar.cpp b/src/corelib/tools/qchar.cpp index 9ab7155c2d..dbd8a81b92 100644 --- a/src/corelib/tools/qchar.cpp +++ b/src/corelib/tools/qchar.cpp @@ -1884,4 +1884,65 @@ static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, in } } +// returns true if the text is in a desired Normalization Form already; false otherwise. +// sets lastStable to the position of the last stable code point +static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, int from, int *lastStable) +{ + Q_STATIC_ASSERT(QString::NormalizationForm_D == 0); + Q_STATIC_ASSERT(QString::NormalizationForm_C == 1); + Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2); + Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3); + + enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 }; + + const ushort *string = reinterpret_cast(str->constData()); + int length = str->length(); + + // this avoids one out of bounds check in the loop + while (length > from && QChar::isHighSurrogate(string[length - 1])) + --length; + + uchar lastCombining = 0; + for (int i = from; i < length; ++i) { + int pos = i; + uint uc = string[i]; + if (uc < 0x80) { + // ASCII characters are stable code points + lastCombining = 0; + *lastStable = pos; + continue; + } + + if (QChar::isHighSurrogate(uc)) { + ushort low = string[i + 1]; + if (!QChar::isLowSurrogate(low)) { + // treat surrogate like stable code point + lastCombining = 0; + *lastStable = pos; + continue; + } + ++i; + uc = QChar::surrogateToUcs4(uc, low); + } + + const QUnicodeTables::Properties *p = qGetProp(uc); + + if (p->combiningClass < lastCombining && p->combiningClass > 0) + return false; + + const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03; + if (check != NFQC_YES) + return false; // ### can we quick check NFQC_MAYBE ? + + lastCombining = p->combiningClass; + if (lastCombining == 0) + *lastStable = pos; + } + + if (length != str->length()) // low surrogate parts at the end of text + *lastStable = str->length() - 1; + + return true; +} + QT_END_NAMESPACE -- cgit v1.2.3