diff options
author | Konstantin Ritt <ritt.ks@gmail.com> | 2013-08-13 05:02:34 +0300 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2013-08-14 22:46:58 +0200 |
commit | 2e0a4b13addf1f56112bac38448be96fb02f650d (patch) | |
tree | 6fbd02132bdba115dd762aba7b0502a5d983d982 /src/corelib/tools/qchar.cpp | |
parent | 252bad7c589e03d3e12df02354b00a84d8e3159a (diff) |
[2/2] Implement Unicode Normalization Form Quick Check (NF QC)
Use QuickCheck data from DerivedNormalizationProps.txt to check
if the input text is already in the desired Normalization Form.
\sa http://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
Using NF QC gives a significant boost to most operations that rely on
normalized input data, e.g. file path conversions on Mac, where the "native"
form is a decomposed Unicode string.
Change-Id: I292a9da479c6beed730528fc7000c45bf1befc34
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/corelib/tools/qchar.cpp')
-rw-r--r-- | src/corelib/tools/qchar.cpp | 61 |
1 files changed, 61 insertions, 0 deletions
diff --git a/src/corelib/tools/qchar.cpp b/src/corelib/tools/qchar.cpp index 9ab7155c2d..dbd8a81b92 100644 --- a/src/corelib/tools/qchar.cpp +++ b/src/corelib/tools/qchar.cpp @@ -1884,4 +1884,65 @@ static void canonicalOrderHelper(QString *str, QChar::UnicodeVersion version, in } } +// returns true if the text (checked from index from onwards) is in the desired Normalization Form already; false otherwise, including when the quick check result is not a definite YES. +// sets *lastStable to the index of the last stable code point seen: an ASCII character, an unpaired high surrogate, or a code point with combining class 0 +static bool normalizationQuickCheckHelper(QString *str, QString::NormalizationForm mode, int from, int *lastStable) +{ + Q_STATIC_ASSERT(QString::NormalizationForm_D == 0); + Q_STATIC_ASSERT(QString::NormalizationForm_C == 1); + Q_STATIC_ASSERT(QString::NormalizationForm_KD == 2); + Q_STATIC_ASSERT(QString::NormalizationForm_KC == 3); + + enum { NFQC_YES = 0, NFQC_NO = 1, NFQC_MAYBE = 3 }; + + const ushort *string = reinterpret_cast<const ushort *>(str->constData()); + int length = str->length(); + + // leave trailing high surrogates out of the loop; this avoids one out of bounds check for string[i + 1] in the loop + while (length > from && QChar::isHighSurrogate(string[length - 1])) + --length; + + uchar lastCombining = 0; + for (int i = from; i < length; ++i) { + int pos = i; + uint uc = string[i]; + if (uc < 0x80) { + // ASCII characters are stable code points + lastCombining = 0; + *lastStable = pos; + continue; + } + + if (QChar::isHighSurrogate(uc)) { + ushort low = string[i + 1]; + if (!QChar::isLowSurrogate(low)) { + // treat an unpaired high surrogate like a stable code point + lastCombining = 0; + *lastStable = pos; + continue; + } + ++i; + uc = QChar::surrogateToUcs4(uc, low); + } + + const QUnicodeTables::Properties *p = qGetProp(uc); + + if (p->combiningClass < lastCombining && p->combiningClass > 0) + return false; // nonzero combining class lower than the preceding one + + const uchar check = (p->nfQuickCheck >> (mode << 1)) & 0x03; // two quick-check bits per form, selected by mode + if (check != NFQC_YES) + return false; // ### can we quick check NFQC_MAYBE ? 
+ + lastCombining = p->combiningClass; + if (lastCombining == 0) + *lastStable = pos; + } + + if (length != str->length()) // high surrogate parts at the end of text were left out of the loop + *lastStable = str->length() - 1; + + return true; +} + QT_END_NAMESPACE |