From 824180a12249e48c0e3280fec64940825ce0aa6e Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Tue, 5 Jun 2012 11:19:22 +0300 Subject: Set the whiteSpace flag outside the grapheme and the line breaking loop The white spaces determination doesn't belong to the text breaking algorithm. A proper breaking implementation shouldn't assume spaces are break opportunities (actually, space is allowed to be a grapheme base); However, the whiteSpace flag should never be checked alone while iterating over the text to find the space sequence; the grapheme boundaries should always be taken into account. This covers the SMP code points in UTF-16 text and graphemes that consist of a space followed with one or more grapheme extenders. This introduces a minor overhead that would be eliminated some later. Change-Id: Ic2cc7f485631fd0b436fc256ce112ded5f94fc07 Reviewed-by: Lars Knoll --- src/corelib/tools/qunicodetools.cpp | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) (limited to 'src/corelib') diff --git a/src/corelib/tools/qunicodetools.cpp b/src/corelib/tools/qunicodetools.cpp index e503ecd65a..a311213ede 100644 --- a/src/corelib/tools/qunicodetools.cpp +++ b/src/corelib/tools/qunicodetools.cpp @@ -155,12 +155,10 @@ static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_Char if (cls == QUnicodeTables::LineBreak_LF) cls = QUnicodeTables::LineBreak_BK; - attributes[0].whiteSpace = (cls == QUnicodeTables::LineBreak_SP || cls == QUnicodeTables::LineBreak_BK); attributes[0].charStop = true; int lcls = cls; for (quint32 i = 1; i < len; ++i) { - attributes[i].whiteSpace = false; attributes[i].charStop = true; uint ucs4 = string[i]; @@ -183,10 +181,6 @@ static void calcGraphemeAndLineBreaks(const ushort *string, quint32 len, HB_Char } } - // set white space and char stop flag - if (ncls >= QUnicodeTables::LineBreak_SP) - attributes[i].whiteSpace = true; - HB_LineBreakType lineBreakType = HB_NoBreak; if (cls >= QUnicodeTables::LineBreak_CR) { @@ -378,6 +372,24 @@ static void calcSentenceBreaks(const ushort *string, quint32 len, HB_CharAttribu } +static void getWhiteSpaces(const ushort *string, quint32 len, HB_CharAttributes *attributes) +{ + for (quint32 i = 0; i != len; ++i) { + uint ucs4 = string[i]; + if (QChar::isHighSurrogate(ucs4) && i + 1 != len) { + ushort low = string[i + 1]; + if (QChar::isLowSurrogate(low)) { + ucs4 = QChar::surrogateToUcs4(ucs4, low); + ++i; + } + } + + if (QChar::isSpace(ucs4)) + attributes[i].whiteSpace = true; + } +} + + Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, const HB_ScriptItem *items, int numItems, HB_CharAttributes *attributes, CharAttributeOptions options) @@ -391,12 +403,14 @@ Q_CORE_EXPORT void initCharAttributes(const ushort *string, int length, options |= GraphemeBreaks; } - if (options & (GraphemeBreaks | LineBreaks | WhiteSpaces)) + if (options & (GraphemeBreaks | LineBreaks)) calcGraphemeAndLineBreaks(string, length, attributes); if (options & WordBreaks) calcWordBreaks(string, length, attributes); if (options & SentenceBreaks) calcSentenceBreaks(string, length, attributes); + if (options & WhiteSpaces) + getWhiteSpaces(string, length, attributes); if (!items || numItems <= 0) return; -- cgit v1.2.3