From cbfdec66033d14020d3e8a49bacc0d12d2b6798e Mon Sep 17 00:00:00 2001 From: Konstantin Ritt Date: Thu, 10 May 2012 10:31:16 +0300 Subject: move the default text breaking algorithm impl from HarfBuzz to Qt there are several reasons to do this: * text breaking is not a shaper's job; * since the text breaking rules are bound to a specific Unicode version, updating Qt's internal unicode data would require updating the data in HB as well; * makes porting to HarfBuzz-NG somewhat easier Change-Id: I0bbf8e8a343bc074696f4ddf2ae4e7fa32a61629 Reviewed-by: Lars Knoll --- src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp | 340 +------------------------- 1 file changed, 5 insertions(+), 335 deletions(-) (limited to 'src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp') diff --git a/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp b/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp index f6900325bc..2e1b5322d2 100644 --- a/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp +++ b/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp @@ -32,205 +32,6 @@ #define HB_MIN(a, b) ((a) < (b) ? (a) : (b)) #define HB_MAX(a, b) ((a) > (b) ? (a) : (b)) -// ----------------------------------------------------------------------------------------------------- -// -// The line break algorithm. See http://www.unicode.org/reports/tr14/tr14-13.html -// -// ----------------------------------------------------------------------------------------------------- - -/* The Unicode algorithm does in our opinion allow line breaks at some - places they shouldn't be allowed. 
The following changes were thus - made in comparison to the Unicode reference: - - EX->AL from DB to IB - SY->AL from DB to IB - SY->PO from DB to IB - SY->PR from DB to IB - SY->OP from DB to IB - AL->PR from DB to IB - AL->PO from DB to IB - PR->PR from DB to IB - PO->PO from DB to IB - PR->PO from DB to IB - PO->PR from DB to IB - HY->PO from DB to IB - HY->PR from DB to IB - HY->OP from DB to IB - NU->EX from PB to IB - EX->PO from DB to IB -*/ - -// The following line break classes are not treated by the table: -// AI, BK, CB, CR, LF, NL, SA, SG, SP, XX - -enum break_class { - // the first 4 values have to agree with the enum in QCharAttributes - ProhibitedBreak, // PB in table - DirectBreak, // DB in table - IndirectBreak, // IB in table - CombiningIndirectBreak, // CI in table - CombiningProhibitedBreak // CP in table -}; -#define DB DirectBreak -#define IB IndirectBreak -#define CI CombiningIndirectBreak -#define CP CombiningProhibitedBreak -#define PB ProhibitedBreak - -static const hb_uint8 breakTable[HB_LineBreak_JT+1][HB_LineBreak_JT+1] = -{ -/* OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ H2 H3 JL JV JT */ -/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB }, -/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* IS */ { DB, 
PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB }, -/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB }, -/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB }, -/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB }, -/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }, -/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB }, -/* JV */ 
{ DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB }, -/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB } -}; -#undef DB -#undef IB -#undef CI -#undef CP -#undef PB - -static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] = -{ -// Other, CR, LF, Control,Extend,L, V, T, LV, LVT - { true , true , true , true , true , true , true , true , true , true }, // Other, - { true , true , true , true , true , true , true , true , true , true }, // CR, - { true , false, true , true , true , true , true , true , true , true }, // LF, - { true , true , true , true , true , true , true , true , true , true }, // Control, - { false, true , true , true , false, false, false, false, false, false }, // Extend, - { true , true , true , true , true , false, true , true , true , true }, // L, - { true , true , true , true , true , false, false, true , false, true }, // V, - { true , true , true , true , true , true , false, false, false, false }, // T, - { true , true , true , true , true , false, true , true , true , true }, // LV, - { true , true , true , true , true , false, true , true , true , true }, // LVT -}; - -static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttributes *charAttributes) -{ - if (!len) - return; - - // ##### can this fail if the first char is a surrogate? 
- HB_LineBreakClass cls; - HB_GraphemeClass grapheme; - HB_GetGraphemeAndLineBreakClass(*uc, &grapheme, &cls); - // handle case where input starts with an LF - if (cls == HB_LineBreak_LF) - cls = HB_LineBreak_BK; - - charAttributes[0].whiteSpace = (cls == HB_LineBreak_SP || cls == HB_LineBreak_BK); - charAttributes[0].charStop = true; - - int lcls = cls; - for (hb_uint32 i = 1; i < len; ++i) { - charAttributes[i].whiteSpace = false; - charAttributes[i].charStop = true; - - HB_UChar32 code = uc[i]; - HB_GraphemeClass ngrapheme; - HB_LineBreakClass ncls; - HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls); - charAttributes[i].charStop = graphemeTable[ngrapheme][grapheme]; - // handle surrogates - if (ncls == HB_LineBreak_SG) { - if (HB_IsHighSurrogate(uc[i]) && i < len - 1 && HB_IsLowSurrogate(uc[i+1])) { - continue; - } else if (HB_IsLowSurrogate(uc[i]) && HB_IsHighSurrogate(uc[i-1])) { - code = HB_SurrogateToUcs4(uc[i-1], uc[i]); - HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls); - charAttributes[i].charStop = false; - } else { - ncls = HB_LineBreak_AL; - } - } - - // set white space and char stop flag - if (ncls >= HB_LineBreak_SP) - charAttributes[i].whiteSpace = true; - - HB_LineBreakType lineBreakType = HB_NoBreak; - if (cls >= HB_LineBreak_LF) { - lineBreakType = HB_ForcedBreak; - } else if(cls == HB_LineBreak_CR) { - lineBreakType = (ncls == HB_LineBreak_LF) ? HB_NoBreak : HB_ForcedBreak; - } - - if (ncls == HB_LineBreak_SP) - goto next_no_cls_update; - if (ncls >= HB_LineBreak_CR) - goto next; - - { - int tcls = ncls; - // for south east asian chars that require a complex (dictionary analysis), the unicode - // standard recommends to treat them as AL. 
thai_attributes and other attribute methods that - // do dictionary analysis can override - if (tcls >= HB_LineBreak_SA) - tcls = HB_LineBreak_AL; - if (cls >= HB_LineBreak_SA) - cls = HB_LineBreak_AL; - - int brk = breakTable[cls][tcls]; - switch (brk) { - case DirectBreak: - lineBreakType = HB_Break; - if (uc[i-1] == 0xad) // soft hyphen - lineBreakType = HB_SoftHyphen; - break; - case IndirectBreak: - lineBreakType = (lcls == HB_LineBreak_SP) ? HB_Break : HB_NoBreak; - break; - case CombiningIndirectBreak: - lineBreakType = HB_NoBreak; - if (lcls == HB_LineBreak_SP){ - if (i > 1) - charAttributes[i-2].lineBreakType = HB_Break; - } else { - goto next_no_cls_update; - } - break; - case CombiningProhibitedBreak: - lineBreakType = HB_NoBreak; - if (lcls != HB_LineBreak_SP) - goto next_no_cls_update; - case ProhibitedBreak: - default: - break; - } - } - next: - cls = ncls; - next_no_cls_update: - lcls = ncls; - grapheme = ngrapheme; - charAttributes[i-1].lineBreakType = lineBreakType; - } - charAttributes[len-1].lineBreakType = HB_ForcedBreak; -} - // -------------------------------------------------------------------------------------------------------------------------------------------- // // Basic processing @@ -679,13 +480,12 @@ const HB_ScriptEngine HB_ScriptEngines[] = { { HB_ArabicShape, 0} }; -void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength, - const HB_ScriptItem *items, hb_uint32 numItems, - HB_CharAttributes *attributes) +void HB_GetTailoredCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength, + const HB_ScriptItem *items, hb_uint32 numItems, + HB_CharAttributes *attributes) { - memset(attributes, 0, stringLength * sizeof(HB_CharAttributes)); - calcLineBreaks(string, stringLength, attributes); - + if (stringLength == 0) + return; for (hb_uint32 i = 0; i < numItems; ++i) { HB_Script script = items[i].script; if (script == HB_Script_Inherited) @@ -698,136 +498,6 @@ void HB_GetCharAttributes(const HB_UChar16 *string, 
hb_uint32 stringLength, } -enum BreakRule { NoBreak = 0, Break = 1, Middle = 2 }; - -static const hb_uint8 wordbreakTable[HB_Word_ExtendNumLet + 1][HB_Word_ExtendNumLet + 1] = { -// Other Format Katakana ALetter MidLetter MidNum Numeric ExtendNumLet - { Break, Break, Break, Break, Break, Break, Break, Break }, // Other - { Break, Break, Break, Break, Break, Break, Break, Break }, // Format - { Break, Break, NoBreak, Break, Break, Break, Break, NoBreak }, // Katakana - { Break, Break, Break, NoBreak, Middle, Break, NoBreak, NoBreak }, // ALetter - { Break, Break, Break, Break, Break, Break, Break, Break }, // MidLetter - { Break, Break, Break, Break, Break, Break, Break, Break }, // MidNum - { Break, Break, Break, NoBreak, Break, Middle, NoBreak, NoBreak }, // Numeric - { Break, Break, NoBreak, NoBreak, Break, Break, NoBreak, NoBreak }, // ExtendNumLet -}; - -void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength, - const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/, - HB_CharAttributes *attributes) -{ - if (stringLength == 0) - return; - unsigned int brk = HB_GetWordClass(string[0]); - attributes[0].wordBoundary = true; - for (hb_uint32 i = 1; i < stringLength; ++i) { - if (!attributes[i].charStop) { - attributes[i].wordBoundary = false; - continue; - } - hb_uint32 nbrk = HB_GetWordClass(string[i]); - if (nbrk == HB_Word_Format) { - attributes[i].wordBoundary = (HB_GetSentenceClass(string[i-1]) == HB_Sentence_Sep); - continue; - } - BreakRule rule = (BreakRule)wordbreakTable[brk][nbrk]; - if (rule == Middle) { - rule = Break; - hb_uint32 lookahead = i + 1; - while (lookahead < stringLength) { - hb_uint32 testbrk = HB_GetWordClass(string[lookahead]); - if (testbrk == HB_Word_Format && HB_GetSentenceClass(string[lookahead]) != HB_Sentence_Sep) { - ++lookahead; - continue; - } - if (testbrk == brk) { - rule = NoBreak; - while (i < lookahead) - attributes[i++].wordBoundary = false; - nbrk = testbrk; - } - break; - } - } - 
attributes[i].wordBoundary = (rule == Break); - brk = nbrk; - } -} - - -enum SentenceBreakStates { - SB_Initial, - SB_Upper, - SB_UpATerm, - SB_ATerm, - SB_ATermC, - SB_ACS, - SB_STerm, - SB_STermC, - SB_SCS, - SB_BAfter, - SB_Break, - SB_Look -}; - -static const hb_uint8 sentenceBreakTable[HB_Sentence_Close + 1][HB_Sentence_Close + 1] = { -// Other Sep Format Sp Lower Upper OLetter Numeric ATerm STerm Close - { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_ATerm , SB_STerm , SB_Initial }, // SB_Initial, - { SB_Initial, SB_BAfter , SB_Upper , SB_Initial, SB_Initial, SB_Upper , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm , SB_Initial }, // SB_Upper - - { SB_Look , SB_BAfter , SB_UpATerm, SB_ACS , SB_Initial, SB_Upper , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_UpATerm - { SB_Look , SB_BAfter , SB_ATerm , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Initial, SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATerm - { SB_Look , SB_BAfter , SB_ATermC , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Look , SB_ATerm , SB_STerm , SB_ATermC }, // SB_ATermC, - { SB_Look , SB_BAfter , SB_ACS , SB_ACS , SB_Initial, SB_Break , SB_Break , SB_Look , SB_ATerm , SB_STerm , SB_Look }, // SB_ACS, - - { SB_Break , SB_BAfter , SB_STerm , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STerm, - { SB_Break , SB_BAfter , SB_STermC , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_STermC }, // SB_STermC, - { SB_Break , SB_BAfter , SB_SCS , SB_SCS , SB_Break , SB_Break , SB_Break , SB_Break , SB_ATerm , SB_STerm , SB_Break }, // SB_SCS, - { SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break , SB_Break }, // SB_BAfter, -}; - -void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength, - const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/, - HB_CharAttributes *attributes) 
-{ - if (stringLength == 0) - return; - hb_uint32 brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[0])]; - attributes[0].sentenceBoundary = true; - for (hb_uint32 i = 1; i < stringLength; ++i) { - if (!attributes[i].charStop) { - attributes[i].sentenceBoundary = false; - continue; - } - brk = sentenceBreakTable[brk][HB_GetSentenceClass(string[i])]; - if (brk == SB_Look) { - brk = SB_Break; - hb_uint32 lookahead = i + 1; - while (lookahead < stringLength) { - hb_uint32 sbrk = HB_GetSentenceClass(string[lookahead]); - if (sbrk != HB_Sentence_Other && sbrk != HB_Sentence_Numeric && sbrk != HB_Sentence_Close) { - break; - } else if (sbrk == HB_Sentence_Lower) { - brk = SB_Initial; - break; - } - ++lookahead; - } - if (brk == SB_Initial) { - while (i < lookahead) - attributes[i++].sentenceBoundary = false; - } - } - if (brk == SB_Break) { - attributes[i].sentenceBoundary = true; - brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[i])]; - } else { - attributes[i].sentenceBoundary = false; - } - } -} - - static inline char *tag_to_string(HB_UInt tag) { static char string[5]; -- cgit v1.2.3