move the default text breaking algorithm impl from HarfBuzz to Qt

there are several reasons to do this: * text breaking is not a shaper's job; * since the text breaking rules are bound to a specific Unicode version, updating Qt's internal unicode data would require updating the data in HB as well; * makes porting to HurfBuzz-NG some easier Change-Id: I0bbf8e8a343bc074696f4ddf2ae4e7fa32a61629 Reviewed-by: Lars Knoll <lars.knoll@nokia.com>
author: Konstantin Ritt <ritt.ks@gmail.com> 2012-05-10 10:31:16 +0300
committer: Qt by Nokia <qt-info@nokia.com> 2012-05-10 11:34:25 +0200
commit: cbfdec66033d14020d3e8a49bacc0d12d2b6798e (patch)
tree: 02e94eaad62c85f1556623b17412607102f067cb /src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp
parent: 8c0048a377568b646b3b87be0b02322fce68b780 (diff)
1 files changed, 5 insertions, 335 deletions
diff --git a/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp b/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp
index f6900325bc..2e1b5322d2 100644
--- a/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp
+++ b/src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp
@@ -32,205 +32,6 @@
 #define HB_MIN(a, b) ((a) < (b) ? (a) : (b))
 #define HB_MAX(a, b) ((a) > (b) ? (a) : (b))
 
-// -----------------------------------------------------------------------------------------------------
-//
-// The line break algorithm. See http://www.unicode.org/reports/tr14/tr14-13.html
-//
-// -----------------------------------------------------------------------------------------------------
-
-/* The Unicode algorithm does in our opinion allow line breaks at some
-   places they shouldn't be allowed. The following changes were thus
-   made in comparison to the Unicode reference:
-
-   EX->AL from DB to IB
-   SY->AL from DB to IB
-   SY->PO from DB to IB
-   SY->PR from DB to IB
-   SY->OP from DB to IB
-   AL->PR from DB to IB
-   AL->PO from DB to IB
-   PR->PR from DB to IB
-   PO->PO from DB to IB
-   PR->PO from DB to IB
-   PO->PR from DB to IB
-   HY->PO from DB to IB
-   HY->PR from DB to IB
-   HY->OP from DB to IB
-   NU->EX from PB to IB
-   EX->PO from DB to IB
-*/
-
-// The following line break classes are not treated by the table:
-//  AI, BK, CB, CR, LF, NL, SA, SG, SP, XX
-
-enum break_class {
-    // the first 4 values have to agree with the enum in QCharAttributes
-    ProhibitedBreak,            // PB in table
-    DirectBreak,                // DB in table
-    IndirectBreak,              // IB in table
-    CombiningIndirectBreak,     // CI in table
-    CombiningProhibitedBreak    // CP in table
-};
-#define DB DirectBreak
-#define IB IndirectBreak
-#define CI CombiningIndirectBreak
-#define CP CombiningProhibitedBreak
-#define PB ProhibitedBreak
-
-static const hb_uint8 breakTable[HB_LineBreak_JT+1][HB_LineBreak_JT+1] =
-{
-/*          OP  CL  QU  GL  NS  EX  SY  IS  PR  PO  NU  AL  ID  IN  HY  BA  BB  B2  ZW  CM  WJ  H2  H3  JL  JV  JT */
-/* OP */ { PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, PB, CP, PB, PB, PB, PB, PB, PB },
-/* CL */ { DB, PB, IB, IB, PB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* QU */ { PB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* GL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* NS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* EX */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* SY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* IS */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* PR */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, DB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* PO */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* NU */ { IB, PB, IB, IB, IB, IB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* AL */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* ID */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* IN */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* HY */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* BA */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* BB */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* B2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, DB, DB, DB, DB, IB, IB, DB, PB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* ZW */ { DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, DB, PB, DB, DB, DB, DB, DB, DB, DB },
-/* CM */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, DB, IB, IB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, DB },
-/* WJ */ { IB, PB, IB, IB, IB, PB, PB, PB, IB, IB, IB, IB, IB, IB, IB, IB, IB, IB, PB, CI, PB, IB, IB, IB, IB, IB },
-/* H2 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
-/* H3 */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB },
-/* JL */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, IB, IB, IB, IB, DB },
-/* JV */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, IB, IB },
-/* JT */ { DB, PB, IB, IB, IB, PB, PB, PB, DB, IB, DB, DB, DB, IB, IB, IB, DB, DB, PB, CI, PB, DB, DB, DB, DB, IB }
-};
-#undef DB
-#undef IB
-#undef CI
-#undef CP
-#undef PB
-
-static const hb_uint8 graphemeTable[HB_Grapheme_LVT + 1][HB_Grapheme_LVT + 1] =
-{
-//      Other, CR,    LF,    Control,Extend,L,    V,     T,     LV,    LVT
-    { true , true , true , true , true , true , true , true , true , true  }, // Other, 
-    { true , true , true , true , true , true , true , true , true , true  }, // CR,
-    { true , false, true , true , true , true , true , true , true , true  }, // LF,
-    { true , true , true , true , true , true , true , true , true , true  }, // Control,
-    { false, true , true , true , false, false, false, false, false, false }, // Extend,
-    { true , true , true , true , true , false, true , true , true , true  }, // L, 
-    { true , true , true , true , true , false, false, true , false, true  }, // V, 
-    { true , true , true , true , true , true , false, false, false, false }, // T, 
-    { true , true , true , true , true , false, true , true , true , true  }, // LV, 
-    { true , true , true , true , true , false, true , true , true , true  }, // LVT
-};
-    
-static void calcLineBreaks(const HB_UChar16 *uc, hb_uint32 len, HB_CharAttributes *charAttributes)
-{
-    if (!len)
-        return;
-
-    // ##### can this fail if the first char is a surrogate?
-    HB_LineBreakClass cls;
-    HB_GraphemeClass grapheme;
-    HB_GetGraphemeAndLineBreakClass(*uc, &grapheme, &cls);
-    // handle case where input starts with an LF
-    if (cls == HB_LineBreak_LF)
-        cls = HB_LineBreak_BK;
-
-    charAttributes[0].whiteSpace = (cls == HB_LineBreak_SP || cls == HB_LineBreak_BK);
-    charAttributes[0].charStop = true;
-
-    int lcls = cls;
-    for (hb_uint32 i = 1; i < len; ++i) {
-        charAttributes[i].whiteSpace = false;
-        charAttributes[i].charStop = true;
-
-        HB_UChar32 code = uc[i];
-        HB_GraphemeClass ngrapheme;
-        HB_LineBreakClass ncls;
-        HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
-        charAttributes[i].charStop = graphemeTable[ngrapheme][grapheme];
-        // handle surrogates
-        if (ncls == HB_LineBreak_SG) {
-            if (HB_IsHighSurrogate(uc[i]) && i < len - 1 && HB_IsLowSurrogate(uc[i+1])) {
-                continue;
-            } else if (HB_IsLowSurrogate(uc[i]) && HB_IsHighSurrogate(uc[i-1])) {
-                code = HB_SurrogateToUcs4(uc[i-1], uc[i]);
-                HB_GetGraphemeAndLineBreakClass(code, &ngrapheme, &ncls);
-                charAttributes[i].charStop = false;
-            } else {
-                ncls = HB_LineBreak_AL;
-            }
-        }
-
-        // set white space and char stop flag
-        if (ncls >= HB_LineBreak_SP)
-            charAttributes[i].whiteSpace = true;
-
-        HB_LineBreakType lineBreakType = HB_NoBreak;
-        if (cls >= HB_LineBreak_LF) {
-            lineBreakType = HB_ForcedBreak;
-        } else if(cls == HB_LineBreak_CR) {
-            lineBreakType = (ncls == HB_LineBreak_LF) ? HB_NoBreak : HB_ForcedBreak;
-        }
-
-        if (ncls == HB_LineBreak_SP)
-            goto next_no_cls_update;
-        if (ncls >= HB_LineBreak_CR)
-            goto next;
-
-        {
-            int tcls = ncls;
-            // for south east asian chars that require a complex (dictionary analysis), the unicode
-            // standard recommends to treat them as AL. thai_attributes and other attribute methods that
-            // do dictionary analysis can override
-            if (tcls >= HB_LineBreak_SA)
-                tcls = HB_LineBreak_AL;
-            if (cls >= HB_LineBreak_SA)
-                cls = HB_LineBreak_AL;
-
-            int brk = breakTable[cls][tcls];
-            switch (brk) {
-            case DirectBreak:
-                lineBreakType = HB_Break;
-                if (uc[i-1] == 0xad) // soft hyphen
-                    lineBreakType = HB_SoftHyphen;
-                break;
-            case IndirectBreak:
-                lineBreakType = (lcls == HB_LineBreak_SP) ? HB_Break : HB_NoBreak;
-                break;
-            case CombiningIndirectBreak:
-                lineBreakType = HB_NoBreak;
-                if (lcls == HB_LineBreak_SP){
-                    if (i > 1)
-                        charAttributes[i-2].lineBreakType = HB_Break;
-                } else {
-                    goto next_no_cls_update;
-                }
-                break;
-            case CombiningProhibitedBreak:
-                lineBreakType = HB_NoBreak;
-                if (lcls != HB_LineBreak_SP)
-                    goto next_no_cls_update;
-            case ProhibitedBreak:
-            default:
-                break;
-            }
-        }
-    next:
-        cls = ncls;
-    next_no_cls_update:
-        lcls = ncls;
-        grapheme = ngrapheme;
-        charAttributes[i-1].lineBreakType = lineBreakType;
-    }
-    charAttributes[len-1].lineBreakType = HB_ForcedBreak;
-}
-
 // --------------------------------------------------------------------------------------------------------------------------------------------
 //
 // Basic processing
@@ -679,13 +480,12 @@ const HB_ScriptEngine HB_ScriptEngines[] = {
     { HB_ArabicShape, 0}
 };
 
-void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
-                          const HB_ScriptItem *items, hb_uint32 numItems,
-                          HB_CharAttributes *attributes)
+void HB_GetTailoredCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
+                                  const HB_ScriptItem *items, hb_uint32 numItems,
+                                  HB_CharAttributes *attributes)
 {
-    memset(attributes, 0, stringLength * sizeof(HB_CharAttributes));
-    calcLineBreaks(string, stringLength, attributes);
-
+    if (stringLength == 0)
+        return;
     for (hb_uint32 i = 0; i < numItems; ++i) {
         HB_Script script = items[i].script;
         if (script == HB_Script_Inherited)
@@ -698,136 +498,6 @@ void HB_GetCharAttributes(const HB_UChar16 *string, hb_uint32 stringLength,
 }
 
 
-enum BreakRule { NoBreak = 0, Break = 1, Middle = 2 };
-
-static const hb_uint8 wordbreakTable[HB_Word_ExtendNumLet + 1][HB_Word_ExtendNumLet + 1] = {
-//        Other    Format   Katakana ALetter  MidLetter MidNum  Numeric  ExtendNumLet
-    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // Other
-    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // Format 
-    {   Break,   Break, NoBreak,   Break,   Break,   Break,   Break, NoBreak }, // Katakana
-    {   Break,   Break,   Break, NoBreak,  Middle,   Break, NoBreak, NoBreak }, // ALetter
-    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // MidLetter
-    {   Break,   Break,   Break,   Break,   Break,   Break,   Break,   Break }, // MidNum
-    {   Break,   Break,   Break, NoBreak,   Break,  Middle, NoBreak, NoBreak }, // Numeric
-    {   Break,   Break, NoBreak, NoBreak,   Break,   Break, NoBreak, NoBreak }, // ExtendNumLet
-};
-
-void HB_GetWordBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
-                          const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
-                          HB_CharAttributes *attributes)
-{
-    if (stringLength == 0)
-        return;
-    unsigned int brk = HB_GetWordClass(string[0]);
-    attributes[0].wordBoundary = true;
-    for (hb_uint32 i = 1; i < stringLength; ++i) {
-        if (!attributes[i].charStop) {
-            attributes[i].wordBoundary = false;
-            continue;
-        }
-        hb_uint32 nbrk = HB_GetWordClass(string[i]);
-        if (nbrk == HB_Word_Format) {
-            attributes[i].wordBoundary = (HB_GetSentenceClass(string[i-1]) == HB_Sentence_Sep);
-            continue;
-        }
-        BreakRule rule = (BreakRule)wordbreakTable[brk][nbrk];
-        if (rule == Middle) {
-            rule = Break;
-            hb_uint32 lookahead = i + 1;
-            while (lookahead < stringLength) {
-                hb_uint32 testbrk = HB_GetWordClass(string[lookahead]);
-                if (testbrk == HB_Word_Format && HB_GetSentenceClass(string[lookahead]) != HB_Sentence_Sep) {
-                    ++lookahead;
-                    continue;
-                }
-                if (testbrk == brk) {
-                    rule = NoBreak;
-                    while (i < lookahead)
-                        attributes[i++].wordBoundary = false;
-                    nbrk = testbrk;
-                }
-                break;
-            }
-        }
-        attributes[i].wordBoundary = (rule == Break);
-        brk = nbrk;
-    }
-}
-
-
-enum SentenceBreakStates {
-    SB_Initial,
-    SB_Upper,
-    SB_UpATerm, 
-    SB_ATerm,
-    SB_ATermC, 
-    SB_ACS, 
-    SB_STerm, 
-    SB_STermC, 
-    SB_SCS,
-    SB_BAfter, 
-    SB_Break,
-    SB_Look
-};
-
-static const hb_uint8 sentenceBreakTable[HB_Sentence_Close + 1][HB_Sentence_Close + 1] = {
-//        Other       Sep         Format      Sp          Lower       Upper       OLetter     Numeric     ATerm       STerm       Close
-      { SB_Initial, SB_BAfter , SB_Initial, SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_ATerm  , SB_STerm  , SB_Initial }, // SB_Initial,
-      { SB_Initial, SB_BAfter , SB_Upper  , SB_Initial, SB_Initial, SB_Upper  , SB_Initial, SB_Initial, SB_UpATerm, SB_STerm  , SB_Initial }, // SB_Upper
-      
-      { SB_Look   , SB_BAfter , SB_UpATerm, SB_ACS    , SB_Initial, SB_Upper  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_UpATerm
-      { SB_Look   , SB_BAfter , SB_ATerm  , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Initial, SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATerm
-      { SB_Look   , SB_BAfter , SB_ATermC , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Look   , SB_ATerm  , SB_STerm  , SB_ATermC  }, // SB_ATermC,
-      { SB_Look   , SB_BAfter , SB_ACS    , SB_ACS    , SB_Initial, SB_Break  , SB_Break  , SB_Look   , SB_ATerm  , SB_STerm  , SB_Look    }, // SB_ACS,
-      
-      { SB_Break  , SB_BAfter , SB_STerm  , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STerm,
-      { SB_Break  , SB_BAfter , SB_STermC , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_STermC  }, // SB_STermC,
-      { SB_Break  , SB_BAfter , SB_SCS    , SB_SCS    , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_ATerm  , SB_STerm  , SB_Break   }, // SB_SCS,
-      { SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break  , SB_Break   }, // SB_BAfter,
-};
-
-void HB_GetSentenceBoundaries(const HB_UChar16 *string, hb_uint32 stringLength,
-                              const HB_ScriptItem * /*items*/, hb_uint32 /*numItems*/,
-                              HB_CharAttributes *attributes)
-{
-    if (stringLength == 0)
-        return;
-    hb_uint32 brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[0])];
-    attributes[0].sentenceBoundary = true;
-    for (hb_uint32 i = 1; i < stringLength; ++i) {
-        if (!attributes[i].charStop) {
-            attributes[i].sentenceBoundary = false;
-            continue;
-        }
-        brk = sentenceBreakTable[brk][HB_GetSentenceClass(string[i])];
-        if (brk == SB_Look) {
-            brk = SB_Break;
-            hb_uint32 lookahead = i + 1;
-            while (lookahead < stringLength) {
-                hb_uint32 sbrk = HB_GetSentenceClass(string[lookahead]);
-                if (sbrk != HB_Sentence_Other && sbrk != HB_Sentence_Numeric && sbrk != HB_Sentence_Close) {
-                    break;
-                } else if (sbrk == HB_Sentence_Lower) {
-                    brk = SB_Initial;
-                    break;
-                }
-                ++lookahead;
-            }
-            if (brk == SB_Initial) {
-                while (i < lookahead)
-                    attributes[i++].sentenceBoundary = false;
-            }
-        }
-        if (brk == SB_Break) {
-            attributes[i].sentenceBoundary = true;
-            brk = sentenceBreakTable[SB_Initial][HB_GetSentenceClass(string[i])];
-        } else {
-            attributes[i].sentenceBoundary = false;
-        }
-    }
-}
-
-
 static inline char *tag_to_string(HB_UInt tag)
 {
     static char string[5];
author	Konstantin Ritt <ritt.ks@gmail.com>	2012-05-10 10:31:16 +0300
committer	Qt by Nokia <qt-info@nokia.com>	2012-05-10 11:34:25 +0200
commit	cbfdec66033d14020d3e8a49bacc0d12d2b6798e (patch)
tree	02e94eaad62c85f1556623b17412607102f067cb /src/3rdparty/harfbuzz/src/harfbuzz-shaper.cpp
parent	8c0048a377568b646b3b87be0b02322fce68b780 (diff)