diff options
Diffstat (limited to 'util/unicode/main.cpp')
-rw-r--r-- | util/unicode/main.cpp | 2786 |
1 files changed, 2786 insertions, 0 deletions
diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp new file mode 100644 index 0000000000..3384e62ff1 --- /dev/null +++ b/util/unicode/main.cpp @@ -0,0 +1,2786 @@ +/**************************************************************************** +** +** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies). +** All rights reserved. +** Contact: Nokia Corporation (qt-info@nokia.com) +** +** This file is part of the utils of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** No Commercial Usage +** This file contains pre-release code and may not be distributed. +** You may use this file in accordance with the terms and conditions +** contained in the Technology Preview License Agreement accompanying +** this package. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 2.1 requirements +** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** In addition, as a special exception, Nokia gives you certain additional +** rights. These rights are described in the Nokia Qt LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. +** +** If you have questions regarding the use of this file, please contact +** Nokia at qt-info@nokia.com. +** +** +** +** +** +** +** +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include <qlist.h> +#include <qhash.h> +#include <qfile.h> +#include <qbytearray.h> +#include <qstring.h> +#include <qchar.h> +#include <qvector.h> +#include <qdebug.h> +#if 0 +#include <private/qunicodetables_p.h> +#endif + +#define DATA_VERSION_S "5.0" +#define DATA_VERSION_STR "QChar::Unicode_5_0" + +#define LAST_CODEPOINT 0x10ffff +#define LAST_CODEPOINT_STR "0x10ffff" + + +static QHash<QByteArray, QChar::UnicodeVersion> age_map; + +static void initAgeMap() +{ + struct AgeMap { + const QChar::UnicodeVersion version; + const char *age; + } ageMap[] = { + { QChar::Unicode_1_1, "1.1" }, + { QChar::Unicode_2_0, "2.0" }, + { QChar::Unicode_2_1_2, "2.1" }, + { QChar::Unicode_3_0, "3.0" }, + { QChar::Unicode_3_1, "3.1" }, + { QChar::Unicode_3_2, "3.2" }, + { QChar::Unicode_4_0, "4.0" }, + { QChar::Unicode_4_1, "4.1" }, + { QChar::Unicode_5_0, "5.0" }, + { QChar::Unicode_Unassigned, 0 } + }; + AgeMap *d = ageMap; + while (d->age) { + age_map.insert(d->age, d->version); + ++d; + } +} + + +enum Joining { + Joining_None, + Joining_Left, + Joining_Causing, + Joining_Dual, + Joining_Right, + Joining_Transparent + + , Joining_Unassigned +}; + +static QHash<QByteArray, Joining> joining_map; + +static void initJoiningMap() +{ + struct JoiningList { + Joining joining; + const char *name; + } joinings[] = { + { Joining_None, "U" }, + { Joining_Left, "L" }, + { Joining_Causing, "C" }, + { Joining_Dual, "D" }, + { Joining_Right, "R" }, + { Joining_Transparent, "T" }, + { Joining_Unassigned, 0 } + }; + JoiningList *d = joinings; + while (d->name) { + joining_map.insert(d->name, d->joining); + ++d; + } +} + + +static const char *grapheme_break_string = + " enum GraphemeBreak {\n" + " GraphemeBreakOther,\n" + " GraphemeBreakCR,\n" + " GraphemeBreakLF,\n" + " GraphemeBreakControl,\n" + " GraphemeBreakExtend,\n" + " GraphemeBreakL,\n" + " GraphemeBreakV,\n" + " GraphemeBreakT,\n" + " GraphemeBreakLV,\n" + " GraphemeBreakLVT\n" + " };\n\n"; + +enum GraphemeBreak { + GraphemeBreakOther, + GraphemeBreakCR, + GraphemeBreakLF, + GraphemeBreakControl, + GraphemeBreakExtend, + GraphemeBreakL, + GraphemeBreakV, + GraphemeBreakT, + GraphemeBreakLV, + GraphemeBreakLVT + + , GraphemeBreak_Unassigned +}; + +static QHash<QByteArray, GraphemeBreak> grapheme_break_map; + +static void initGraphemeBreak() +{ + struct GraphemeBreakList { + GraphemeBreak brk; + const char *name; + } breaks[] = { + { GraphemeBreakOther, "Other" }, + { GraphemeBreakCR, "CR" }, + { GraphemeBreakLF, "LF" }, + { GraphemeBreakControl, "Control" }, + { GraphemeBreakExtend, "Extend" }, + { GraphemeBreakL, "L" }, + { GraphemeBreakV, "V" }, + { GraphemeBreakT, "T" }, + { GraphemeBreakLV, "LV" }, + { GraphemeBreakLVT, "LVT" }, + { GraphemeBreak_Unassigned, 0 } + }; + GraphemeBreakList *d = breaks; + while (d->name) { + grapheme_break_map.insert(d->name, d->brk); + ++d; + } +} + + +static const char *word_break_string = + " enum WordBreak {\n" + " WordBreakOther,\n" + " WordBreakFormat,\n" + " WordBreakKatakana,\n" + " WordBreakALetter,\n" + " WordBreakMidLetter,\n" + " WordBreakMidNum,\n" + " WordBreakNumeric,\n" + " WordBreakExtendNumLet\n" + " };\n\n"; + +enum WordBreak { + WordBreakOther, + WordBreakFormat, + WordBreakKatakana, + WordBreakALetter, + WordBreakMidLetter, + WordBreakMidNum, + WordBreakNumeric, + WordBreakExtendNumLet + + , WordBreak_Unassigned +}; + +static QHash<QByteArray, WordBreak> word_break_map; + +static void initWordBreak() +{ + struct WordBreakList { + WordBreak brk; + const char *name; + } breaks[] = { + { WordBreakFormat, "Format" }, + { WordBreakFormat, "Extend" }, // these are copied in from GraphemeBreakProperty.txt + { WordBreakKatakana, "Katakana" }, + { WordBreakALetter, "ALetter" }, + { WordBreakMidLetter, "MidLetter" }, + { WordBreakMidNum, "MidNum" }, + { WordBreakNumeric, "Numeric" }, + { WordBreakExtendNumLet, "ExtendNumLet" }, + { WordBreak_Unassigned, 0 } + }; + WordBreakList *d = breaks; + while (d->name) { + word_break_map.insert(d->name, d->brk); + ++d; + } +} + + +static const char *sentence_break_string = + " enum SentenceBreak {\n" + " SentenceBreakOther,\n" + " SentenceBreakSep,\n" + " SentenceBreakFormat,\n" + " SentenceBreakSp,\n" + " SentenceBreakLower,\n" + " SentenceBreakUpper,\n" + " SentenceBreakOLetter,\n" + " SentenceBreakNumeric,\n" + " SentenceBreakATerm,\n" + " SentenceBreakSTerm,\n" + " SentenceBreakClose\n" + " };\n\n"; + +enum SentenceBreak { + SentenceBreakOther, + SentenceBreakSep, + SentenceBreakFormat, + SentenceBreakSp, + SentenceBreakLower, + SentenceBreakUpper, + SentenceBreakOLetter, + SentenceBreakNumeric, + SentenceBreakATerm, + SentenceBreakSTerm, + SentenceBreakClose + + , SentenceBreak_Unassigned +}; + +static QHash<QByteArray, SentenceBreak> sentence_break_map; + +static void initSentenceBreak() +{ + struct SentenceBreakList { + SentenceBreak brk; + const char *name; + } breaks[] = { + { SentenceBreakOther, "Other" }, + { SentenceBreakSep, "Sep" }, + { SentenceBreakFormat, "Format" }, + { SentenceBreakSp, "Sp" }, + { SentenceBreakLower, "Lower" }, + { SentenceBreakUpper, "Upper" }, + { SentenceBreakOLetter, "OLetter" }, + { SentenceBreakNumeric, "Numeric" }, + { SentenceBreakATerm, "ATerm" }, + { SentenceBreakSTerm, "STerm" }, + { SentenceBreakClose, "Close" }, + { SentenceBreak_Unassigned, 0 } + }; + SentenceBreakList *d = breaks; + while (d->name) { + sentence_break_map.insert(d->name, d->brk); + ++d; + } +} + + +static const char *lineBreakClass = + " // see http://www.unicode.org/reports/tr14/tr14-19.html\n" + " // we don't use the XX, AI and CB properties and map them to AL instead.\n" + " // as we don't support any EBDIC based OS'es, NL is ignored and mapped to AL as well.\n" + " enum LineBreakClass {\n" + " LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS,\n" + " LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO,\n" + " LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY,\n" + " LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM,\n" + " LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV,\n" + " LineBreak_JT, LineBreak_SA, LineBreak_SG,\n" + " LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK\n" + " };\n\n"; + +enum LineBreakClass { + LineBreak_OP, LineBreak_CL, LineBreak_QU, LineBreak_GL, LineBreak_NS, + LineBreak_EX, LineBreak_SY, LineBreak_IS, LineBreak_PR, LineBreak_PO, + LineBreak_NU, LineBreak_AL, LineBreak_ID, LineBreak_IN, LineBreak_HY, + LineBreak_BA, LineBreak_BB, LineBreak_B2, LineBreak_ZW, LineBreak_CM, + LineBreak_WJ, LineBreak_H2, LineBreak_H3, LineBreak_JL, LineBreak_JV, + LineBreak_JT, LineBreak_SA, LineBreak_SG, + LineBreak_SP, LineBreak_CR, LineBreak_LF, LineBreak_BK + + , LineBreak_Unassigned +}; + +static QHash<QByteArray, LineBreakClass> line_break_map; + +static void initLineBreak() +{ + // ### Classes XX and AI are left out and mapped to AL for now; + // ### Class NL is ignored and mapped to AL as well. + struct LineBreakList { + LineBreakClass brk; + const char *name; + } breaks[] = { + { LineBreak_BK, "BK" }, + { LineBreak_CR, "CR" }, + { LineBreak_LF, "LF" }, + { LineBreak_CM, "CM" }, + { LineBreak_AL, "NL" }, + { LineBreak_SG, "SG" }, + { LineBreak_WJ, "WJ" }, + { LineBreak_ZW, "ZW" }, + { LineBreak_GL, "GL" }, + { LineBreak_SP, "SP" }, + { LineBreak_B2, "B2" }, + { LineBreak_BA, "BA" }, + { LineBreak_BB, "BB" }, + { LineBreak_HY, "HY" }, + { LineBreak_AL, "CB" }, // ### + { LineBreak_CL, "CL" }, + { LineBreak_EX, "EX" }, + { LineBreak_IN, "IN" }, + { LineBreak_NS, "NS" }, + { LineBreak_OP, "OP" }, + { LineBreak_QU, "QU" }, + { LineBreak_IS, "IS" }, + { LineBreak_NU, "NU" }, + { LineBreak_PO, "PO" }, + { LineBreak_PR, "PR" }, + { LineBreak_SY, "SY" }, + { LineBreak_AL, "AI" }, + { LineBreak_AL, "AL" }, + { LineBreak_H2, "H2" }, + { LineBreak_H3, "H3" }, + { LineBreak_ID, "ID" }, + { LineBreak_JL, "JL" }, + { LineBreak_JV, "JV" }, + { LineBreak_JT, "JT" }, + { LineBreak_SA, "SA" }, + { LineBreak_AL, "XX" }, + { LineBreak_Unassigned, 0 } + }; + LineBreakList *d = breaks; + while (d->name) { + line_break_map.insert(d->name, d->brk); + ++d; + } +} + + +// Keep this one in sync with the code in createPropertyInfo +static const char *property_string = + " struct Properties {\n" + " ushort category : 8; /* 5 needed */\n" + " ushort line_break_class : 8; /* 6 needed */\n" + " ushort direction : 8; /* 5 needed */\n" + " ushort combiningClass : 8;\n" + " ushort joining : 2;\n" + " signed short digitValue : 6; /* 5 needed */\n" + " ushort unicodeVersion : 4;\n" + " ushort lowerCaseSpecial : 1;\n" + " ushort upperCaseSpecial : 1;\n" + " ushort titleCaseSpecial : 1;\n" + " ushort caseFoldSpecial : 1; /* currently unused */\n" + " signed short mirrorDiff : 16;\n" + " signed short lowerCaseDiff : 16;\n" + " signed short upperCaseDiff : 16;\n" + " signed short titleCaseDiff : 16;\n" + " signed short caseFoldDiff : 16;\n" + " ushort graphemeBreak : 8; /* 4 needed */\n" + " ushort wordBreak : 8; /* 4 needed */\n" + " ushort sentenceBreak : 8; /* 4 needed */\n" + " };\n" + " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(uint ucs4);\n" + " Q_CORE_EXPORT const Properties * QT_FASTCALL properties(ushort ucs2);\n"; + +static const char *methods = + " Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL lineBreakClass(uint ucs4);\n" + " inline int lineBreakClass(const QChar &ch)\n" + " { return lineBreakClass(ch.unicode()); }\n" + "\n" + " Q_CORE_EXPORT int QT_FASTCALL script(uint ucs4);\n" + " inline int script(const QChar &ch)\n" + " { return script(ch.unicode()); }\n\n"; + +static const int SizeOfPropertiesStruct = 20; + +struct PropertyFlags { + bool operator ==(const PropertyFlags &o) { + return (combiningClass == o.combiningClass + && category == o.category + && direction == o.direction + && joining == o.joining + && age == o.age + && digitValue == o.digitValue + && line_break_class == o.line_break_class + && mirrorDiff == o.mirrorDiff + && lowerCaseDiff == o.lowerCaseDiff + && upperCaseDiff == o.upperCaseDiff + && titleCaseDiff == o.titleCaseDiff + && caseFoldDiff == o.caseFoldDiff + && lowerCaseSpecial == o.lowerCaseSpecial + && upperCaseSpecial == o.upperCaseSpecial + && titleCaseSpecial == o.titleCaseSpecial + && caseFoldSpecial == o.caseFoldSpecial + && graphemeBreak == o.graphemeBreak + && wordBreak == o.wordBreak + && sentenceBreak == o.sentenceBreak + ); + } + // from UnicodeData.txt + uchar combiningClass : 8; + QChar::Category category : 5; + QChar::Direction direction : 5; + // from ArabicShaping.txt + QChar::Joining joining : 2; + // from DerivedAge.txt + QChar::UnicodeVersion age : 4; + int digitValue; + LineBreakClass line_break_class; + + int mirrorDiff : 16; + + int lowerCaseDiff; + int upperCaseDiff; + int titleCaseDiff; + int caseFoldDiff; + bool lowerCaseSpecial; + bool upperCaseSpecial; + bool titleCaseSpecial; + bool caseFoldSpecial; + GraphemeBreak graphemeBreak; + WordBreak wordBreak; + SentenceBreak sentenceBreak; +}; + + +static QList<int> specialCaseMap; +static int specialCaseMaxLen = 0; + +static int appendToSpecialCaseMap(const QList<int> &map) +{ + QList<int> utf16map; + for (int i = 0; i < map.size(); ++i) { + int val = map.at(i); + if (val >= 0x10000) { + utf16map << QChar::highSurrogate(val); + utf16map << QChar::lowSurrogate(val); + } else { + utf16map << val; + } + } + specialCaseMaxLen = qMax(specialCaseMaxLen, utf16map.size()); + utf16map << 0; + + for (int i = 0; i < specialCaseMap.size() - utf16map.size() + 1; ++i) { + int j; + for (j = 0; j < utf16map.size(); ++j) { + if (specialCaseMap.at(i+j) != utf16map.at(j)) + break; + } + if (j == utf16map.size()) + return i; + } + + int pos = specialCaseMap.size(); + specialCaseMap << utf16map; + return pos; +} + +struct UnicodeData { + UnicodeData(int codepoint = 0) { + p.category = QChar::Other_NotAssigned; // Cn + p.combiningClass = 0; + + p.direction = QChar::DirL; + // DerivedBidiClass.txt + // DirR for: U+0590..U+05FF, U+07C0..U+08FF, U+FB1D..U+FB4F, U+10800..U+10FFF + if ((codepoint >= 0x590 && codepoint <= 0x5ff) + || (codepoint >= 0x7c0 && codepoint <= 0x8ff) + || (codepoint >= 0xfb1d && codepoint <= 0xfb4f) + || (codepoint >= 0x10800 && codepoint <= 0x10fff)) { + p.direction = QChar::DirR; + } + // DirAL for: U+0600..U+07BF, U+FB50..U+FDFF, U+FE70..U+FEFF + // minus noncharacter code points (intersects with U+FDD0..U+FDEF) + if ((codepoint >= 0x600 && codepoint <= 0x7bf) + || (codepoint >= 0xfb50 && codepoint <= 0xfdcf) + || (codepoint >= 0xfdf0 && codepoint <= 0xfdff) + || (codepoint >= 0xfe70 && codepoint <= 0xfeff)) { + p.direction = QChar::DirAL; + } + + mirroredChar = 0; + decompositionType = QChar::NoDecomposition; + p.joining = QChar::OtherJoining; + p.age = QChar::Unicode_Unassigned; + p.mirrorDiff = 0; + p.digitValue = -1; + p.line_break_class = LineBreak_AL; // XX -> AL + p.lowerCaseDiff = 0; + p.upperCaseDiff = 0; + p.titleCaseDiff = 0; + p.caseFoldDiff = 0; + p.lowerCaseSpecial = 0; + p.upperCaseSpecial = 0; + p.titleCaseSpecial = 0; + p.caseFoldSpecial = 0; + p.graphemeBreak = GraphemeBreakOther; + p.wordBreak = WordBreakOther; + p.sentenceBreak = SentenceBreakOther; + propertyIndex = -1; + excludedComposition = false; + } + PropertyFlags p; + + // from UnicodeData.txt + QChar::Decomposition decompositionType; + QList<int> decomposition; + + QList<int> specialFolding; + + // from BidiMirroring.txt + int mirroredChar; + + // DerivedNormalizationProps.txt + bool excludedComposition; + + // computed position of unicode property set + int propertyIndex; +}; + +enum UniDataFields { + UD_Value, + UD_Name, + UD_Category, + UD_CombiningClass, + UD_BidiCategory, + UD_Decomposition, + UD_DecimalDigitValue, + UD_DigitValue, + UD_NumericValue, + UD_Mirrored, + UD_OldName, + UD_Comment, + UD_UpperCase, + UD_LowerCase, + UD_TitleCase +}; + + +static QHash<QByteArray, QChar::Category> categoryMap; + +static void initCategoryMap() +{ + struct Cat { + QChar::Category cat; + const char *name; + } categories[] = { + { QChar::Mark_NonSpacing, "Mn" }, + { QChar::Mark_SpacingCombining, "Mc" }, + { QChar::Mark_Enclosing, "Me" }, + + { QChar::Number_DecimalDigit, "Nd" }, + { QChar::Number_Letter, "Nl" }, + { QChar::Number_Other, "No" }, + + { QChar::Separator_Space, "Zs" }, + { QChar::Separator_Line, "Zl" }, + { QChar::Separator_Paragraph, "Zp" }, + + { QChar::Other_Control, "Cc" }, + { QChar::Other_Format, "Cf" }, + { QChar::Other_Surrogate, "Cs" }, + { QChar::Other_PrivateUse, "Co" }, + { QChar::Other_NotAssigned, "Cn" }, + + { QChar::Letter_Uppercase, "Lu" }, + { QChar::Letter_Lowercase, "Ll" }, + { QChar::Letter_Titlecase, "Lt" }, + { QChar::Letter_Modifier, "Lm" }, + { QChar::Letter_Other, "Lo" }, + + { QChar::Punctuation_Connector, "Pc" }, + { QChar::Punctuation_Dash, "Pd" }, + { QChar::Punctuation_Open, "Ps" }, + { QChar::Punctuation_Close, "Pe" }, + { QChar::Punctuation_InitialQuote, "Pi" }, + { QChar::Punctuation_FinalQuote, "Pf" }, + { QChar::Punctuation_Other, "Po" }, + + { QChar::Symbol_Math, "Sm" }, + { QChar::Symbol_Currency, "Sc" }, + { QChar::Symbol_Modifier, "Sk" }, + { QChar::Symbol_Other, "So" }, + { QChar::NoCategory, 0 } + }; + Cat *c = categories; + while (c->name) { + categoryMap.insert(c->name, c->cat); + ++c; + } +} + + +static QHash<QByteArray, QChar::Direction> directionMap; + +static void initDirectionMap() +{ + struct Dir { + QChar::Direction dir; + const char *name; + } directions[] = { + { QChar::DirL, "L" }, + { QChar::DirR, "R" }, + { QChar::DirEN, "EN" }, + { QChar::DirES, "ES" }, + { QChar::DirET, "ET" }, + { QChar::DirAN, "AN" }, + { QChar::DirCS, "CS" }, + { QChar::DirB, "B" }, + { QChar::DirS, "S" }, + { QChar::DirWS, "WS" }, + { QChar::DirON, "ON" }, + { QChar::DirLRE, "LRE" }, + { QChar::DirLRO, "LRO" }, + { QChar::DirAL, "AL" }, + { QChar::DirRLE, "RLE" }, + { QChar::DirRLO, "RLO" }, + { QChar::DirPDF, "PDF" }, + { QChar::DirNSM, "NSM" }, + { QChar::DirBN, "BN" }, + { QChar::DirL, 0 } + }; + Dir *d = directions; + while (d->name) { + directionMap.insert(d->name, d->dir); + ++d; + } +} + + +static QHash<QByteArray, QChar::Decomposition> decompositionMap; + +static void initDecompositionMap() +{ + struct Dec { + QChar::Decomposition dec; + const char *name; + } decompositions[] = { + { QChar::Canonical, "<canonical>" }, + { QChar::Font, "<font>" }, + { QChar::NoBreak, "<noBreak>" }, + { QChar::Initial, "<initial>" }, + { QChar::Medial, "<medial>" }, + { QChar::Final, "<final>" }, + { QChar::Isolated, "<isolated>" }, + { QChar::Circle, "<circle>" }, + { QChar::Super, "<super>" }, + { QChar::Sub, "<sub>" }, + { QChar::Vertical, "<vertical>" }, + { QChar::Wide, "<wide>" }, + { QChar::Narrow, "<narrow>" }, + { QChar::Small, "<small>" }, + { QChar::Square, "<square>" }, + { QChar::Compat, "<compat>" }, + { QChar::Fraction, "<fraction>" }, + { QChar::NoDecomposition, 0 } + }; + Dec *d = decompositions; + while (d->name) { + decompositionMap.insert(d->name, d->dec); + ++d; + } +} + + +static QHash<int, UnicodeData> unicodeData; +static QList<PropertyFlags> uniqueProperties; + + +static QHash<int, int> decompositionLength; +static int highestComposedCharacter = 0; +static int numLigatures = 0; +static int highestLigature = 0; + +struct Ligature { + ushort u1; + ushort u2; + ushort ligature; +}; +// we need them sorted after the first component for fast lookup +bool operator < (const Ligature &l1, const Ligature &l2) +{ return l1.u1 < l2.u1; } + +static QHash<ushort, QList<Ligature> > ligatureHashes; + +static QHash<int, int> combiningClassUsage; + +static int maxLowerCaseDiff = 0; +static int maxUpperCaseDiff = 0; +static int maxTitleCaseDiff = 0; + +static void readUnicodeData() +{ + QFile f("data/UnicodeData.txt"); + if (!f.exists()) + qFatal("Couldn't find UnicodeData.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.truncate(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + if (line.isEmpty()) + continue; + + QList<QByteArray> properties = line.split(';'); + bool ok; + int codepoint = properties[UD_Value].toInt(&ok, 16); + Q_ASSERT(ok); + Q_ASSERT(codepoint <= LAST_CODEPOINT); + int lastCodepoint = codepoint; + + QByteArray name = properties[UD_Name]; + if (name.startsWith('<') && name.contains("First")) { + QByteArray nextLine; + nextLine.resize(1024); + f.readLine(nextLine.data(), 1024); + QList<QByteArray> properties = nextLine.split(';'); + Q_ASSERT(properties[UD_Name].startsWith('<') && properties[UD_Name].contains("Last")); + lastCodepoint = properties[UD_Value].toInt(&ok, 16); + Q_ASSERT(ok); + Q_ASSERT(lastCodepoint <= LAST_CODEPOINT); + } + + UnicodeData data(codepoint); + data.p.category = categoryMap.value(properties[UD_Category], QChar::NoCategory); + if (data.p.category == QChar::NoCategory) + qFatal("unassigned char category: %s", properties[UD_Category].constData()); + + data.p.combiningClass = properties[UD_CombiningClass].toInt(); + if (!combiningClassUsage.contains(data.p.combiningClass)) + combiningClassUsage[data.p.combiningClass] = 1; + else + ++combiningClassUsage[data.p.combiningClass]; + + data.p.direction = directionMap.value(properties[UD_BidiCategory], data.p.direction); + + if (!properties[UD_UpperCase].isEmpty()) { + int upperCase = properties[UD_UpperCase].toInt(&ok, 16); + Q_ASSERT(ok); + int diff = upperCase - codepoint; + if (qAbs(diff) >= (1<<14)) + qWarning() << "upperCaseDiff exceeded (" << hex << codepoint << "->" << upperCase << ")"; + data.p.upperCaseDiff = diff; + maxUpperCaseDiff = qMax(maxUpperCaseDiff, qAbs(diff)); + if (codepoint >= 0x10000 || upperCase >= 0x10000) { + // if the conditions below doesn't hold anymore we need to modify our upper casing code + Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(upperCase)); + Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(upperCase)); + } + } + if (!properties[UD_LowerCase].isEmpty()) { + int lowerCase = properties[UD_LowerCase].toInt(&ok, 16); + Q_ASSERT(ok); + int diff = lowerCase - codepoint; + if (qAbs(diff) >= (1<<14)) + qWarning() << "lowerCaseDiff exceeded (" << hex << codepoint << "->" << lowerCase << ")"; + data.p.lowerCaseDiff = diff; + maxLowerCaseDiff = qMax(maxLowerCaseDiff, qAbs(diff)); + if (codepoint >= 0x10000 || lowerCase >= 0x10000) { + // if the conditions below doesn't hold anymore we need to modify our lower casing code + Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(lowerCase)); + Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(lowerCase)); + } + } + // we want toTitleCase to map to ToUpper in case we don't have any titlecase. + if (properties[UD_TitleCase].isEmpty()) + properties[UD_TitleCase] = properties[UD_UpperCase]; + if (!properties[UD_TitleCase].isEmpty()) { + int titleCase = properties[UD_TitleCase].toInt(&ok, 16); + Q_ASSERT(ok); + int diff = titleCase - codepoint; + if (qAbs(diff) >= (1<<14)) + qWarning() << "titleCaseDiff exceeded (" << hex << codepoint << "->" << titleCase << ")"; + data.p.titleCaseDiff = diff; + maxTitleCaseDiff = qMax(maxTitleCaseDiff, qAbs(diff)); + if (codepoint >= 0x10000 || titleCase >= 0x10000) { + // if the conditions below doesn't hold anymore we need to modify our title casing code + Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(titleCase)); + Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(titleCase)); + } + } + + if (!properties[UD_DigitValue].isEmpty()) + data.p.digitValue = properties[UD_DigitValue].toInt(); + + // decompositition + QByteArray decomposition = properties[UD_Decomposition]; + if (!decomposition.isEmpty()) { + highestComposedCharacter = qMax(highestComposedCharacter, codepoint); + QList<QByteArray> d = decomposition.split(' '); + if (d[0].contains('<')) { + data.decompositionType = decompositionMap.value(d[0], QChar::NoDecomposition); + if (data.decompositionType == QChar::NoDecomposition) + qFatal("unassigned decomposition type: %s", d[0].constData()); + d.takeFirst(); + } else { + data.decompositionType = QChar::Canonical; + } + for (int i = 0; i < d.size(); ++i) { + data.decomposition.append(d[i].toInt(&ok, 16)); + Q_ASSERT(ok); + } + if (!decompositionLength.contains(data.decomposition.size())) + decompositionLength[data.decomposition.size()] = 1; + else + ++decompositionLength[data.decomposition.size()]; + } + + for (int i = codepoint; i <= lastCodepoint; ++i) + unicodeData.insert(i, data); + } + +} + +static int maxMirroredDiff = 0; + +static void readBidiMirroring() +{ + QFile f("data/BidiMirroring.txt"); + if (!f.exists()) + qFatal("Couldn't find BidiMirroring.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + + if (line.isEmpty()) + continue; + line = line.replace(" ", ""); + + QList<QByteArray> pair = line.split(';'); + Q_ASSERT(pair.size() == 2); + + bool ok; + int codepoint = pair[0].toInt(&ok, 16); + Q_ASSERT(ok); + int mirror = pair[1].toInt(&ok, 16); + Q_ASSERT(ok); + + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + d.mirroredChar = mirror; + d.p.mirrorDiff = d.mirroredChar - codepoint; + maxMirroredDiff = qMax(maxMirroredDiff, qAbs(d.p.mirrorDiff)); + unicodeData.insert(codepoint, d); + } +} + +static void readArabicShaping() +{ + QFile f("data/ArabicShaping.txt"); + if (!f.exists()) + qFatal("Couldn't find ArabicShaping.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line = line.trimmed(); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 4); + + bool ok; + int codepoint = l[0].toInt(&ok, 16); + Q_ASSERT(ok); + + Joining joining = joining_map.value(l[2].trimmed(), Joining_Unassigned); + if (joining == Joining_Unassigned) + qFatal("unassigned or unhandled joining value: %s", l[2].constData()); + + if (joining == Joining_Left) { + // There are currently no characters of joining type Left_Joining defined in Unicode. + qFatal("%x: joining type '%s' was met; the current implementation needs to be revised!", codepoint, l[2].constData()); + } + + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + if (joining == Joining_Right) + d.p.joining = QChar::Right; + else if (joining == Joining_Dual) + d.p.joining = QChar::Dual; + else if (joining == Joining_Causing) + d.p.joining = QChar::Center; + else + d.p.joining = QChar::OtherJoining; + unicodeData.insert(codepoint, d); + } +} + +static void readDerivedAge() +{ + QFile f("data/DerivedAge.txt"); + if (!f.exists()) + qFatal("Couldn't find DerivedAge.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + QByteArray codes = l[0]; + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + QChar::UnicodeVersion age = age_map.value(l[1].trimmed(), QChar::Unicode_Unassigned); + //qDebug() << hex << from << ".." << to << ba << age; + if (age == QChar::Unicode_Unassigned) + qFatal("unassigned or unhandled age value: %s", l[1].constData()); + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + d.p.age = age; + unicodeData.insert(codepoint, d); + } + } +} + + +static void readDerivedNormalizationProps() +{ + QFile f("data/DerivedNormalizationProps.txt"); + if (!f.exists()) + qFatal("Couldn't find DerivedNormalizationProps.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + + if (line.trimmed().isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() >= 2); + + QByteArray propName = l[1].trimmed(); + if (propName != "Full_Composition_Exclusion") + // ### + continue; + + QByteArray codes = l[0].trimmed(); + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + d.excludedComposition = true; + unicodeData.insert(codepoint, d); + } + } + + for (int codepoint = 0; codepoint <= LAST_CODEPOINT; ++codepoint) { + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + if (!d.excludedComposition + && d.decompositionType == QChar::Canonical + && d.decomposition.size() > 1) { + Q_ASSERT(d.decomposition.size() == 2); + + int part1 = d.decomposition.at(0); + int part2 = d.decomposition.at(1); + + // all non-starters are listed in DerivedNormalizationProps.txt + // and already excluded from composition + Q_ASSERT(unicodeData.value(part1, UnicodeData(part1)).p.combiningClass == 0); + + ++numLigatures; + highestLigature = qMax(highestLigature, part1); + Ligature l = {(ushort)part1, (ushort)part2, (ushort)codepoint}; + ligatureHashes[part2].append(l); + } + } +} + + +struct NormalizationCorrection { + uint codepoint; + uint mapped; + uint version; +}; + +static QByteArray createNormalizationCorrections() +{ + QFile f("data/NormalizationCorrections.txt"); + if (!f.exists()) + qFatal("Couldn't find NormalizationCorrections.txt"); + + f.open(QFile::ReadOnly); + + QByteArray out; + + out += "struct NormalizationCorrection {\n" + " uint ucs4;\n" + " uint old_mapping;\n" + " int version;\n" + "};\n\n" + + "static const NormalizationCorrection uc_normalization_corrections[] = {\n"; + + int numCorrections = 0; + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + Q_ASSERT(!line.contains("..")); + + QList<QByteArray> fields = line.split(';'); + Q_ASSERT(fields.size() == 4); + + NormalizationCorrection c = { 0, 0, 0 }; + bool ok; + c.codepoint = fields.at(0).toInt(&ok, 16); + Q_ASSERT(ok); + c.mapped = fields.at(1).toInt(&ok, 16); + Q_ASSERT(ok); + if (fields.at(3) == "3.2.0") + c.version = QChar::Unicode_3_2; + else if (fields.at(3) == "4.0.0") + c.version = QChar::Unicode_4_0; + else + qFatal("unknown unicode version in NormalizationCorrection.txt"); + + out += " { 0x" + QByteArray::number(c.codepoint, 16) + ", 0x" + QByteArray::number(c.mapped, 16) + + ", " + QString::number(c.version) + " },\n"; + ++numCorrections; + } + + out += "};\n\n" + + "enum { NumNormalizationCorrections = " + QByteArray::number(numCorrections) + " };\n\n"; + + return out; +} + + +static void computeUniqueProperties() +{ + qDebug("computeUniqueProperties:"); + for (int uc = 0; uc <= LAST_CODEPOINT; ++uc) { + UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); + + int index = uniqueProperties.indexOf(d.p); + if (index == -1) { + index = uniqueProperties.size(); + uniqueProperties.append(d.p); + } + d.propertyIndex = index; + unicodeData.insert(uc, d); + } + qDebug(" %d unique unicode properties found", uniqueProperties.size()); +} + + +static void readLineBreak() +{ + qDebug() << "Reading LineBreak.txt"; + QFile f("data/LineBreak.txt"); + if (!f.exists()) + qFatal("Couldn't find LineBreak.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + QByteArray codes = l[0]; + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + LineBreakClass lb = line_break_map.value(l[1], LineBreak_Unassigned); + if (lb == LineBreak_Unassigned) + qFatal("unassigned line break class: %s", l[1].constData()); + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData d = unicodeData.value(codepoint, UnicodeData(codepoint)); + d.p.line_break_class = lb; + unicodeData.insert(codepoint, d); + } + } +} + + +static void readSpecialCasing() +{ + qDebug() << "Reading SpecialCasing.txt"; + QFile f("data/SpecialCasing.txt"); + if (!f.exists()) + qFatal("Couldn't find SpecialCasing.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + + QByteArray condition = l.size() < 5 ? QByteArray() : l[4].trimmed(); + if (!condition.isEmpty()) + // ##### + continue; + + bool ok; + int codepoint = l[0].trimmed().toInt(&ok, 16); + Q_ASSERT(ok); + + // if the condition below doesn't hold anymore we need to modify our + // lower/upper/title casing code and case folding code + Q_ASSERT(codepoint < 0x10000); + +// qDebug() << "codepoint" << hex << codepoint; +// qDebug() << line; + + QList<QByteArray> lower = l[1].trimmed().split(' '); + QList<int> lowerMap; + for (int i = 0; i < lower.size(); ++i) { + bool ok; + lowerMap.append(lower.at(i).toInt(&ok, 16)); + Q_ASSERT(ok); + } + + QList<QByteArray> title = l[2].trimmed().split(' '); + QList<int> titleMap; + for (int i = 0; i < title.size(); ++i) { + bool ok; + titleMap.append(title.at(i).toInt(&ok, 16)); + Q_ASSERT(ok); + } + + QList<QByteArray> upper = l[3].trimmed().split(' '); + QList<int> upperMap; + for (int i = 0; i < upper.size(); ++i) { + bool ok; + upperMap.append(upper.at(i).toInt(&ok, 16)); + Q_ASSERT(ok); + } + + + UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); + + Q_ASSERT(lowerMap.size() > 1 || lowerMap.at(0) == codepoint + ud.p.lowerCaseDiff); + Q_ASSERT(titleMap.size() > 1 || titleMap.at(0) == codepoint + ud.p.titleCaseDiff); + Q_ASSERT(upperMap.size() > 1 || upperMap.at(0) == codepoint + ud.p.upperCaseDiff); + + if (lowerMap.size() > 1) { + ud.p.lowerCaseSpecial = true; + ud.p.lowerCaseDiff = appendToSpecialCaseMap(lowerMap); + } + if (titleMap.size() > 1) { + ud.p.titleCaseSpecial = true; + ud.p.titleCaseDiff = appendToSpecialCaseMap(titleMap); + } + if (upperMap.size() > 1) { + ud.p.upperCaseSpecial = true; + ud.p.upperCaseDiff = appendToSpecialCaseMap(upperMap);; + } + + unicodeData.insert(codepoint, ud); + } +} + +static int maxCaseFoldDiff = 0; + +static void readCaseFolding() +{ + qDebug() << "Reading CaseFolding.txt"; + QFile f("data/CaseFolding.txt"); + if (!f.exists()) + qFatal("Couldn't find CaseFolding.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + + bool ok; + int codepoint = l[0].trimmed().toInt(&ok, 16); + Q_ASSERT(ok); + + + l[1] = l[1].trimmed(); + if (l[1] == "F" || l[1] == "T") + continue; + +// qDebug() << "codepoint" << hex << codepoint; +// qDebug() << line; + QList<QByteArray> fold = l[2].trimmed().split(' '); + QList<int> foldMap; + for (int i = 0; i < fold.size(); ++i) { + bool ok; + foldMap.append(fold.at(i).toInt(&ok, 16)); + Q_ASSERT(ok); + } + + UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); + if (foldMap.size() == 1) { + int caseFolded = foldMap.at(0); + int diff = caseFolded - codepoint; + if (qAbs(diff) >= (1<<14)) + qWarning() << "caseFoldDiff exceeded (" << hex << codepoint << "->" << caseFolded << ")"; + ud.p.caseFoldDiff = diff; + maxCaseFoldDiff = qMax(maxCaseFoldDiff, qAbs(diff)); + if (codepoint >= 0x10000 || caseFolded >= 0x10000) { + // if the conditions below doesn't hold anymore we need to modify our case folding code + Q_ASSERT(QChar::highSurrogate(codepoint) == QChar::highSurrogate(caseFolded)); + Q_ASSERT(QChar::lowSurrogate(codepoint) + diff == QChar::lowSurrogate(caseFolded)); + } + if (caseFolded != codepoint + ud.p.lowerCaseDiff) + qDebug() << hex << codepoint; + } else { + qFatal("we currently don't support full case foldings"); +// qDebug() << "special" << hex << foldMap; + ud.p.caseFoldSpecial = true; + ud.p.caseFoldDiff = appendToSpecialCaseMap(foldMap); + } + unicodeData.insert(codepoint, ud); + } +} + +static void readGraphemeBreak() +{ + qDebug() << "Reading GraphemeBreakProperty.txt"; + QFile f("data/GraphemeBreakProperty.txt"); + if (!f.exists()) + qFatal("Couldn't find GraphemeBreakProperty.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + QByteArray codes = l[0]; + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + GraphemeBreak brk = grapheme_break_map.value(l[1], GraphemeBreak_Unassigned); + if (brk == GraphemeBreak_Unassigned) + qFatal("unassigned grapheme break class: %s", l[1].constData()); + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); + ud.p.graphemeBreak = brk; + unicodeData.insert(codepoint, ud); + } + } +} + +static void readWordBreak() +{ + qDebug() << "Reading WordBreakProperty.txt"; + QFile f("data/WordBreakProperty.txt"); + if (!f.exists()) + qFatal("Couldn't find WordBreakProperty.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + QByteArray codes = l[0]; + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + WordBreak brk = word_break_map.value(l[1], WordBreak_Unassigned); + if (brk == WordBreak_Unassigned) + qFatal("unassigned word break class: %s", l[1].constData()); + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); + ud.p.wordBreak = brk; + unicodeData.insert(codepoint, ud); + } + } +} + +static void readSentenceBreak() +{ + qDebug() << "Reading SentenceBreakProperty.txt"; + QFile f("data/SentenceBreakProperty.txt"); + if (!f.exists()) + qFatal("Couldn't find SentenceBreakProperty.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line; + line.resize(1024); + int len = f.readLine(line.data(), 1024); + line.resize(len-1); + + int comment = line.indexOf('#'); + if (comment >= 0) + line = line.left(comment); + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + QList<QByteArray> l = line.split(';'); + Q_ASSERT(l.size() == 2); + + QByteArray codes = l[0]; + codes.replace("..", "."); + QList<QByteArray> cl = codes.split('.'); + + bool ok; + int from = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int to = from; + if (cl.size() == 2) { + to = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + SentenceBreak brk = sentence_break_map.value(l[1], SentenceBreak_Unassigned); + if (brk == SentenceBreak_Unassigned) + qFatal("unassigned sentence break class: %s", l[1].constData()); + + for (int codepoint = from; codepoint <= to; ++codepoint) { + UnicodeData ud = unicodeData.value(codepoint, UnicodeData(codepoint)); + ud.p.sentenceBreak = brk; + unicodeData.insert(codepoint, ud); + } + } +} + +#if 0 +// this piece of code does full case folding and comparison. We currently +// don't use it, since this gives lots of issues with things as case insensitive +// search and replace. +static inline void foldCase(uint ch, ushort *out) +{ + const QUnicodeTables::Properties *p = qGetProp(ch); + if (!p->caseFoldSpecial) { + *(out++) = ch + p->caseFoldDiff; + } else { + const ushort *folded = specialCaseMap + p->caseFoldDiff; + while (*folded) + *out++ = *folded++; + } + *out = 0; +} + +static int ucstricmp(const ushort *a, const ushort *ae, const ushort *b, const ushort *be) +{ + if (a == b) + return 0; + if (a == 0) + return 1; + if (b == 0) + return -1; + + while (a != ae && b != be) { + const QUnicodeTables::Properties *pa = qGetProp(*a); + const QUnicodeTables::Properties *pb = qGetProp(*b); + if (pa->caseFoldSpecial | pb->caseFoldSpecial) + goto special; + int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff); + if ((diff)) + return diff; + ++a; + ++b; + } + } + if (a == ae) { + if (b == be) + return 0; + return -1; + } + return 1; +special: + ushort abuf[SPECIAL_CASE_MAX_LEN + 1]; + ushort bbuf[SPECIAL_CASE_MAX_LEN + 1]; + abuf[0] = bbuf[0] = 0; + ushort *ap = abuf; + ushort *bp = bbuf; + while (1) { + if (!*ap) { + if (a == ae) { + if (!*bp && b == be) + return 0; + return -1; + } + foldCase(*(a++), abuf); + ap = abuf; + } + if (!*bp) { + if (b == be) + return 1; + foldCase(*(b++), bbuf); + bp = bbuf; + } + if (*ap != *bp) + return (int)*ap - (int)*bp; + ++ap; + ++bp; + } +} + + +static int ucstricmp(const ushort *a, const ushort *ae, const uchar *b) +{ + if (a == 0) + return 1; + if (b == 0) + return -1; + + while (a != ae && *b) { + const QUnicodeTables::Properties *pa = qGetProp(*a); + const QUnicodeTables::Properties *pb = qGetProp((ushort)*b); + if (pa->caseFoldSpecial | pb->caseFoldSpecial) + goto special; + int diff = (int)(*a + pa->caseFoldDiff) - (int)(*b + pb->caseFoldDiff); + if ((diff)) + return diff; + ++a; + ++b; + } + if (a == ae) { + if (!*b) + return 0; + return -1; + } + return 1; + +special: + ushort abuf[SPECIAL_CASE_MAX_LEN + 1]; + ushort bbuf[SPECIAL_CASE_MAX_LEN + 1]; + abuf[0] = bbuf[0] = 0; + ushort *ap = abuf; + ushort *bp = bbuf; + while (1) { + if (!*ap) { + if (a == ae) { + if (!*bp && !*b) + return 0; + return -1; + } + foldCase(*(a++), abuf); + ap = abuf; + } + if (!*bp) { + if (!*b) + return 1; + foldCase(*(b++), bbuf); + bp = bbuf; + } + if (*ap != *bp) + return (int)*ap - (int)*bp; + ++ap; + ++bp; + } +} +#endif + +#if 0 +static QList<QByteArray> blockNames; +struct BlockInfo +{ + int blockIndex; + int firstCodePoint; + int lastCodePoint; +}; +static QList<BlockInfo> blockInfoList; + +static void readBlocks() +{ + QFile f("data/Blocks.txt"); + if (!f.exists()) + qFatal("Couldn't find Blocks.txt"); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line = f.readLine(); + line.resize(line.size() - 1); + + int comment = line.indexOf("#"); + if (comment >= 0) + line = line.left(comment); + + line.replace(" ", ""); + + if (line.isEmpty()) + continue; + + int semicolon = line.indexOf(';'); + Q_ASSERT(semicolon >= 0); + QByteArray codePoints = line.left(semicolon); + QByteArray blockName = line.mid(semicolon + 1); + + int blockIndex = blockNames.indexOf(blockName); + if (blockIndex == -1) { + blockIndex = blockNames.size(); + blockNames.append(blockName); + } + + codePoints.replace("..", "."); + QList<QByteArray> cl = codePoints.split('.'); + + bool ok; + int first = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int last = first; + if (cl.size() == 2) { + last = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + BlockInfo blockInfo = { blockIndex, first, last }; + blockInfoList.append(blockInfo); + } +} +#endif + +static QList<QByteArray> scriptNames; +static QHash<int, int> scriptAssignment; +static QHash<int, int> scriptHash; + +struct ExtraBlock { + int block; + QVector<int> vector; +}; + +static QList<ExtraBlock> extraBlockList; + + +static void readScripts() +{ + scriptNames.append("Common"); + + static const char *files[] = { + "data/ScriptsInitial.txt", + "data/Scripts.txt", + "data/ScriptsCorrections.txt" + }; + enum { fileCount = sizeof(files) / sizeof(const char *) }; + + for (int i = 0; i < fileCount; ++i) { + QFile f(files[i]); + if (!f.exists()) + qFatal("Couldn't find %s", files[i]); + + f.open(QFile::ReadOnly); + + while (!f.atEnd()) { + QByteArray line = f.readLine(); + line.resize(line.size() - 1); + + int comment = line.indexOf("#"); + if (comment >= 0) + line = line.left(comment); + + line.replace(" ", ""); + line.replace("_", ""); + + if (line.isEmpty()) + continue; + + int semicolon = line.indexOf(';'); + Q_ASSERT(semicolon >= 0); + QByteArray codePoints = line.left(semicolon); + QByteArray scriptName = line.mid(semicolon + 1); + + int scriptIndex = scriptNames.indexOf(scriptName); + if (scriptIndex == -1) { + scriptIndex = scriptNames.size(); + scriptNames.append(scriptName); + } + + codePoints.replace("..", "."); + QList<QByteArray> cl = codePoints.split('.'); + + bool ok; + int first = cl[0].toInt(&ok, 16); + Q_ASSERT(ok); + int last = first; + if (cl.size() == 2) { + last = cl[1].toInt(&ok, 16); + Q_ASSERT(ok); + } + + for (int i = first; i <= last; ++i) + scriptAssignment[i] = scriptIndex; + } + } +} + + +static int scriptSentinel = 0; + +QByteArray createScriptEnumDeclaration() +{ + static const char *specialScripts[] = { + "Common", + "Arabic", + "Armenian", + "Bengali", + "Cyrillic", + "Devanagari", + "Georgian", + "Greek", + "Gujarati", + "Gurmukhi", + "Hangul", + "Hebrew", + "Kannada", + "Khmer", + "Lao", + "Malayalam", + "Myanmar", + "Nko", + "Ogham", + "Oriya", + "Runic", + "Sinhala", + "Syriac", + "Tamil", + "Telugu", + "Thaana", + "Thai", + "Tibetan", + "Inherited" + }; + const int specialScriptsCount = sizeof(specialScripts) / sizeof(const char *); + + // generate script enum + QByteArray declaration; + + declaration += " // See http://www.unicode.org/reports/tr24/tr24-5.html\n"; + declaration += " enum Script {\n Common"; + + int uniqueScripts = 1; // Common + + // output the ones with special processing first + for (int i = 1; i < scriptNames.size(); ++i) { + QByteArray scriptName = scriptNames.at(i); + // does the script require special processing? + bool special = false; + for (int s = 0; s < specialScriptsCount; ++s) { + if (scriptName == specialScripts[s]) { + special = true; + break; + } + } + if (!special) { + scriptHash[i] = 0; // alias for 'Common' + continue; + } else { + ++uniqueScripts; + scriptHash[i] = i; + } + + if (scriptName != "Inherited") { + declaration += ",\n "; + declaration += scriptName; + } + } + declaration += ",\n Inherited"; + declaration += ",\n ScriptCount = Inherited"; + + // output the ones that are an alias for 'Common' + for (int i = 1; i < scriptNames.size(); ++i) { + if (scriptHash.value(i) != 0) + continue; + declaration += ",\n "; + declaration += scriptNames.at(i); + declaration += " = Common"; + } + + declaration += "\n };\n"; + + scriptSentinel = ((uniqueScripts + 16) / 32) * 32; // a multiple of 32 + declaration += " enum { ScriptSentinel = "; + declaration += QByteArray::number(scriptSentinel); + declaration += " };\n\n"; + return declaration; +} + +QByteArray createScriptTableDeclaration() +{ + Q_ASSERT(scriptSentinel > 0); + + QByteArray declaration; + + const int unicodeBlockCount = 512; // number of unicode blocks + const int unicodeBlockSize = 128; // size of each block + declaration = "enum { UnicodeBlockCount = "; + declaration += QByteArray::number(unicodeBlockCount); + declaration += " }; // number of unicode blocks\n"; + declaration += "enum { UnicodeBlockSize = "; + declaration += QByteArray::number(unicodeBlockSize); + declaration += " }; // size of each block\n\n"; + + // script table + declaration += "namespace QUnicodeTables {\n\nstatic const unsigned char uc_scripts[] = {\n"; + for (int i = 0; i < unicodeBlockCount; ++i) { + int block = (((i << 7) & 0xff00) | ((i & 1) * 0x80)); + int blockAssignment[unicodeBlockSize]; + for (int x = 0; x < unicodeBlockSize; ++x) { + int codePoint = (i << 7) | x; + blockAssignment[x] = scriptAssignment.value(codePoint, 0); + } + bool allTheSame = true; + const int originalScript = blockAssignment[0]; + const int script = scriptHash.value(originalScript); + for (int x = 1; allTheSame && x < unicodeBlockSize; ++x) { + const int s = scriptHash.value(blockAssignment[x]); + if (s != script) + allTheSame = false; + } + + if (allTheSame) { + declaration += " "; + declaration += scriptNames.value(originalScript); + declaration += ", /* U+"; + declaration += QByteArray::number(block, 16).rightJustified(4, '0'); + declaration += '-'; + declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0'); + declaration += " */\n"; + } else { + const int value = extraBlockList.size() + scriptSentinel; + const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount; + + declaration += " "; + declaration += QByteArray::number(value); + declaration += ", /* U+"; + declaration += QByteArray::number(block, 16).rightJustified(4, '0'); + declaration += '-'; + declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0'); + declaration += " at offset "; + declaration += QByteArray::number(offset); + declaration += " */\n"; + + ExtraBlock extraBlock; + extraBlock.block = block; + extraBlock.vector.resize(unicodeBlockSize); + for (int x = 0; x < unicodeBlockSize; ++x) + extraBlock.vector[x] = blockAssignment[x]; + + extraBlockList.append(extraBlock); + } + } + + for (int i = 0; i < extraBlockList.size(); ++i) { + const int value = i + scriptSentinel; + const int offset = ((value - scriptSentinel) * unicodeBlockSize) + unicodeBlockCount; + const ExtraBlock &extraBlock = extraBlockList.at(i); + const int block = extraBlock.block; + + declaration += "\n\n /* U+"; + declaration += QByteArray::number(block, 16).rightJustified(4, '0'); + declaration += '-'; + declaration += QByteArray::number(block + unicodeBlockSize - 1, 16).rightJustified(4, '0'); + declaration += " at offset "; + declaration += QByteArray::number(offset); + declaration += " */\n "; + + for (int x = 0; x < extraBlock.vector.size(); ++x) { + const int o = extraBlock.vector.at(x); + + declaration += scriptNames.value(o); + if (x < extraBlock.vector.size() - 1 || i < extraBlockList.size() - 1) + declaration += ','; + if ((x & 7) == 7 && x < extraBlock.vector.size() - 1) + declaration += "\n "; + else + declaration += ' '; + } + if (declaration.endsWith(' ')) + declaration.chop(1); + } + declaration += "\n};\n\n} // namespace QUnicodeTables\n\n"; + + declaration += + "Q_CORE_EXPORT int QT_FASTCALL QUnicodeTables::script(uint ucs4)\n" + "{\n" + " if (ucs4 > 0xffff)\n" + " return Common;\n" + " int script = uc_scripts[ucs4 >> 7];\n" + " if (script < ScriptSentinel)\n" + " return script;\n" + " script = (((script - ScriptSentinel) * UnicodeBlockSize) + UnicodeBlockCount);\n" + " script = uc_scripts[script + (ucs4 & 0x7f)];\n" + " return script;\n" + "}\n\n"; + + qDebug("createScriptTableDeclaration: table size is %d bytes", + unicodeBlockCount + (extraBlockList.size() * unicodeBlockSize)); + + return declaration; +} + +#if 0 +static void dump(int from, int to) +{ + for (int i = from; i <= to; ++i) { + UnicodeData d = unicodeData.value(i, UnicodeData(i)); + qDebug("0x%04x: cat=%d combining=%d dir=%d case=%x mirror=%x joining=%d age=%d", + i, d.p.category, d.p.combiningClass, d.p.direction, d.otherCase, d.mirroredChar, d.p.joining, d.p.age); + if (d.decompositionType != QChar::NoDecomposition) { + qDebug(" decomposition: type=%d, length=%d, first=%x", d.decompositionType, d.decomposition.size(), + d.decomposition[0]); + } + } + qDebug(" "); +} +#endif + +struct PropertyBlock { + PropertyBlock() { index = -1; } + int index; + QList<int> properties; + bool operator==(const PropertyBlock &other) + { return properties == other.properties; } +}; + +static QByteArray createPropertyInfo() +{ + qDebug("createPropertyInfo:"); + + const int BMP_BLOCKSIZE = 32; + const int BMP_SHIFT = 5; + const int BMP_END = 0x11000; + const int SMP_END = 0x110000; + const int SMP_BLOCKSIZE = 256; + const int SMP_SHIFT = 8; + + QList<PropertyBlock> blocks; + QList<int> blockMap; + + int used = 0; + + for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { + PropertyBlock b; + for (int i = 0; i < BMP_BLOCKSIZE; ++i) { + int uc = block*BMP_BLOCKSIZE + i; + UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); + b.properties.append(d.propertyIndex); + } + int index = blocks.indexOf(b); + if (index == -1) { + index = blocks.size(); + b.index = used; + used += BMP_BLOCKSIZE; + blocks.append(b); + } + blockMap.append(blocks.at(index).index); + } + + int bmp_blocks = blocks.size(); + Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE); + + for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) { + PropertyBlock b; + for (int i = 0; i < SMP_BLOCKSIZE; ++i) { + int uc = block*SMP_BLOCKSIZE + i; + UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); + b.properties.append(d.propertyIndex); + } + int index = blocks.indexOf(b); + if (index == -1) { + index = blocks.size(); + b.index = used; + used += SMP_BLOCKSIZE; + blocks.append(b); + } + blockMap.append(blocks.at(index).index); + } + + int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; + int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; + int bmp_mem = bmp_block_data + bmp_trie; + qDebug(" %d unique blocks in BMP.", blocks.size()); + qDebug(" block data uses: %d bytes", bmp_block_data); + qDebug(" trie data uses : %d bytes", bmp_trie); + + int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2; + int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; + int smp_mem = smp_block_data + smp_trie; + qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks); + qDebug(" block data uses: %d bytes", smp_block_data); + qDebug(" trie data uses : %d bytes", smp_trie); + + qDebug("\n properties uses : %d bytes", uniqueProperties.size() * SizeOfPropertiesStruct); + qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + uniqueProperties.size() * SizeOfPropertiesStruct); + + QByteArray out; + out += "static const unsigned short uc_property_trie[] = {\n"; + + // first write the map + out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); + for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + if (!((i*BMP_BLOCKSIZE) % 0x1000)) + out += "\n"; + out += "\n "; + } + out += QByteArray::number(blockMap.at(i) + blockMap.size()); + out += ", "; + } + if (out.endsWith(' ')) + out.chop(1); + out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";; + for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + if (!(i % (0x10000/SMP_BLOCKSIZE))) + out += "\n"; + out += "\n "; + } + out += QByteArray::number(blockMap.at(i) + blockMap.size()); + out += ", "; + } + if (out.endsWith(' ')) + out.chop(1); + out += "\n"; + // write the data + for (int i = 0; i < blocks.size(); ++i) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n"; + const PropertyBlock &b = blocks.at(i); + for (int j = 0; j < b.properties.size(); ++j) { + if (!(j % 8)) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n "; + } + out += QByteArray::number(b.properties.at(j)); + out += ", "; + } + } + + // we reserve one bit more than in the assert below for the sign + Q_ASSERT(maxMirroredDiff < (1<<12)); + Q_ASSERT(maxLowerCaseDiff < (1<<14)); + Q_ASSERT(maxUpperCaseDiff < (1<<14)); + Q_ASSERT(maxTitleCaseDiff < (1<<14)); + Q_ASSERT(maxCaseFoldDiff < (1<<14)); + + if (out.endsWith(' ')) + out.chop(1); + out += "\n};\n\n" + + "#define GET_PROP_INDEX(ucs4) \\\n" + " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n" + " ? (uc_property_trie[uc_property_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) + + "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n" + " : (uc_property_trie[uc_property_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) + + ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]" + " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]))\n\n" + "#define GET_PROP_INDEX_UCS2(ucs2) \\\n" + "(uc_property_trie[uc_property_trie[ucs2>>" + QByteArray::number(BMP_SHIFT) + + "] + (ucs2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")])\n\n" + + + "static const QUnicodeTables::Properties uc_properties[] = {\n"; + + // keep in sync with the property declaration + for (int i = 0; i < uniqueProperties.size(); ++i) { + PropertyFlags p = uniqueProperties.at(i); + out += " { "; +// " ushort category : 8;\n" + out += QByteArray::number( p.category ); + out += ", "; +// " ushort line_break_class : 8;\n" + out += QByteArray::number( p.line_break_class ); + out += ", "; +// " ushort direction : 8;\n" + out += QByteArray::number( p.direction ); + out += ", "; +// " ushort combiningClass :8;\n" + out += QByteArray::number( p.combiningClass ); + out += ", "; +// " ushort joining : 2;\n" + out += QByteArray::number( p.joining ); + out += ", "; +// " signed short digitValue : 6;\n /* 5 needed */" + out += QByteArray::number( p.digitValue ); + out += ", "; +// " ushort unicodeVersion : 4;\n" + out += QByteArray::number( p.age ); + out += ", "; +// " ushort lowerCaseSpecial : 1;\n" +// " ushort upperCaseSpecial : 1;\n" +// " ushort titleCaseSpecial : 1;\n" +// " ushort caseFoldSpecial : 1;\n" + out += QByteArray::number( p.lowerCaseSpecial ); + out += ", "; + out += QByteArray::number( p.upperCaseSpecial ); + out += ", "; + out += QByteArray::number( p.titleCaseSpecial ); + out += ", "; + out += QByteArray::number( p.caseFoldSpecial ); + out += ", "; +// " signed short mirrorDiff : 16;\n" +// " signed short lowerCaseDiff : 16;\n" +// " signed short upperCaseDiff : 16;\n" +// " signed short titleCaseDiff : 16;\n" +// " signed short caseFoldDiff : 16;\n" + out += QByteArray::number( p.mirrorDiff ); + out += ", "; + out += QByteArray::number( p.lowerCaseDiff ); + out += ", "; + out += QByteArray::number( p.upperCaseDiff ); + out += ", "; + out += QByteArray::number( p.titleCaseDiff ); + out += ", "; + out += QByteArray::number( p.caseFoldDiff ); + out += ", "; + out += QByteArray::number( p.graphemeBreak ); + out += ", "; + out += QByteArray::number( p.wordBreak ); + out += ", "; + out += QByteArray::number( p.sentenceBreak ); + out += " },\n"; + } + out += "};\n\n"; + + out += "static inline const QUnicodeTables::Properties *qGetProp(uint ucs4)\n" + "{\n" + " int index = GET_PROP_INDEX(ucs4);\n" + " return uc_properties + index;\n" + "}\n" + "\n" + "static inline const QUnicodeTables::Properties *qGetProp(ushort ucs2)\n" + "{\n" + " int index = GET_PROP_INDEX_UCS2(ucs2);\n" + " return uc_properties + index;\n" + "}\n" + "\n" + "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(uint ucs4)\n" + "{\n" + " int index = GET_PROP_INDEX(ucs4);\n" + " return uc_properties + index;\n" + "}\n" + "\n" + "Q_CORE_EXPORT const QUnicodeTables::Properties * QT_FASTCALL QUnicodeTables::properties(ushort ucs2)\n" + "{\n" + " int index = GET_PROP_INDEX_UCS2(ucs2);\n" + " return uc_properties + index;\n" + "}\n\n"; + + out += "Q_CORE_EXPORT QUnicodeTables::LineBreakClass QT_FASTCALL QUnicodeTables::lineBreakClass(uint ucs4)\n" + "{\n" + " return (QUnicodeTables::LineBreakClass)qGetProp(ucs4)->line_break_class;\n" + "}\n\n"; + + out += "static const ushort specialCaseMap[] = {\n "; + for (int i = 0; i < specialCaseMap.size(); ++i) { + out += QByteArray(" 0x") + QByteArray::number(specialCaseMap.at(i), 16); + if (i < specialCaseMap.size() - 1) + out += ","; + if (!specialCaseMap.at(i)) + out += "\n "; + } + out += "\n};\n"; + out += "#define SPECIAL_CASE_MAX_LEN " + QByteArray::number(specialCaseMaxLen) + "\n\n"; + + qDebug("Special case map uses : %d bytes", specialCaseMap.size()*2); + + return out; +} + + +struct DecompositionBlock { + DecompositionBlock() { index = -1; } + int index; + QList<int> decompositionPositions; + bool operator ==(const DecompositionBlock &other) + { return decompositionPositions == other.decompositionPositions; } +}; + +static QByteArray createCompositionInfo() +{ + qDebug("createCompositionInfo:"); + + const int BMP_BLOCKSIZE = 16; + const int BMP_SHIFT = 4; + const int BMP_END = 0x3400; // start of Han + const int SMP_END = 0x30000; + const int SMP_BLOCKSIZE = 256; + const int SMP_SHIFT = 8; + + if (SMP_END <= highestComposedCharacter) + qFatal("end of table smaller than highest composed character at %x", highestComposedCharacter); + + QList<DecompositionBlock> blocks; + QList<int> blockMap; + QList<unsigned short> decompositions; + + int used = 0; + int tableIndex = 0; + + for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { + DecompositionBlock b; + for (int i = 0; i < BMP_BLOCKSIZE; ++i) { + int uc = block*BMP_BLOCKSIZE + i; + UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); + if (!d.decomposition.isEmpty()) { + int utf16Chars = 0; + for (int j = 0; j < d.decomposition.size(); ++j) + utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1; + decompositions.append(d.decompositionType + (utf16Chars<<8)); + for (int j = 0; j < d.decomposition.size(); ++j) { + int code = d.decomposition.at(j); + if (code >= 0x10000) { + // save as surrogate pair + ushort high = QChar::highSurrogate(code); + ushort low = QChar::lowSurrogate(code); + decompositions.append(high); + decompositions.append(low); + } else { + decompositions.append(code); + } + } + b.decompositionPositions.append(tableIndex); + tableIndex += utf16Chars + 1; + } else { + b.decompositionPositions.append(0xffff); + } + } + int index = blocks.indexOf(b); + if (index == -1) { + index = blocks.size(); + b.index = used; + used += BMP_BLOCKSIZE; + blocks.append(b); + } + blockMap.append(blocks.at(index).index); + } + + int bmp_blocks = blocks.size(); + Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE); + + for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) { + DecompositionBlock b; + for (int i = 0; i < SMP_BLOCKSIZE; ++i) { + int uc = block*SMP_BLOCKSIZE + i; + UnicodeData d = unicodeData.value(uc, UnicodeData(uc)); + if (!d.decomposition.isEmpty()) { + int utf16Chars = 0; + for (int j = 0; j < d.decomposition.size(); ++j) + utf16Chars += d.decomposition.at(j) >= 0x10000 ? 2 : 1; + decompositions.append(d.decompositionType + (utf16Chars<<8)); + for (int j = 0; j < d.decomposition.size(); ++j) { + int code = d.decomposition.at(j); + if (code >= 0x10000) { + // save as surrogate pair + ushort high = QChar::highSurrogate(code); + ushort low = QChar::lowSurrogate(code); + decompositions.append(high); + decompositions.append(low); + } else { + decompositions.append(code); + } + } + b.decompositionPositions.append(tableIndex); + tableIndex += utf16Chars + 1; + } else { + b.decompositionPositions.append(0xffff); + } + } + int index = blocks.indexOf(b); + if (index == -1) { + index = blocks.size(); + b.index = used; + used += SMP_BLOCKSIZE; + blocks.append(b); + } + blockMap.append(blocks.at(index).index); + } + + int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; + int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; + int bmp_mem = bmp_block_data + bmp_trie; + qDebug(" %d unique blocks in BMP.", blocks.size()); + qDebug(" block data uses: %d bytes", bmp_block_data); + qDebug(" trie data uses : %d bytes", bmp_trie); + qDebug(" memory usage: %d bytes", bmp_mem); + + int smp_block_data = (blocks.size() - bmp_blocks)*SMP_BLOCKSIZE*2; + int smp_trie = (SMP_END-BMP_END)/SMP_BLOCKSIZE*2; + int smp_mem = smp_block_data + smp_trie; + qDebug(" %d unique blocks in SMP.", blocks.size()-bmp_blocks); + qDebug(" block data uses: %d bytes", smp_block_data); + qDebug(" trie data uses : %d bytes", smp_trie); + + qDebug("\n decomposition table use : %d bytes", decompositions.size()*2); + qDebug(" memory usage: %d bytes", bmp_mem+smp_mem + decompositions.size()*2); + + QByteArray out; + + out += "static const unsigned short uc_decomposition_trie[] = {\n"; + + // first write the map + out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); + for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + if (!((i*BMP_BLOCKSIZE) % 0x1000)) + out += "\n"; + out += "\n "; + } + out += QByteArray::number(blockMap.at(i) + blockMap.size()); + out += ", "; + } + if (out.endsWith(' ')) + out.chop(1); + out += "\n\n // 0x" + QByteArray::number(BMP_END, 16) + " - 0x" + QByteArray::number(SMP_END, 16) + "\n";; + for (int i = BMP_END/BMP_BLOCKSIZE; i < blockMap.size(); ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + if (!(i % (0x10000/SMP_BLOCKSIZE))) + out += "\n"; + out += "\n "; + } + out += QByteArray::number(blockMap.at(i) + blockMap.size()); + out += ", "; + } + if (out.endsWith(' ')) + out.chop(1); + out += "\n"; + // write the data + for (int i = 0; i < blocks.size(); ++i) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n"; + const DecompositionBlock &b = blocks.at(i); + for (int j = 0; j < b.decompositionPositions.size(); ++j) { + if (!(j % 8)) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n "; + } + out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16); + out += ", "; + } + } + + if (out.endsWith(' ')) + out.chop(1); + out += "\n};\n\n" + + "#define GET_DECOMPOSITION_INDEX(ucs4) \\\n" + " (ucs4 < 0x" + QByteArray::number(BMP_END, 16) + " \\\n" + " ? (uc_decomposition_trie[uc_decomposition_trie[ucs4>>" + QByteArray::number(BMP_SHIFT) + + "] + (ucs4 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")]) \\\n" + " : (ucs4 < 0x" + QByteArray::number(SMP_END, 16) + "\\\n" + " ? uc_decomposition_trie[uc_decomposition_trie[((ucs4 - 0x" + QByteArray::number(BMP_END, 16) + + ")>>" + QByteArray::number(SMP_SHIFT) + ") + 0x" + QByteArray::number(BMP_END/BMP_BLOCKSIZE, 16) + "]" + " + (ucs4 & 0x" + QByteArray::number(SMP_BLOCKSIZE-1, 16) + ")]\\\n" + " : 0xffff))\n\n" + + "static const unsigned short uc_decomposition_map[] = {\n"; + + for (int i = 0; i < decompositions.size(); ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n "; + } + out += "0x" + QByteArray::number(decompositions.at(i), 16); + out += ", "; + } + + if (out.endsWith(' ')) + out.chop(1); + out += "\n};\n\n"; + + return out; +} + +static QByteArray createLigatureInfo() +{ + qDebug("createLigatureInfo: numLigatures=%d", numLigatures); + + QList<DecompositionBlock> blocks; + QList<int> blockMap; + QList<unsigned short> ligatures; + + const int BMP_BLOCKSIZE = 32; + const int BMP_SHIFT = 5; + const int BMP_END = 0x3100; + Q_ASSERT(highestLigature < BMP_END); + + int used = 0; + int tableIndex = 0; + + for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { + DecompositionBlock b; + for (int i = 0; i < BMP_BLOCKSIZE; ++i) { + int uc = block*BMP_BLOCKSIZE + i; + QList<Ligature> l = ligatureHashes.value(uc); + if (!l.isEmpty()) { + qSort(l); + + ligatures.append(l.size()); + for (int j = 0; j < l.size(); ++j) { + Q_ASSERT(l.at(j).u2 == uc); + ligatures.append(l.at(j).u1); + ligatures.append(l.at(j).ligature); + } + b.decompositionPositions.append(tableIndex); + tableIndex += 2*l.size() + 1; + } else { + b.decompositionPositions.append(0xffff); + } + } + int index = blocks.indexOf(b); + if (index == -1) { + index = blocks.size(); + b.index = used; + used += BMP_BLOCKSIZE; + blocks.append(b); + } + blockMap.append(blocks.at(index).index); + } + + int bmp_blocks = blocks.size(); + Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE); + + int bmp_block_data = bmp_blocks*BMP_BLOCKSIZE*2; + int bmp_trie = BMP_END/BMP_BLOCKSIZE*2; + int bmp_mem = bmp_block_data + bmp_trie; + qDebug(" %d unique blocks in BMP.", blocks.size()); + qDebug(" block data uses: %d bytes", bmp_block_data); + qDebug(" trie data uses : %d bytes", bmp_trie); + qDebug("\n ligature data uses : %d bytes", ligatures.size()*2); + qDebug(" memory usage: %d bytes", bmp_mem + ligatures.size() * 2); + + QByteArray out; + + out += "static const unsigned short uc_ligature_trie[] = {\n"; + + // first write the map + out += " // 0 - 0x" + QByteArray::number(BMP_END, 16); + for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + if (!((i*BMP_BLOCKSIZE) % 0x1000)) + out += "\n"; + out += "\n "; + } + out += QByteArray::number(blockMap.at(i) + blockMap.size()); + out += ", "; + } + if (out.endsWith(' ')) + out.chop(1); + out += "\n"; + // write the data + for (int i = 0; i < blocks.size(); ++i) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n"; + const DecompositionBlock &b = blocks.at(i); + for (int j = 0; j < b.decompositionPositions.size(); ++j) { + if (!(j % 8)) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n "; + } + out += "0x" + QByteArray::number(b.decompositionPositions.at(j), 16); + out += ", "; + } + } + if (out.endsWith(' ')) + out.chop(1); + out += "\n};\n\n" + + "#define GET_LIGATURE_INDEX(u2) " + "(u2 < 0x" + QByteArray::number(BMP_END, 16) + " ? " + "uc_ligature_trie[uc_ligature_trie[u2>>" + QByteArray::number(BMP_SHIFT) + + "] + (u2 & 0x" + QByteArray::number(BMP_BLOCKSIZE-1, 16)+ ")] : 0xffff);\n\n" + + "static const unsigned short uc_ligature_map[] = {\n"; + + for (int i = 0; i < ligatures.size(); ++i) { + if (!(i % 8)) { + if (out.endsWith(' ')) + out.chop(1); + out += "\n "; + } + out += "0x" + QByteArray::number(ligatures.at(i), 16); + out += ", "; + } + + if (out.endsWith(' ')) + out.chop(1); + out += "\n};\n\n"; + + return out; +} + +QByteArray createCasingInfo() +{ + QByteArray out; + + out += "struct CasingInfo {\n" + " uint codePoint : 16;\n" + " uint flags : 8;\n" + " uint offset : 8;\n" + "};\n\n"; + + return out; +} + + +int main(int, char **) +{ + initAgeMap(); + initCategoryMap(); + initDecompositionMap(); + initDirectionMap(); + initJoiningMap(); + initGraphemeBreak(); + initWordBreak(); + initSentenceBreak(); + initLineBreak(); + + readUnicodeData(); + readBidiMirroring(); + readArabicShaping(); + readDerivedAge(); + readDerivedNormalizationProps(); + readSpecialCasing(); + readCaseFolding(); + // readBlocks(); + readScripts(); + readGraphemeBreak(); + readWordBreak(); + readSentenceBreak(); + readLineBreak(); + + computeUniqueProperties(); + QByteArray properties = createPropertyInfo(); + QByteArray compositions = createCompositionInfo(); + QByteArray ligatures = createLigatureInfo(); + QByteArray normalizationCorrections = createNormalizationCorrections(); + QByteArray scriptEnumDeclaration = createScriptEnumDeclaration(); + QByteArray scriptTableDeclaration = createScriptTableDeclaration(); + + QByteArray header = + "/****************************************************************************\n" + "**\n" + "** Copyright (C) 2011 Nokia Corporation and/or its subsidiary(-ies).\n" + "** All rights reserved.\n" + "** Contact: Nokia Corporation (qt-info@nokia.com)\n" + "**\n" + "** This file is part of the QtCore module of the Qt Toolkit.\n" + "**\n" + "** $QT_BEGIN_LICENSE:LGPL$\n" + "** No Commercial Usage\n" + "** This file contains pre-release code and may not be distributed.\n" + "** You may use this file in accordance with the terms and conditions\n" + "** contained in the Technology Preview License Agreement accompanying\n" + "** this package.\n" + "**\n" + "** GNU Lesser General Public License Usage\n" + "** Alternatively, this file may be used under the terms of the GNU Lesser\n" + "** General Public License version 2.1 as published by the Free Software\n" + "** Foundation and appearing in the file LICENSE.LGPL included in the\n" + "** packaging of this file. Please review the following information to\n" + "** ensure the GNU Lesser General Public License version 2.1 requirements\n" + "** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.\n" + "**\n" + "** In addition, as a special exception, Nokia gives you certain additional\n" + "** rights. These rights are described in the Nokia Qt LGPL Exception\n" + "** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.\n" + "**\n" + "** If you have questions regarding the use of this file, please contact\n" + "** Nokia at qt-info@nokia.com.\n" + "**\n" + "**\n" + "**\n" + "**\n" + "**\n" + "**\n" + "**\n" + "**\n" + "** $QT_END_LICENSE$\n" + "**\n" + "****************************************************************************/\n\n"; + + QByteArray note = + "/* This file is autogenerated from the Unicode "DATA_VERSION_S" database. Do not edit */\n\n"; + + QByteArray warning = + "//\n" + "// W A R N I N G\n" + "// -------------\n" + "//\n" + "// This file is not part of the Qt API. It exists for the convenience\n" + "// of internal files. This header file may change from version to version\n" + "// without notice, or even be removed.\n" + "//\n" + "// We mean it.\n" + "//\n\n"; + + QFile f("../../src/corelib/tools/qunicodetables.cpp"); + f.open(QFile::WriteOnly|QFile::Truncate); + f.write(header); + f.write(note); + f.write("QT_BEGIN_NAMESPACE\n\n"); + f.write(properties); + f.write(compositions); + f.write(ligatures); + f.write(normalizationCorrections); + f.write(scriptTableDeclaration); + f.write("QT_END_NAMESPACE\n"); + f.close(); + + f.setFileName("../../src/corelib/tools/qunicodetables_p.h"); + f.open(QFile::WriteOnly | QFile::Truncate); + f.write(header); + f.write(note); + f.write(warning); + f.write("#ifndef QUNICODETABLES_P_H\n" + "#define QUNICODETABLES_P_H\n\n" + "#include <QtCore/qchar.h>\n\n" + "QT_BEGIN_NAMESPACE\n\n"); + f.write("#define UNICODE_DATA_VERSION "DATA_VERSION_STR"\n\n"); + f.write("#define UNICODE_LAST_CODEPOINT "LAST_CODEPOINT_STR"\n\n"); + f.write("namespace QUnicodeTables {\n\n"); + f.write(property_string); + f.write("\n"); + f.write(scriptEnumDeclaration); + f.write("\n"); + f.write(grapheme_break_string); + f.write("\n"); + f.write(word_break_string); + f.write("\n"); + f.write(sentence_break_string); + f.write("\n"); + f.write(lineBreakClass); + f.write("\n"); + f.write(methods); + f.write("} // namespace QUnicodeTables\n\n" + "QT_END_NAMESPACE\n\n" + "#endif // QUNICODETABLES_P_H\n"); + f.close(); + + qDebug() << "maxMirroredDiff = " << hex << maxMirroredDiff; + qDebug() << "maxLowerCaseDiff = " << hex << maxLowerCaseDiff; + qDebug() << "maxUpperCaseDiff = " << hex << maxUpperCaseDiff; + qDebug() << "maxTitleCaseDiff = " << hex << maxTitleCaseDiff; + qDebug() << "maxCaseFoldDiff = " << hex << maxCaseFoldDiff; +#if 0 +// dump(0, 0x7f); +// dump(0x620, 0x640); +// dump(0x10000, 0x10020); +// dump(0x10800, 0x10820); + + qDebug("decompositionLength used:"); + int totalcompositions = 0; + int sum = 0; + for (int i = 1; i < 20; ++i) { + qDebug(" length %d used %d times", i, decompositionLength.value(i, 0)); + totalcompositions += i*decompositionLength.value(i, 0); + sum += decompositionLength.value(i, 0); + } + qDebug(" len decomposition map %d, average length %f, num composed chars %d", + totalcompositions, (float)totalcompositions/(float)sum, sum); + qDebug("highest composed character %x", highestComposedCharacter); + qDebug("num ligatures = %d highest=%x, maxLength=%d", numLigatures, highestLigature, longestLigature); + + qBubbleSort(ligatures); + for (int i = 0; i < ligatures.size(); ++i) + qDebug("%s", ligatures.at(i).data()); + +// qDebug("combiningClass usage:"); +// int numClasses = 0; +// for (int i = 0; i < 255; ++i) { +// int num = combiningClassUsage.value(i, 0); +// if (num) { +// ++numClasses; +// qDebug(" combiningClass %d used %d times", i, num); +// } +// } +// qDebug("total of %d combining classes used", numClasses); + +#endif +} |