From a111dd26b1706d011a46ee3435ea025010eeab4e Mon Sep 17 00:00:00 2001 From: Edward Welbourne Date: Wed, 12 Aug 2020 14:17:50 +0200 Subject: Document the indexing used in the Unicode tables Make clear why we don't need to assert against out-of-bounda accesses in the generated code, provided the code point is within its bound, (Using one table's early entries as indices into later in the same table at which to look up indices into another table made it a little hard to work out what was going on, especially as nothing told me about the early / late distinction. Record what I discovered, to save the next person to stumble into this some confusion.) Change-Id: I8e5771a7f3d70c1911aeae1b0cabe5c47bc7e9c7 Reviewed-by: Konstantin Ritt Reviewed-by: Thiago Macieira --- util/unicode/main.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 66ebb22d0a..845d837a5e 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -2330,6 +2330,8 @@ static QByteArray createPropertyInfo() QList blockMap; int used = 0; + // Group BMP data into blocks indexed by their 12 most significant bits + // (blockId = ucs >> 5): for (int block = 0; block < BMP_END/BMP_BLOCKSIZE; ++block) { UniqueBlock b; b.values.reserve(BMP_BLOCKSIZE); @@ -2349,6 +2351,8 @@ static QByteArray createPropertyInfo() } int bmp_blocks = uniqueBlocks.size(); + // Group SMP data into blocks indexed by their 9 most significant bits, plus + // an offset to put them after the BMP blocks (blockId = (ucs >> 8) + 0x880): for (int block = BMP_END/SMP_BLOCKSIZE; block < SMP_END/SMP_BLOCKSIZE; ++block) { UniqueBlock b; b.values.reserve(SMP_BLOCKSIZE); @@ -2386,10 +2390,11 @@ static QByteArray createPropertyInfo() qDebug("\n properties data uses : %d bytes", prop_data); qDebug(" memory usage: %d bytes", bmp_mem + smp_mem + prop_data); + Q_ASSERT(blockMap.size() == BMP_END/BMP_BLOCKSIZE +(SMP_END-BMP_END)/SMP_BLOCKSIZE); // 0x1870 Q_ASSERT(blockMap.last() + blockMap.size() < (1<<(sizeof(unsigned short)*8))); QByteArray out = "static const unsigned short uc_property_trie[] = {\n"; - // first write the map + // First write the map from blockId to indices of unique blocks: out += " // [0x0..0x" + QByteArray::number(BMP_END, 16) + ")"; for (int i = 0; i < BMP_END/BMP_BLOCKSIZE; ++i) { if (!(i % 8)) { @@ -2419,7 +2424,9 @@ static QByteArray createPropertyInfo() if (out.endsWith(' ')) out.chop(1); out += "\n"; - // write the data + // Then write the contents of the unique blocks, at the anticipated indices. + // Each unique block is a list of UnicodeData::propertyIndex values, whch + // are indices into the uc_properties table. for (int i = 0; i < uniqueBlocks.size(); ++i) { if (out.endsWith(' ')) out.chop(1); -- cgit v1.2.3