From 5f3f0731269b9a8700604dd3de57aa6d57db5367 Mon Sep 17 00:00:00 2001 From: Ievgenii Meshcheriakov Date: Mon, 30 Aug 2021 11:35:02 +0200 Subject: unicode: Build IDNA map superstring greedily This slightly reduces memory required for mapping tables: uncompressed size: 1146 characters consolidated size: 904 characters memory usage: 47856 bytes Task-number: QTBUG-85323 Change-Id: Ic960789e433e80acf1a4e36791533a1c55a735c8 Reviewed-by: Edward Welbourne --- util/unicode/main.cpp | 107 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 100 insertions(+), 7 deletions(-) (limited to 'util/unicode') diff --git a/util/unicode/main.cpp b/util/unicode/main.cpp index 7385795ce7..3f7f3956f9 100644 --- a/util/unicode/main.cpp +++ b/util/unicode/main.cpp @@ -33,6 +33,7 @@ #include #include #include +#include #include #if 0 #include @@ -2499,6 +2500,29 @@ static void resolveIdnaStatus() } } +/* + Return maximum overlap for strings left and right in this order. + + The input strings should not be substrings of each other. +*/ +static qsizetype overlap(const QString &left, const QString &right) +{ + for (qsizetype n = std::min(left.size(), right.size()) - 1; n > 0; n--) { + if (left.last(n) == right.first(n)) + return n; + } + return 0; +} + +using GraphNode = unsigned int; + +struct OverlapGraphEdge +{ + GraphNode start; + GraphNode end; + qsizetype overlap; +}; + /* Returns a common supersting of all inputs. @@ -2506,24 +2530,93 @@ static void resolveIdnaStatus() possible size, but the shortest common superstring problem is know to be NP-hard so an approximation must be used here. - This function implements a simple algorithm that consists of appending - input strings to the result if they are not already contained in it. + This function implements the greedy algorithm for building the superstring. As an optimization this function is allowed to destroy its inputs. */ static QString buildSuperstring(QList &inputs) { - // Sort by size, longest strings first. + // Ensure that the inputs don't contain substrings. + // First, sort the array by length to make substring removal easier. std::sort(inputs.begin(), inputs.end(), [](const QString &a, const QString &b) { return a.size() == b.size() ? a > b : a.size() > b.size(); }); - QString superstring; + // Remove duplicates and other substrings + for (auto i = inputs.begin() + 1; i != inputs.end();) { + bool isSubstring = std::any_of(inputs.begin(), i, [i](const QString &s) { + return s.contains(*i); + }); + i = isSubstring ? inputs.erase(i) : i + 1; + } + + // Build overlap graph for the remaining inputs. It is fully-connected. + QList graphEdges; + graphEdges.reserve(inputs.size() * (inputs.size() - 1)); + + for (GraphNode i = 0; i < inputs.size(); i++) { + for (GraphNode j = 0; j < inputs.size(); j++) { + if (i != j) + graphEdges.append(OverlapGraphEdge {i, j, overlap(inputs[i], inputs[j])}); + } + } + + // Build a Hamiltonian path through the overlap graph, taking nodes with highest overlap + // first. + std::sort(graphEdges.begin(), graphEdges.end(), [](const auto &a, const auto &b) { + return a.overlap == b.overlap + ? a.start == b.start ? a.end < b.end : a.start < b.start + : a.overlap > b.overlap; + }); + + QBitArray starts(inputs.size()); + QBitArray ends(inputs.size()); + QMap pathEdges; + + auto createsCycle = [&](const OverlapGraphEdge &edge) { + if (!starts[edge.end] || !ends[edge.start]) + return false; + Q_ASSERT(!pathEdges.contains(edge.start)); // Caller checks it's not yet a start. + + GraphNode node = edge.end; + while (pathEdges.contains(node)) + node = pathEdges[node].end; + + return node == edge.start; + }; + + for (const auto &edge : graphEdges) { + if (!starts[edge.start] && !ends[edge.end] && !createsCycle(edge)) { + starts.setBit(edge.start); + ends.setBit(edge.end); + pathEdges[edge.start] = edge; + if (pathEdges.size() == inputs.size() - 1) + break; + } + } + + Q_ASSERT(ends.count(false) == 1); + Q_ASSERT(starts.count(false) == 1); + + // Find the start node of the path. + GraphNode node = 0; + while (node < ends.size() && ends[node]) + node++; + Q_ASSERT(node < ends.size()); + + QString superstring = inputs[node]; + qsizetype pathNodes = 1; // Count path nodes for sanity check + + while (pathEdges.contains(node)) { + const auto &edge = pathEdges[node]; + Q_ASSERT(edge.start == node); + + superstring.append(QStringView { inputs[edge.end] }.sliced(edge.overlap)); - for (const QString &s : inputs) { - if (!superstring.contains(s)) - superstring.append(s); + node = edge.end; + pathNodes++; } + Q_ASSERT(pathNodes == inputs.size()); return superstring; } -- cgit v1.2.3