/* * Copyright (C) 2015 Apple Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include "config.h" #include "CombinedURLFilters.h" #if ENABLE(CONTENT_EXTENSIONS) #include "HashableActionList.h" #include "Term.h" #include #include #include namespace WebCore { namespace ContentExtensions { struct PrefixTreeEdge { const Term* term; std::unique_ptr child; }; typedef Vector PrefixTreeEdges; struct PrefixTreeVertex { PrefixTreeEdges edges; }; struct ReverseSuffixTreeVertex; struct ReverseSuffixTreeEdge { const Term* term; std::unique_ptr child; }; typedef Vector ReverseSuffixTreeEdges; struct ReverseSuffixTreeVertex { ReverseSuffixTreeEdges edges; uint32_t nodeId; }; typedef HashMap ReverseSuffixTreeRoots; #if CONTENT_EXTENSIONS_PERFORMANCE_REPORTING static size_t recursiveMemoryUsed(const PrefixTreeVertex& vertex) { size_t size = sizeof(PrefixTreeVertex) + vertex.edges.capacity() * sizeof(PrefixTreeEdge); for (const auto& edge : vertex.edges) { ASSERT(edge.child); size += recursiveMemoryUsed(*edge.child.get()); } return size; } size_t CombinedURLFilters::memoryUsed() const { ASSERT(m_prefixTreeRoot); size_t actionsSize = 0; for (const auto& slot : m_actions) actionsSize += slot.value.capacity() * sizeof(uint64_t); return sizeof(CombinedURLFilters) + m_alphabet.memoryUsed() + recursiveMemoryUsed(*m_prefixTreeRoot.get()) + sizeof(HashMap) + m_actions.capacity() * (sizeof(PrefixTreeVertex*) + sizeof(ActionList)) + actionsSize; } #endif #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING static String prefixTreeVertexToString(const PrefixTreeVertex& vertex, const HashMap& actions, unsigned depth) { StringBuilder builder; while (depth--) builder.append(" "); builder.append("vertex actions: "); auto actionsSlot = actions.find(&vertex); if (actionsSlot != actions.end()) { for (uint64_t action : actionsSlot->value) { builder.appendNumber(action); builder.append(','); } } builder.append('\n'); return builder.toString(); } static void recursivePrint(const PrefixTreeVertex& vertex, const HashMap& actions, unsigned depth) { dataLogF("%s", prefixTreeVertexToString(vertex, actions, depth).utf8().data()); for (const auto& edge : vertex.edges) { StringBuilder builder; for (unsigned i = 0; i < depth * 2; ++i) builder.append(' '); builder.append("vertex edge: "); builder.append(edge.term->toString()); builder.append('\n'); dataLogF("%s", builder.toString().utf8().data()); ASSERT(edge.child); recursivePrint(*edge.child.get(), actions, depth + 1); } } void CombinedURLFilters::print() const { recursivePrint(*m_prefixTreeRoot.get(), m_actions, 0); } #endif CombinedURLFilters::CombinedURLFilters() : m_prefixTreeRoot(std::make_unique()) { } CombinedURLFilters::~CombinedURLFilters() { } bool CombinedURLFilters::isEmpty() const { return m_prefixTreeRoot->edges.isEmpty(); } void CombinedURLFilters::addDomain(uint64_t actionId, const String& domain) { unsigned domainLength = domain.length(); if (domainLength && domain[0] == '*') { // If domain starts with a '*' then it means match domain and its subdomains, like (^|.)domain$ // This way a domain of "*webkit.org" will match "bugs.webkit.org" and "webkit.org". Vector prependDot; Vector prependBeginningOfLine; prependDot.reserveInitialCapacity(domainLength + 2); prependBeginningOfLine.reserveInitialCapacity(domainLength); // This is just no .* at the beginning. Term canonicalDotStar(Term::UniversalTransition); canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore); prependDot.uncheckedAppend(canonicalDotStar); prependDot.uncheckedAppend(Term('.', true)); for (unsigned i = 1; i < domainLength; i++) { ASSERT(isASCII(domain[i])); ASSERT(!isASCIIUpper(domain[i])); prependDot.uncheckedAppend(Term(domain[i], true)); prependBeginningOfLine.uncheckedAppend(Term(domain[i], true)); } prependDot.uncheckedAppend(Term::EndOfLineAssertionTerm); prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm); addPattern(actionId, prependDot); addPattern(actionId, prependBeginningOfLine); } else { // This is like adding ^domain$, but interpreting domain as a series of characters, not a regular expression. // "webkit.org" will match "webkit.org" but not "bugs.webkit.org". Vector prependBeginningOfLine; prependBeginningOfLine.reserveInitialCapacity(domainLength + 1); // This is just no .* at the beginning. for (unsigned i = 0; i < domainLength; i++) { ASSERT(isASCII(domain[i])); ASSERT(!isASCIIUpper(domain[i])); prependBeginningOfLine.uncheckedAppend(Term(domain[i], true)); } prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm); addPattern(actionId, prependBeginningOfLine); } } void CombinedURLFilters::addPattern(uint64_t actionId, const Vector& pattern) { ASSERT_WITH_MESSAGE(!pattern.isEmpty(), "The parser should have excluded empty patterns before reaching CombinedURLFilters."); if (pattern.isEmpty()) return; // Extend the prefix tree with the new pattern. PrefixTreeVertex* lastPrefixTree = m_prefixTreeRoot.get(); for (const Term& term : pattern) { size_t nextEntryIndex = WTF::notFound; for (size_t i = 0; i < lastPrefixTree->edges.size(); ++i) { if (*lastPrefixTree->edges[i].term == term) { nextEntryIndex = i; break; } } if (nextEntryIndex != WTF::notFound) lastPrefixTree = lastPrefixTree->edges[nextEntryIndex].child.get(); else { lastPrefixTree->edges.append(PrefixTreeEdge({m_alphabet.interned(term), std::make_unique()})); lastPrefixTree = lastPrefixTree->edges.last().child.get(); } } auto addResult = m_actions.add(lastPrefixTree, ActionList()); ActionList& actions = addResult.iterator->value; if (actions.find(actionId) == WTF::notFound) actions.append(actionId); } struct ActiveSubtree { ActiveSubtree(PrefixTreeVertex& vertex, ImmutableCharNFANodeBuilder&& nfaNode, unsigned edgeIndex) : vertex(vertex) , nfaNode(WTFMove(nfaNode)) , edgeIndex(edgeIndex) { } PrefixTreeVertex& vertex; ImmutableCharNFANodeBuilder nfaNode; unsigned edgeIndex; }; static void generateInfixUnsuitableForReverseSuffixTree(NFA& nfa, Vector& stack, const HashMap& actions) { // To avoid conflicts, we use the reverse suffix tree for subtrees that do not merge // in the prefix tree. // // We only unify the suffixes to the actions on the leaf. // If there are actions inside the tree, we generate the part of the subtree up to the action. // // If we accidentally insert a node with action inside the reverse-suffix-tree, we would create // new actions on unrelated pattern when unifying their suffixes. for (unsigned i = stack.size() - 1; i--;) { ActiveSubtree& activeSubtree = stack[i]; if (activeSubtree.nfaNode.isValid()) return; RELEASE_ASSERT_WITH_MESSAGE(i > 0, "The bottom of the stack must be the root of our fixed-length subtree. It should have it the isValid() case above."); auto actionsIterator = actions.find(&activeSubtree.vertex); bool hasActionInsideTree = actionsIterator != actions.end(); // Stricto sensu, we should count the number of exit edges with fixed length. // That is costly and unlikely to matter in practice. bool hasSingleOutcome = activeSubtree.vertex.edges.size() == 1; if (hasActionInsideTree || !hasSingleOutcome) { // Go back to the end of the subtree that has already been generated. // From there, generate everything up to the vertex we found. unsigned end = i; unsigned beginning = end; ActiveSubtree* sourceActiveSubtree = nullptr; while (beginning--) { ActiveSubtree& activeSubtree = stack[beginning]; if (activeSubtree.nfaNode.isValid()) { sourceActiveSubtree = &activeSubtree; break; } } ASSERT_WITH_MESSAGE(sourceActiveSubtree, "The root should always have a valid generator."); for (unsigned stackIndex = beginning + 1; stackIndex <= end; ++stackIndex) { ImmutableCharNFANodeBuilder& sourceNode = sourceActiveSubtree->nfaNode; ASSERT(sourceNode.isValid()); auto& edge = sourceActiveSubtree->vertex.edges[sourceActiveSubtree->edgeIndex]; ActiveSubtree& destinationActiveSubtree = stack[stackIndex]; destinationActiveSubtree.nfaNode = edge.term->generateGraph(nfa, sourceNode, actions.get(&destinationActiveSubtree.vertex)); sourceActiveSubtree = &destinationActiveSubtree; } return; } } } static void generateSuffixWithReverseSuffixTree(NFA& nfa, Vector& stack, const HashMap& actions, ReverseSuffixTreeRoots& reverseSuffixTreeRoots) { ActiveSubtree& leafSubtree = stack.last(); ASSERT_WITH_MESSAGE(!leafSubtree.nfaNode.isValid(), "The leaf should never be generated by the code above, it should always be inserted into the prefix tree."); ActionList actionList = actions.get(&leafSubtree.vertex); ASSERT_WITH_MESSAGE(!actionList.isEmpty(), "Our prefix tree should always have actions on the leaves by construction."); HashableActionList hashableActionList(actionList); auto rootAddResult = reverseSuffixTreeRoots.add(hashableActionList, ReverseSuffixTreeVertex()); if (rootAddResult.isNewEntry) { ImmutableCharNFANodeBuilder newNode(nfa); newNode.setActions(actionList.begin(), actionList.end()); rootAddResult.iterator->value.nodeId = newNode.nodeId(); } ReverseSuffixTreeVertex* activeReverseSuffixTreeVertex = &rootAddResult.iterator->value; uint32_t destinationNodeId = rootAddResult.iterator->value.nodeId; unsigned stackPosition = stack.size() - 2; while (true) { ActiveSubtree& source = stack[stackPosition]; auto& edge = source.vertex.edges[source.edgeIndex]; // This is the end condition: when we meet a node that has already been generated, // we just need to connect our backward tree to the forward tree. // // We *must not* add this last node to the reverse-suffix tree. That node can have // transitions back to earlier part of the prefix tree. If the prefix tree "caches" // such node, it would create new transitions that did not exist in the source language. if (source.nfaNode.isValid()) { stack.shrink(stackPosition + 1); edge.term->generateGraph(nfa, source.nfaNode, destinationNodeId); return; } --stackPosition; ASSERT_WITH_MESSAGE(!actions.contains(&source.vertex), "Any node with final actions should have been created before hitting the reverse suffix-tree."); ReverseSuffixTreeEdge* existingEdge = nullptr; for (ReverseSuffixTreeEdge& potentialExistingEdge : activeReverseSuffixTreeVertex->edges) { if (edge.term == potentialExistingEdge.term) { existingEdge = &potentialExistingEdge; break; } } if (existingEdge) activeReverseSuffixTreeVertex = existingEdge->child.get(); else { ImmutableCharNFANodeBuilder newNode(nfa); edge.term->generateGraph(nfa, newNode, destinationNodeId); std::unique_ptr newVertex(new ReverseSuffixTreeVertex()); newVertex->nodeId = newNode.nodeId(); ReverseSuffixTreeVertex* newVertexAddress = newVertex.get(); activeReverseSuffixTreeVertex->edges.append(ReverseSuffixTreeEdge({ edge.term, WTFMove(newVertex) })); activeReverseSuffixTreeVertex = newVertexAddress; } destinationNodeId = activeReverseSuffixTreeVertex->nodeId; ASSERT(source.vertex.edges.size() == 1); source.vertex.edges.clear(); } RELEASE_ASSERT_NOT_REACHED(); } static void clearReverseSuffixTree(ReverseSuffixTreeRoots& reverseSuffixTreeRoots) { // We cannot rely on the destructor being called in order from top to bottom as we may overflow // the stack. Instead, we go depth first in the reverse-suffix-tree. for (auto& slot : reverseSuffixTreeRoots) { Vector stack; stack.append(&slot.value); while (true) { ReverseSuffixTreeVertex* top = stack.last(); if (top->edges.isEmpty()) { stack.removeLast(); if (stack.isEmpty()) break; stack.last()->edges.removeLast(); } else stack.append(top->edges.last().child.get()); } } reverseSuffixTreeRoots.clear(); } static void generateNFAForSubtree(NFA& nfa, ImmutableCharNFANodeBuilder&& subtreeRoot, PrefixTreeVertex& root, const HashMap& actions, size_t maxNFASize) { // This recurses the subtree of the prefix tree. // For each edge that has fixed length (no quantifiers like ?, *, or +) it generates the nfa graph, // recurses into children, and deletes any processed leaf nodes. ReverseSuffixTreeRoots reverseSuffixTreeRoots; Vector stack; if (!root.edges.isEmpty()) stack.append(ActiveSubtree(root, WTFMove(subtreeRoot), 0)); bool nfaTooBig = false; // Generate graphs for each subtree that does not contain any quantifiers. while (!stack.isEmpty()) { PrefixTreeVertex& vertex = stack.last().vertex; const unsigned edgeIndex = stack.last().edgeIndex; if (edgeIndex < vertex.edges.size()) { auto& edge = vertex.edges[edgeIndex]; // Clean up any processed leaves and return early if we are past the maxNFASize. if (nfaTooBig) { stack.last().edgeIndex = stack.last().vertex.edges.size(); continue; } // Quantified edges in the subtree will be a part of another NFA. if (!edge.term->hasFixedLength()) { stack.last().edgeIndex++; continue; } ASSERT(edge.child.get()); ImmutableCharNFANodeBuilder emptyBuilder; stack.append(ActiveSubtree(*edge.child.get(), WTFMove(emptyBuilder), 0)); } else { bool isLeaf = vertex.edges.isEmpty(); ASSERT(edgeIndex == vertex.edges.size()); vertex.edges.removeAllMatching([](PrefixTreeEdge& edge) { return !edge.term; }); if (isLeaf) { generateInfixUnsuitableForReverseSuffixTree(nfa, stack, actions); generateSuffixWithReverseSuffixTree(nfa, stack, actions, reverseSuffixTreeRoots); // Only stop generating an NFA at a leaf to ensure we have a correct NFA. We could go slightly over the maxNFASize. if (nfa.nodes.size() > maxNFASize) nfaTooBig = true; } else stack.removeLast(); if (!stack.isEmpty()) { auto& activeSubtree = stack.last(); auto& edge = activeSubtree.vertex.edges[stack.last().edgeIndex]; if (edge.child->edges.isEmpty()) edge.term = nullptr; // Mark this leaf for deleting. activeSubtree.edgeIndex++; } } } clearReverseSuffixTree(reverseSuffixTreeRoots); } void CombinedURLFilters::processNFAs(size_t maxNFASize, std::function handler) { #if CONTENT_EXTENSIONS_STATE_MACHINE_DEBUGGING print(); #endif while (true) { // Traverse out to a leaf. Vector stack; PrefixTreeVertex* vertex = m_prefixTreeRoot.get(); while (true) { ASSERT(vertex); stack.append(vertex); if (vertex->edges.isEmpty()) break; vertex = vertex->edges.last().child.get(); } if (stack.size() == 1) break; // We're done once we have processed and removed all the edges in the prefix tree. // Find the prefix root for this NFA. This is the vertex after the last term with a quantifier if there is one, // or the root if there are no quantifiers left. while (stack.size() > 1) { if (!stack[stack.size() - 2]->edges.last().term->hasFixedLength()) break; stack.removeLast(); } ASSERT_WITH_MESSAGE(!stack.isEmpty(), "At least the root should be in the stack"); // Make an NFA with the subtrees for whom this is also the last quantifier (or who also have no quantifier). NFA nfa; { // Put the prefix into the NFA. ImmutableCharNFANodeBuilder lastNode(nfa); for (unsigned i = 0; i < stack.size() - 1; ++i) { const PrefixTreeEdge& edge = stack[i]->edges.last(); ImmutableCharNFANodeBuilder newNode = edge.term->generateGraph(nfa, lastNode, m_actions.get(edge.child.get())); lastNode = WTFMove(newNode); } // Put the non-quantified vertices in the subtree into the NFA and delete them. ASSERT(stack.last()); generateNFAForSubtree(nfa, WTFMove(lastNode), *stack.last(), m_actions, maxNFASize); } nfa.finalize(); handler(WTFMove(nfa)); // Clean up any processed leaf nodes. while (true) { if (stack.size() > 1) { if (stack[stack.size() - 1]->edges.isEmpty()) { stack[stack.size() - 2]->edges.removeLast(); stack.removeLast(); } else break; // Vertex is not a leaf. } else break; // Leave the empty root. } } } } // namespace ContentExtensions } // namespace WebCore #endif // ENABLE(CONTENT_EXTENSIONS)