diff options
Diffstat (limited to 'src/3rdparty/libpsl/src/psl-make-dafsa')
-rwxr-xr-x | src/3rdparty/libpsl/src/psl-make-dafsa | 692 |
1 files changed, 692 insertions, 0 deletions
diff --git a/src/3rdparty/libpsl/src/psl-make-dafsa b/src/3rdparty/libpsl/src/psl-make-dafsa new file mode 100755 index 0000000000..8a374053d5 --- /dev/null +++ b/src/3rdparty/libpsl/src/psl-make-dafsa @@ -0,0 +1,692 @@ +#!/usr/bin/env python +# Copyright 2014 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE.chromium file. + +""" +A Deterministic acyclic finite state automaton (DAFSA) is a compact +representation of an unordered word list (dictionary). + +https://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton + +This python program converts a list of strings to a byte array in C++. +This python program fetches strings and return values from a gperf file +and generates a C++ file with a byte array representing graph that can be +used as a memory efficient replacement for the perfect hash table. + +The input strings must consist of printable 7-bit ASCII characters or UTF-8 +multibyte sequences. Control characters in the range [0x00-0x1F] are not +allowed. The return values must be one digit integers. . + +In this program a DAFSA is a diamond shaped graph starting at a common +source node and ending at a common sink node. All internal nodes contain +a label and each word is represented by the labels in one path from +the source node to the sink node. + +The following python represention is used for nodes: + + Source node: [ children ] + Internal node: (label, [ children ]) + Sink node: None + +The graph is first compressed by prefixes like a trie. In the next step +suffixes are compressed so that the graph gets diamond shaped. Finally +one to one linked nodes are replaced by nodes with the labels joined. + +The order of the operations is crucial since lookups will be performed +starting from the source with no backtracking. Thus a node must have at +most one child with a label starting by the same character. The output +is also arranged so that all jumps are to increasing addresses, thus forward +in memory. + +The generated output has suffix free decoding so that the sign of leading +bits in a link (a reference to a child node) indicate if it has a size of one, +two or three bytes and if it is the last outgoing link from the actual node. +A node label is terminated by a byte with the leading bit set. + +The generated byte array can described by the following BNF: + +<byte> ::= < 8-bit value in range [0x00-0xFF] > + +<char> ::= < byte in range [0x1F-0x7F] > +<end_char> ::= < char + 0x80, byte in range [0x9F-0xFF] > +<return value> ::= < value + 0x80, byte in range [0x80-0x8F] > + +<offset1> ::= < byte in range [0x00-0x3F] > +<offset2> ::= < byte in range [0x40-0x5F] > +<offset3> ::= < byte in range [0x60-0x7F] > + +<end_offset1> ::= < byte in range [0x80-0xBF] > +<end_offset2> ::= < byte in range [0xC0-0xDF] > +<end_offset3> ::= < byte in range [0xE0-0xFF] > + +<prefix> ::= <char> + +<label> ::= <end_char> + | <char> <label> + +<end_label> ::= <return_value> + | <char> <end_label> + +<offset> ::= <offset1> + | <offset2> <byte> + | <offset3> <byte> <byte> + +<end_offset> ::= <end_offset1> + | <end_offset2> <byte> + | <end_offset3> <byte> <byte> + +<offsets> ::= <end_offset> + | <offset> <offsets> + +<source> ::= <offsets> + +<node> ::= <label> <offsets> + | <prefix> <node> + | <end_label> + +<graph> ::= <graph> + | <graph> <node> + +<version> ::= <empty> # The DAFSA was generated in ASCII mode. + | < byte value 0x01 > # The DAFSA was generated in UTF-8 mode. + +<dafsa> ::= <graph> <version> + +Decoding: + +<char> -> character +<end_char> & 0x7F -> character +<return value> & 0x0F -> integer +<offset1 & 0x3F> -> integer +((<offset2> & 0x1F>) << 8) + <byte> -> integer +((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer + +end_offset1, end_offset2 and and_offset3 are decoded same as offset1, +offset2 and offset3 respectively. + +The first offset in a list of offsets is the distance in bytes between the +offset itself and the first child node. Subsequent offsets are the distance +between previous child node and next child node. Thus each offset links a node +to a child node. The distance is always counted between start addresses, i.e. +first byte in decoded offset or first byte in child node. + +Transcoding of UTF-8 multibyte sequences: + +The original DAFSA format was limited to 7-bit printable ASCII characters in +range [0x20-0xFF], but has been extended to allow UTF-8 multibyte sequences. +By transcoding of such characters the new format preserves compatibility with +old parsers, so that a DAFSA in the extended format can be used by an old +parser without false positives, although strings containing transcoded +characters will never match. Since the format is extended rather than being +changed, a parser supporting the new format will automatically support data +generated in the old format. + +Transcoding is performed by insertion of a start byte with the special value +0x1F, followed by 2-4 bytes shifted into the range [0x40-0x7F], thus inside +the range of printable ASCII. + +2-byte: 110nnnnn, 10nnnnnn -> 00011111, 010nnnnn, 01nnnnnn + +3-byte: 1110nnnn, 10nnnnnn, 10nnnnnn -> 00011111, 0110nnnn, 01nnnnnn, 01nnnnnn + +4-byte: 11110nnn, 10nnnnnn, 10nnnnnn, 10nnnnnn -> + 00011111, 01110nnn, 01nnnnnn, 01nnnnnn, 01nnnnnn + +Example 1: + +%% +aa, 1 +a, 2 +%% + +The input is first parsed to a list of words: +["aa1", "a2"] + +A fully expanded graph is created from the words: +source = [node1, node4] +node1 = ("a", [node2]) +node2 = ("a", [node3]) +node3 = ("\x01", [sink]) +node4 = ("a", [node5]) +node5 = ("\x02", [sink]) +sink = None + +Compression results in the following graph: +source = [node1] +node1 = ("a", [node2, node3]) +node2 = ("\x02", [sink]) +node3 = ("a\x01", [sink]) +sink = None + +A C++ representation of the compressed graph is generated: + +constexpr unsigned char dafsa[7] = { + 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81, +}; + +The bytes in the generated array has the following meaning: + + 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1 + + 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a" + 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4 + + 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5 + 4: 0x82 <return_value> 0x82 & 0x0F -> return 2 + + 5: 0x61 <char> label character 0x61 -> match "a" + 6: 0x81 <return_value> 0x81 & 0x0F -> return 1 + +Example 2: + +%% +aa, 1 +bbb, 2 +baa, 1 +%% + +The input is first parsed to a list of words: +["aa1", "bbb2", "baa1"] + +Compression results in the following graph: +source = [node1, node2] +node1 = ("b", [node2, node3]) +node2 = ("aa\x01", [sink]) +node3 = ("bb\x02", [sink]) +sink = None + +A C++ representation of the compressed graph is generated: + +constexpr unsigned char dafsa[11] = { + 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82, +}; + +The bytes in the generated array has the following meaning: + + 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2 + 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5 + + 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b" + 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5 + 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8 + + 5: 0x61 <char> label character 0x61 -> match "a" + 6: 0x61 <char> label character 0x61 -> match "a" + 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 + + 8: 0x62 <char> label character 0x62 -> match "b" + 9: 0x62 <char> label character 0x62 -> match "b" +10: 0x82 <return_value> 0x82 & 0x0F -> return 2 +""" + +import sys +import os.path +import hashlib + +class InputError(Exception): + """Exception raised for errors in the input file.""" + +# Length of a character starting at a given byte. +char_length_table = ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x00-0x0F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x10-0x1F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x20-0x2F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x30-x03F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x40-0x4F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x50-x05F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x60-0x6F + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # 0x70-x07F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x80-0x8F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0x90-0x9F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xA0-0xAF + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 0xB0-0xBF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xC0-0xCF + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # 0xD0-0xDF + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, # 0xE0-0xEF + 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 ) # 0xF0-0xFF + +def to_bytes(n): + """Converts an integer value to a bytes object.""" + return bytes(bytearray((n,))) + +def to_dafsa(words, utf_mode): + """Generates a DAFSA from a word list and returns the source node. + + Each word is split into characters so that each character is represented by + a unique node. It is assumed the word list is not empty. + """ + if not words: + raise InputError('The domain list must not be empty') + def to_nodes(word, multibyte_length): + """Split words into characters""" + byte = ord(word[:1]) + if multibyte_length: + # Consume next byte in multibyte sequence. + if byte & 0xC0 != 0x80: + raise InputError('Invalid UTF-8 multibyte sequence') + return to_bytes(byte ^ 0xC0), [to_nodes(word[1:], multibyte_length - 1)] + char_length = char_length_table[byte] + if char_length == 1: + # 7-bit printable ASCII. + if len(word) == 1: + return to_bytes(int(word[:1], 16) & 0x0F), [None] + return word[:1], [to_nodes(word[1:], 0)] + elif char_length > 1: + # Leading byte in multibyte sequence. + if not utf_mode: + raise InputError('UTF-8 encoded characters are not allowed in ASCII mode') + if len(word) <= char_length: + raise InputError('Unterminated UTF-8 multibyte sequence') + return to_bytes(0x1F), [(to_bytes(byte ^ 0x80), [to_nodes(word[1:], char_length - 1)])] + # Unexpected character. + raise InputError('Domain names must be printable ASCII or UTF-8') + + return [to_nodes(word, 0) for word in words] + +def to_words(node): + """Generates a word list from all paths starting from an internal node.""" + if not node: + return [b''] + return [(node[0] + word) for child in node[1] for word in to_words(child)] + + +def reverse(dafsa): + """Generates a new DAFSA that is reversed, so that the old sink node becomes + the new source node. + """ + sink = [] + nodemap = {} + + def dfs(node, parent): + """Creates reverse nodes. + + A new reverse node will be created for each old node. The new node will + get a reversed label and the parents of the old node as children. + """ + if not node: + sink.append(parent) + elif id(node) not in nodemap: + nodemap[id(node)] = (node[0][::-1], [parent]) + for child in node[1]: + dfs(child, nodemap[id(node)]) + else: + nodemap[id(node)][1].append(parent) + + for node in dafsa: + dfs(node, None) + return sink + + +def join_labels(dafsa): + """Generates a new DAFSA where internal nodes are merged if there is a one to + one connection. + """ + parentcount = {id(None): 2} + nodemap = {id(None): None} + + def count_parents(node): + """Count incoming references""" + if id(node) in parentcount: + parentcount[id(node)] += 1 + else: + parentcount[id(node)] = 1 + for child in node[1]: + count_parents(child) + + def join(node): + """Create new nodes""" + if id(node) not in nodemap: + children = [join(child) for child in node[1]] + if len(children) == 1 and parentcount[id(node[1][0])] == 1: + child = children[0] + nodemap[id(node)] = (node[0] + child[0], child[1]) + else: + nodemap[id(node)] = (node[0], children) + return nodemap[id(node)] + + for node in dafsa: + count_parents(node) + return [join(node) for node in dafsa] + + +def join_suffixes(dafsa): + """Generates a new DAFSA where nodes that represent the same word lists + towards the sink are merged. + """ + nodemap = {frozenset((b'',)): None} + + def join(node): + """Returns a matching node. A new node is created if no matching node + exists. The graph is accessed in dfs order. + """ + suffixes = frozenset(to_words(node)) + if suffixes not in nodemap: + nodemap[suffixes] = (node[0], [join(child) for child in node[1]]) + return nodemap[suffixes] + + return [join(node) for node in dafsa] + + +def top_sort(dafsa): + """Generates list of nodes in topological sort order.""" + incoming = {} + + def count_incoming(node): + """Counts incoming references.""" + if node: + if id(node) not in incoming: + incoming[id(node)] = 1 + for child in node[1]: + count_incoming(child) + else: + incoming[id(node)] += 1 + + for node in dafsa: + count_incoming(node) + + for node in dafsa: + incoming[id(node)] -= 1 + + waiting = [node for node in dafsa if incoming[id(node)] == 0] + nodes = [] + + while waiting: + node = waiting.pop() + assert incoming[id(node)] == 0 + nodes.append(node) + for child in node[1]: + if child: + incoming[id(child)] -= 1 + if incoming[id(child)] == 0: + waiting.append(child) + return nodes + + +def encode_links(children, offsets, current): + """Encodes a list of children as one, two or three byte offsets.""" + if not children[0]: + # This is an <end_label> node and no links follow such nodes + assert len(children) == 1 + return [] + guess = 3 * len(children) + assert children + children = sorted(children, key=lambda x: -offsets[id(x)]) + while True: + offset = current + guess + buf = [] + for child in children: + last = len(buf) + distance = offset - offsets[id(child)] + assert distance > 0 and distance < (1 << 21) + + if distance < (1 << 6): + # A 6-bit offset: "s0xxxxxx" + buf.append(distance) + elif distance < (1 << 13): + # A 13-bit offset: "s10xxxxxxxxxxxxx" + buf.append(0x40 | (distance >> 8)) + buf.append(distance & 0xFF) + else: + # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx" + buf.append(0x60 | (distance >> 16)) + buf.append((distance >> 8) & 0xFF) + buf.append(distance & 0xFF) + # Distance in first link is relative to following record. + # Distance in other links are relative to previous link. + offset -= distance + if len(buf) == guess: + break + guess = len(buf) + # Set most significant bit to mark end of links in this node. + buf[last] |= (1 << 7) + buf.reverse() + return buf + + +def encode_prefix(label): + """Encodes a node label as a list of bytes without a trailing high byte. + + This method encodes a node if there is exactly one child and the + child follows immediately after so that no jump is needed. This label + will then be a prefix to the label in the child node. + """ + assert label + return [c for c in bytearray(reversed(label))] + + +def encode_label(label): + """Encodes a node label as a list of bytes with a trailing high byte >0x80. + """ + buf = encode_prefix(label) + # Set most significant bit to mark end of label in this node. + buf[0] |= (1 << 7) + return buf + + +def encode(dafsa, utf_mode): + """Encodes a DAFSA to a list of bytes""" + output = [] + offsets = {} + + for node in reversed(top_sort(dafsa)): + if (len(node[1]) == 1 and node[1][0] and + (offsets[id(node[1][0])] == len(output))): + output.extend(encode_prefix(node[0])) + else: + output.extend(encode_links(node[1], offsets, len(output))) + output.extend(encode_label(node[0])) + offsets[id(node)] = len(output) + + output.extend(encode_links(dafsa, offsets, len(output))) + output.reverse() + if utf_mode: + output.append(0x01) + return output + + +def to_cxx(data, codecs): + """Generates C++ code from a list of encoded bytes.""" + text = b'/* This file has been generated by psl-make-dafsa. DO NOT EDIT!\n\n' + text += b'The byte array encodes effective tld names. See psl-make-dafsa source for' + text += b' documentation.' + text += b'*/\n\n' + text += b'static constexpr unsigned char kDafsa[' + text += bytes(str(len(data)), **codecs) + text += b'] = {\n' + for i in range(0, len(data), 12): + text += b' ' + text += bytes(', '.join('0x%02x' % byte for byte in data[i:i + 12]), **codecs) + text += b',\n' + text += b'};\n' + return text + +def sha1_file(name): + sha1 = hashlib.sha1() + with open(name, 'rb') as f: + while True: + data = f.read(65536) + if not data: + break + sha1.update(data) + return sha1.hexdigest() + +def to_cxx_plus(data, codecs): + """Generates C++ code from a word list plus some variable assignments as needed by libpsl""" + text = to_cxx(data, codecs) + text += b'static time_t _psl_file_time = %d;\n' % os.stat(psl_input_file).st_mtime + text += b'static int _psl_nsuffixes = %d;\n' % psl_nsuffixes + text += b'static int _psl_nexceptions = %d;\n' % psl_nexceptions + text += b'static int _psl_nwildcards = %d;\n' % psl_nwildcards + text += b'static constexpr char _psl_sha1_checksum[] = "%s";\n' % bytes(sha1_file(psl_input_file), **codecs) + text += b'static constexpr char _psl_filename[] = "%s";\n' % bytes(psl_input_file, **codecs) + return text + +def words_to_whatever(words, converter, utf_mode, codecs): + """Generates C++ code from a word list""" + dafsa = to_dafsa(words, utf_mode) + for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): + dafsa = fun(dafsa) + return converter(encode(dafsa, utf_mode), codecs) + + +def words_to_cxx(words, utf_mode, codecs): + """Generates C++ code from a word list""" + return words_to_whatever(words, to_cxx, utf_mode, codecs) + +def words_to_cxx_plus(words, utf_mode, codecs): + """Generates C++ code from a word list plus some variable assignments as needed by libpsl""" + return words_to_whatever(words, to_cxx_plus, utf_mode, codecs) + +def words_to_binary(words, utf_mode, codecs): + """Generates C++ code from a word list""" + return b'.DAFSA@PSL_0 \n' + words_to_whatever(words, lambda x, _: bytearray(x), utf_mode, codecs) + + +def parse_psl(infile, utf_mode, codecs): + """Parses PSL file and extract strings and return code""" + PSL_FLAG_EXCEPTION = (1<<0) + PSL_FLAG_WILDCARD = (1<<1) + PSL_FLAG_ICANN = (1<<2) # entry of ICANN section + PSL_FLAG_PRIVATE = (1<<3) # entry of PRIVATE section + PSL_FLAG_PLAIN = (1<<4) #just used for PSL syntax checking + + global psl_nsuffixes, psl_nexceptions, psl_nwildcards + + psl = {} + section = 0 + + for line in infile: + line = bytes(line.strip(), **codecs) + if not line: + continue + + if line.startswith(b'//'): + if section == 0: + if b'===BEGIN ICANN DOMAINS===' in line: + section = PSL_FLAG_ICANN + elif b'===BEGIN PRIVATE DOMAINS===' in line: + section = PSL_FLAG_PRIVATE + elif section == PSL_FLAG_ICANN and b'===END ICANN DOMAINS===' in line: + section = 0 + elif section == PSL_FLAG_PRIVATE and b'===END PRIVATE DOMAINS===' in line: + section = 0 + continue # skip comments + + if line[:1] == b'!': + psl_nexceptions += 1 + flags = PSL_FLAG_EXCEPTION | section + line = line[1:] + elif line[:1] == b'*': + if line[1:2] != b'.': + print('Unsupported kind of rule (ignored): %s' % line) + continue + psl_nwildcards += 1 + psl_nsuffixes += 1 + flags = PSL_FLAG_WILDCARD | PSL_FLAG_PLAIN | section + line = line[2:] + else: + psl_nsuffixes += 1 + flags = PSL_FLAG_PLAIN | section + + punycode = line.decode('utf-8').encode('idna') + + if punycode in psl: + """Found existing entry: + Combination of exception and plain rule is ambiguous + !foo.bar + foo.bar + + Allowed: + !foo.bar + *.foo.bar + foo.bar + *.foo.bar + """ + print('Found %s/%X (now %X)' % punycode, psl[punycode], flags) + continue + + if utf_mode: + psl[line] = flags + psl[punycode] = flags + +# with open("psl.out", 'w') as outfile: +# for (domain, flags) in sorted(psl.iteritems()): +# outfile.write(domain + "%X" % (flags & 0x0F) + "\n") + + return [domain + bytes('%X' % (flags & 0x0F), **codecs) for (domain, flags) in sorted(psl.items())] + + +def usage(): + """Prints the usage""" + print('usage: %s [options] infile outfile' % sys.argv[0]) + print(' --output-format=cxx Write DAFSA as C/C++ code (default)') + print(' --output-format=cxx+ Write DAFSA as C/C++ code plus statistical assignments') + print(' --output-format=binary Write DAFSA binary data') + print(' --encoding=ascii 7-bit ASCII mode') + print(' --encoding=utf-8 UTF-8 mode (default)') + exit(1) + + +def main(): + """Convert PSL file into C or binary DAFSA file""" + if len(sys.argv) < 3: + usage() + + converter = words_to_cxx + parser = parse_psl + utf_mode = True + + codecs = dict() + if sys.version_info.major > 2: + codecs['encoding'] = 'utf-8' + + for arg in sys.argv[1:-2]: + # Check --input-format for backward compatibility + if arg.startswith('--input-format='): + value = arg[15:].lower() + if value == 'psl': + parser = parse_psl + else: + print("Unknown input format '%s'" % value) + return 1 + elif arg.startswith('--output-format='): + value = arg[16:].lower() + if value == 'binary': + converter = words_to_binary + elif value == 'cxx': + converter = words_to_cxx + elif value == 'cxx+': + converter = words_to_cxx_plus + else: + print("Unknown output format '%s'" % value) + return 1 + elif arg.startswith('--encoding='): + value = arg[11:].lower() + if value == 'ascii': + utf_mode = False + elif value == 'utf-8': + utf_mode = True + else: + print("Unknown encoding '%s'" % value) + return 1 + else: + usage() + + if sys.argv[-2] == '-': + with open(sys.argv[-1], 'wb') as outfile: + outfile.write(converter(parser(sys.stdin, utf_mode, codecs), utf_mode, codecs)) + else: + """Some statistical data for --output-format=cxx+""" + global psl_input_file, psl_nsuffixes, psl_nexceptions, psl_nwildcards + + psl_input_file = sys.argv[-2] + psl_nsuffixes = 0 + psl_nexceptions = 0 + psl_nwildcards = 0 + + with open(sys.argv[-2], 'r', **codecs) as infile, open(sys.argv[-1], 'wb') as outfile: + outfile.write(converter(parser(infile, utf_mode, codecs), utf_mode, codecs)) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) |