diff options
Diffstat (limited to 'chromium/mojo/public/tools/bindings/pylib/mojom/parse/lexer.py')
-rw-r--r-- | chromium/mojo/public/tools/bindings/pylib/mojom/parse/lexer.py | 277 |
1 file changed, 277 insertions, 0 deletions
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Lexer for the mojom IDL, built on PLY (Python Lex-Yacc). The Lexer class
# below is meant to be handed to ply.lex.lex(object=...), which discovers the
# token rules from the |tokens| tuple and the t_*-named attributes/methods.

import imp
import os.path
import sys

# Disable lint check for finding modules:
# pylint: disable=F0401

def _GetDirAbove(dirname):
  """Returns the directory "above" this file containing |dirname| (which must
  also be "above" this file)."""
  path = os.path.abspath(__file__)
  while True:
    path, tail = os.path.split(path)
    # If |tail| is ever empty we walked off the filesystem root without
    # finding |dirname|, violating the documented precondition.
    assert tail
    if tail == dirname:
      return path

# Prefer an installed copy of ply; fall back to the copy bundled in the
# Chromium tree under third_party.
try:
  imp.find_module("ply")
except ImportError:
  sys.path.append(os.path.join(_GetDirAbove("mojo"), "third_party"))
from ply.lex import TOKEN

from ..error import Error


# Disable lint check for exceptions deriving from Exception:
# pylint: disable=W0710
class LexError(Error):
  """Class for errors from the lexer."""

  def __init__(self, filename, message, lineno):
    Error.__init__(self, filename, message, lineno=lineno)


# We have methods which look like they could be functions:
# pylint: disable=R0201
class Lexer(object):
  """Tokenizer for .mojom source text.

  NOTE: PLY matches function-based rules in definition order and sorts
  string-based rules by decreasing regex length, so the ordering of the
  t_* rules below is significant (see the inline comments).
  """

  def __init__(self, filename):
    # |filename| is kept only for error reporting via _error().
    self.filename = filename

  ######################-- PRIVATE --######################

  ##
  ## Internal auxiliary methods
  ##
  def _error(self, msg, token):
    # Raises a LexError carrying |msg| and the offending token's line number.
    raise LexError(self.filename, msg, token.lineno)

  ##
  ## Reserved keywords
  ##
  keywords = (
    'HANDLE',

    'IMPORT',
    'MODULE',
    'STRUCT',
    'INTERFACE',
    'ENUM',
    'CONST',
    'TRUE',
    'FALSE',
    'DEFAULT',
  )

  # Maps the source spelling (e.g. "struct") to its token type ("STRUCT") so
  # t_NAME can reclassify identifiers that are really keywords.
  keyword_map = {}
  for keyword in keywords:
    keyword_map[keyword.lower()] = keyword

  ##
  ## All the tokens recognized by the lexer
  ##
  tokens = keywords + (
    # Identifiers
    'NAME',

    # Constants
    'ORDINAL',
    'INT_CONST_DEC', 'INT_CONST_HEX',
    'FLOAT_CONST',
    'CHAR_CONST',

    # String literals
    'STRING_LITERAL',

    # Operators
    'MINUS',
    'PLUS',
    'AMP',

    # Assignment
    'EQUALS',

    # Request / response
    'RESPONSE',

    # Delimiters
    'LPAREN', 'RPAREN',          # ( )
    'LBRACKET', 'RBRACKET',      # [ ]
    'LBRACE', 'RBRACE',          # { }
    'LANGLE', 'RANGLE',          # < >
    'SEMI',                      # ;
    'COMMA', 'DOT'               # , .
  )

  ##
  ## Regexes for use in tokens
  ##

  # valid C identifiers (K&R2: A.2.3)
  identifier = r'[a-zA-Z_][0-9a-zA-Z_]*'

  hex_prefix = '0[xX]'
  hex_digits = '[0-9a-fA-F]+'

  # integer constants (K&R2: A.2.5.1)
  decimal_constant = '0|([1-9][0-9]*)'
  hex_constant = hex_prefix+hex_digits
  # Don't allow octal constants (even invalid octal).
  octal_constant_disallowed = '0[0-9]+'

  # character constants (K&R2: A.2.5.2)
  # Note: a-zA-Z and '.-~^_!=&;,' are allowed as escape chars to support #line
  # directives with Windows paths as filenames (..\..\dir\file)
  # For the same reason, decimal_escape allows all digit sequences. We want to
  # parse all correct code, even if it means to sometimes parse incorrect
  # code.
  #
  simple_escape = r"""([a-zA-Z._~!=&\^\-\\?'"])"""
  decimal_escape = r"""(\d+)"""
  hex_escape = r"""(x[0-9a-fA-F]+)"""
  bad_escape = r"""([\\][^a-zA-Z._~^!=&\^\-\\?'"x0-7])"""

  escape_sequence = \
      r"""(\\("""+simple_escape+'|'+decimal_escape+'|'+hex_escape+'))'
  cconst_char = r"""([^'\\\n]|"""+escape_sequence+')'
  char_const = "'"+cconst_char+"'"
  unmatched_quote = "('"+cconst_char+"*\\n)|('"+cconst_char+"*$)"
  bad_char_const = \
      r"""('"""+cconst_char+"""[^'\n]+')|('')|('"""+ \
      bad_escape+r"""[^'\n]*')"""

  # string literals (K&R2: A.2.6)
  string_char = r"""([^"\\\n]|"""+escape_sequence+')'
  string_literal = '"'+string_char+'*"'
  bad_string_literal = '"'+string_char+'*'+bad_escape+string_char+'*"'

  # floating constants (K&R2: A.2.5.3)
  exponent_part = r"""([eE][-+]?[0-9]+)"""
  fractional_constant = r"""([0-9]*\.[0-9]+)|([0-9]+\.)"""
  floating_constant = \
      '(((('+fractional_constant+')'+ \
      exponent_part+'?)|([0-9]+'+exponent_part+')))'

  # Ordinals
  ordinal = r'@[0-9]+'
  missing_ordinal_value = r'@'
  # Don't allow ordinal values in octal (even invalid octal, like 09) or
  # hexadecimal.
  octal_or_hex_ordinal_disallowed = r'@((0[0-9]+)|('+hex_prefix+hex_digits+'))'

  ##
  ## Rules for the normal state
  ##
  t_ignore = ' \t\r'

  # Newlines: no token is produced, but the line counter must be kept
  # current for error reporting.
  def t_NEWLINE(self, t):
    r'\n+'
    t.lexer.lineno += len(t.value)

  # Operators
  t_MINUS             = r'-'
  t_PLUS              = r'\+'
  t_AMP               = r'&'

  # =
  t_EQUALS            = r'='

  # =>
  t_RESPONSE          = r'=>'

  # Delimiters
  t_LPAREN            = r'\('
  t_RPAREN            = r'\)'
  t_LBRACKET          = r'\['
  t_RBRACKET          = r'\]'
  t_LBRACE            = r'\{'
  t_RBRACE            = r'\}'
  t_LANGLE            = r'<'
  t_RANGLE            = r'>'
  t_COMMA             = r','
  t_DOT               = r'\.'
  t_SEMI              = r';'

  t_STRING_LITERAL    = string_literal

  # The following floating and integer constants are defined as
  # functions to impose a strict order (otherwise, decimal
  # is placed before the others because its regex is longer,
  # and this is bad)
  #
  @TOKEN(floating_constant)
  def t_FLOAT_CONST(self, t):
    return t

  @TOKEN(hex_constant)
  def t_INT_CONST_HEX(self, t):
    return t

  # Must come before t_INT_CONST_DEC so that a leading-zero sequence is
  # reported as an error rather than lexed as "0" followed by digits.
  @TOKEN(octal_constant_disallowed)
  def t_OCTAL_CONSTANT_DISALLOWED(self, t):
    msg = "Octal values not allowed"
    self._error(msg, t)

  @TOKEN(decimal_constant)
  def t_INT_CONST_DEC(self, t):
    return t

  # Must come before bad_char_const, to prevent it from
  # catching valid char constants as invalid
  #
  @TOKEN(char_const)
  def t_CHAR_CONST(self, t):
    return t

  @TOKEN(unmatched_quote)
  def t_UNMATCHED_QUOTE(self, t):
    msg = "Unmatched '"
    self._error(msg, t)

  @TOKEN(bad_char_const)
  def t_BAD_CHAR_CONST(self, t):
    msg = "Invalid char constant %s" % t.value
    self._error(msg, t)

  # unmatched string literals are caught by the preprocessor

  @TOKEN(bad_string_literal)
  def t_BAD_STRING_LITERAL(self, t):
    msg = "String contains invalid escape code"
    self._error(msg, t)

  # Handle ordinal-related tokens in the right order:
  @TOKEN(octal_or_hex_ordinal_disallowed)
  def t_OCTAL_OR_HEX_ORDINAL_DISALLOWED(self, t):
    msg = "Octal and hexadecimal ordinal values not allowed"
    self._error(msg, t)

  @TOKEN(ordinal)
  def t_ORDINAL(self, t):
    return t

  # A bare '@' with no digits: must come after t_ORDINAL so complete
  # ordinals are not misreported.
  @TOKEN(missing_ordinal_value)
  def t_BAD_ORDINAL(self, t):
    msg = "Missing ordinal value"
    self._error(msg, t)

  # Identifiers; reclassified as keyword tokens via keyword_map when the
  # spelling matches a reserved word.
  @TOKEN(identifier)
  def t_NAME(self, t):
    t.type = self.keyword_map.get(t.value, "NAME")
    return t

  # Ignore C and C++ style comments (counting the newlines they span so
  # line numbers stay accurate).
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*(\n[ \t]*//.*)*)'
    t.lexer.lineno += t.value.count("\n")

  # Catch-all for any character no other rule matches.
  def t_error(self, t):
    msg = "Illegal character %s" % repr(t.value[0])
    self._error(msg, t)