# # sources.py # # Convert source code comments to multi-line blocks (library file). # # Copyright 2002-2004, 2006-2009, 2012-2014 by # David Turner. # # This file is part of the FreeType project, and may only be used, # modified, and distributed under the terms of the FreeType project # license, LICENSE.TXT. By continuing to use, modify, or distribute # this file you indicate that you have read the license and # understand and accept it fully. # # This library file contains definitions of classes needed to decompose C # source code files into a series of multi-line `blocks'. There are two # kinds of blocks. # # - Normal blocks, which contain source code or ordinary comments. # # - Documentation blocks, which have restricted formatting, and whose text # always start with a documentation markup tag like `', # `', etc. # # The routines to process the content of documentation blocks are contained # in file `content.py'; the classes and methods found here only deal with # text parsing and basic documentation block extraction. # import fileinput, re, sys, os, string ################################################################ ## ## SOURCE BLOCK FORMAT CLASS ## ## A simple class containing compiled regular expressions to detect ## potential documentation format block comments within C source code. ## ## The `column' pattern must contain a group to `unbox' the content of ## documentation comment blocks. ## ## Later on, paragraphs are converted to long lines, which simplifies the ## regular expressions that act upon the text. ## class SourceBlockFormat: def __init__( self, id, start, column, end ): """Create a block pattern, used to recognize special documentation blocks.""" self.id = id self.start = re.compile( start, re.VERBOSE ) self.column = re.compile( column, re.VERBOSE ) self.end = re.compile( end, re.VERBOSE ) # # Format 1 documentation comment blocks. # # /************************************/ (at least 2 asterisks) # /* */ # /* */ # /* */ # /************************************/ (at least 2 asterisks) # start = r''' \s* # any number of whitespace /\*{2,}/ # followed by '/' and at least two asterisks then '/' \s*$ # probably followed by whitespace ''' column = r''' \s* # any number of whitespace /\*{1} # followed by '/' and precisely one asterisk ([^*].*) # followed by anything (group 1) \*{1}/ # followed by one asterisk and a '/' \s*$ # probably followed by whitespace ''' re_source_block_format1 = SourceBlockFormat( 1, start, column, start ) # # Format 2 documentation comment blocks. # # /************************************ (at least 2 asterisks) # * # * (1 asterisk) # * # */ (1 or more asterisks) # start = r''' \s* # any number of whitespace /\*{2,} # followed by '/' and at least two asterisks \s*$ # probably followed by whitespace ''' column = r''' \s* # any number of whitespace \*{1}(?![*/]) # followed by precisely one asterisk not followed by `/' (.*) # then anything (group1) ''' end = r''' \s* # any number of whitespace \*+/ # followed by at least one asterisk, then '/' ''' re_source_block_format2 = SourceBlockFormat( 2, start, column, end ) # # The list of supported documentation block formats. We could add new ones # quite easily. # re_source_block_formats = [re_source_block_format1, re_source_block_format2] # # The following regular expressions correspond to markup tags within the # documentation comment blocks. They are equivalent despite their different # syntax. # # A markup tag consists of letters or character `-', to be found in group 1. # # Notice that a markup tag _must_ begin a new paragraph. # re_markup_tag1 = re.compile( r'''\s*<((?:\w|-)*)>''' ) # format re_markup_tag2 = re.compile( r'''\s*@((?:\w|-)*):''' ) # @xxxx: format # # The list of supported markup tags. We could add new ones quite easily. # re_markup_tags = [re_markup_tag1, re_markup_tag2] # # A regular expression to detect a cross reference, after markup tags have # been stripped off. Group 1 is the reference, group 2 the rest of the # line. # # A cross reference consists of letters, digits, or characters `-' and `_'. # re_crossref = re.compile( r'@((?:\w|-)*)(.*)' ) # @foo # # Two regular expressions to detect italic and bold markup, respectively. # Group 1 is the markup, group 2 the rest of the line. # # Note that the markup is limited to words consisting of letters, digits, # the character `_', or an apostrophe (but not as the first character). # re_italic = re.compile( r"_(\w(?:\w|')*)_(.*)" ) # _italic_ re_bold = re.compile( r"\*(\w(?:\w|')*)\*(.*)" ) # *bold* # # This regular expression code to identify an URL has been taken from # # http://mail.python.org/pipermail/tutor/2002-September/017228.html # # (with slight modifications). # urls = r'(?:https?|telnet|gopher|file|wais|ftp)' ltrs = r'\w' gunk = r'/#~:.?+=&%@!\-' punc = r'.:?\-' any = "%(ltrs)s%(gunk)s%(punc)s" % { 'ltrs' : ltrs, 'gunk' : gunk, 'punc' : punc } url = r""" ( \b # start at word boundary %(urls)s : # need resource and a colon [%(any)s] +? # followed by one or more of any valid # character, but be conservative and # take only what you need to... (?= # [look-ahead non-consumptive assertion] [%(punc)s]* # either 0 or more punctuation (?: # [non-grouping parentheses] [^%(any)s] | $ # followed by a non-url char # or end of the string ) ) ) """ % {'urls' : urls, 'any' : any, 'punc' : punc } re_url = re.compile( url, re.VERBOSE | re.MULTILINE ) # # A regular expression that stops collection of comments for the current # block. # re_source_sep = re.compile( r'\s*/\*\s*\*/' ) # /* */ # # A regular expression to find possible C identifiers while outputting # source code verbatim, covering things like `*foo' or `(bar'. Group 1 is # the prefix, group 2 the identifier -- since we scan lines from left to # right, sequentially splitting the source code into prefix and identifier # is fully sufficient for our purposes. # re_source_crossref = re.compile( r'(\W*)(\w*)' ) # # A regular expression that matches a list of reserved C source keywords. # re_source_keywords = re.compile( '''\\b ( typedef | struct | enum | union | const | char | int | short | long | void | signed | unsigned | \#include | \#define | \#undef | \#if | \#ifdef | \#ifndef | \#else | \#endif ) \\b''', re.VERBOSE ) ################################################################ ## ## SOURCE BLOCK CLASS ## ## There are two important fields in a `SourceBlock' object. ## ## self.lines ## A list of text lines for the corresponding block. ## ## self.content ## For documentation comment blocks only, this is the block content ## that has been `unboxed' from its decoration. This is `None' for all ## other blocks (i.e., sources or ordinary comments with no starting ## markup tag) ## class SourceBlock: def __init__( self, processor, filename, lineno, lines ): self.processor = processor self.filename = filename self.lineno = lineno self.lines = lines[:] self.format = processor.format self.content = [] if self.format == None: return words = [] # extract comment lines lines = [] for line0 in self.lines: m = self.format.column.match( line0 ) if m: lines.append( m.group( 1 ) ) # now, look for a markup tag for l in lines: l = string.strip( l ) if len( l ) > 0: for tag in re_markup_tags: if tag.match( l ): self.content = lines return def location( self ): return "(" + self.filename + ":" + repr( self.lineno ) + ")" # debugging only -- not used in normal operations def dump( self ): if self.content: print "{{{content start---" for l in self.content: print l print "---content end}}}" return fmt = "" if self.format: fmt = repr( self.format.id ) + " " for line in self.lines: print line ################################################################ ## ## SOURCE PROCESSOR CLASS ## ## The `SourceProcessor' is in charge of reading a C source file and ## decomposing it into a series of different `SourceBlock' objects. ## ## A SourceBlock object consists of the following data. ## ## - A documentation comment block using one of the layouts above. Its ## exact format will be discussed later. ## ## - Normal sources lines, including comments. ## ## class SourceProcessor: def __init__( self ): """Initialize a source processor.""" self.blocks = [] self.filename = None self.format = None self.lines = [] def reset( self ): """Reset a block processor and clean up all its blocks.""" self.blocks = [] self.format = None def parse_file( self, filename ): """Parse a C source file and add its blocks to the processor's list.""" self.reset() self.filename = filename fileinput.close() self.format = None self.lineno = 0 self.lines = [] for line in fileinput.input( filename ): # strip trailing newlines, important on Windows machines! if line[-1] == '\012': line = line[0:-1] if self.format == None: self.process_normal_line( line ) else: if self.format.end.match( line ): # A normal block end. Add it to `lines' and create a # new block self.lines.append( line ) self.add_block_lines() elif self.format.column.match( line ): # A normal column line. Add it to `lines'. self.lines.append( line ) else: # An unexpected block end. Create a new block, but # don't process the line. self.add_block_lines() # we need to process the line again self.process_normal_line( line ) # record the last lines self.add_block_lines() def process_normal_line( self, line ): """Process a normal line and check whether it is the start of a new block.""" for f in re_source_block_formats: if f.start.match( line ): self.add_block_lines() self.format = f self.lineno = fileinput.filelineno() self.lines.append( line ) def add_block_lines( self ): """Add the current accumulated lines and create a new block.""" if self.lines != []: block = SourceBlock( self, self.filename, self.lineno, self.lines ) self.blocks.append( block ) self.format = None self.lines = [] # debugging only, not used in normal operations def dump( self ): """Print all blocks in a processor.""" for b in self.blocks: b.dump() # eof