diff options
Diffstat (limited to 'webapp/codereview/intra_region_diff.py')
-rw-r--r-- | webapp/codereview/intra_region_diff.py | 699 |
1 files changed, 699 insertions, 0 deletions
diff --git a/webapp/codereview/intra_region_diff.py b/webapp/codereview/intra_region_diff.py new file mode 100644 index 0000000000..3383c708e8 --- /dev/null +++ b/webapp/codereview/intra_region_diff.py @@ -0,0 +1,699 @@ +# Copyright 2008 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Intra-region diff utilities. + +Intra-region diff highlights the blocks of code which have been changed or +deleted within a region. So instead of highlighting the whole region marked as +changed, the user can see what exactly was changed within that region. + +Terminology: + 'region' is a list of consecutive code lines. + 'word' is the unit of intra-region diff. Its definition is arbitrary based on + what we think as to be a good unit of difference between two regions. + 'block' is a small section of code within a region. It can span multiple + lines. There can be multiple non overlapping blocks within a region. A block + can potentially span the whole region. + +The blocks have two representations. One is of the format (offset1, offset2, +size) which is returned by the SequenceMatcher to indicate a match of +length 'size' starting at offset1 in the first/old line and starting at offset2 +in the second/new line. We convert this representation to a pair of tuples i.e. +(offset1, size) and (offset2, size) for rendering each side of the diff +separately. This latter representation is also more efficient for doing +compaction of adjacent blocks which reduces the size of the HTML markup. See +CompactBlocks for more details. + +SequenceMatcher always returns one special matching block at the end with +contents (len(line1), len(line2), 0). We retain this special block as it +simplifies for loops in rendering the last non-matching block. All functions +which deal with the sequence of blocks assume presence of the special block at +the end of the sequence and retain it. +""" + +import cgi +import difflib +import re + +# Tag to begin a diff chunk. +BEGIN_TAG = "<span class=\"%s\">" +# Tag to end a diff block. +END_TAG = "</span>" +# Tag used for visual tab indication +TAB_TAG = ("<span class=\"visualtab\" title=\"Visual tab indicator. " + "Change settings above to hide.\">»</span>") +# Color scheme to govern the display properties of diff blocks and matching +# blocks. Each value e.g. 'oldlight' corresponds to a CSS style. +COLOR_SCHEME = { + 'old': { + 'match': 'oldlight', + 'diff': 'olddark', + 'bckgrnd': 'oldlight', + }, + 'new': { + 'match': 'newlight', + 'diff': 'newdark', + 'bckgrnd': 'newlight', + }, + 'oldmove': { + 'match': 'movelight', + 'diff': 'oldmovedark', + 'bckgrnd': 'movelight' + }, + 'newmove': { + 'match': 'newlight', + 'diff': 'newdark', + 'bckgrnd': 'newlight' + }, +} +# Regular expressions to tokenize lines. Default is 'b'. +EXPRS = { + 'a': r'(\w+|[^\w\s]+|\s+)', + 'b': r'([A-Za-z0-9]+|[^A-Za-z0-9])', + 'c': r'([A-Za-z0-9_]+|[^A-Za-z0-9_])', + } +# Maximum total characters in old and new lines for doing intra-region diffs. +# Intra-region diff for larger regions is hard to comprehend and wastes CPU +# time. +MAX_TOTAL_LEN = 10000 + + +def ExpandTabs(text, tabsize=8, tab_marker=None): + """Expand tab characters in a string into spaces with an optional marker. + + Args: + text: a string containing tab characters. + tabsize: the number of spaces that a tab represents + tab_marker: a character; if not None, we replace the first character + of each tab expansion with this. + """ + tabpos = text.find("\t") + while tabpos >= 0: + fillwidth = tabsize - (tabpos % tabsize) + if fillwidth == 0: + fillwidth = tabsize + if tab_marker: + fill = tab_marker + " " * (fillwidth - 1) + else: + fill = " " * fillwidth + # We avoid str.replace in case tab_marker is \t + text = text[:tabpos] + fill + text[tabpos+1:] + tabpos = text.find("\t", tabpos + 1) + return text + + +def Fold(text, limit=85, indent=5, offset=0, tabsize=8, mark_tabs=False): + """Break a long string into multiple lines. + + Lines longer than 'limit' are broken up into pieces of at most + 'limit' characters; continuation lines start with 'indent' spaces. + + 'offset' is used to indicate if 'text' itself doesn't align with + the beginning of line e.g. we are trying to Fold a line when we have + already printed 'offset' number of characters to the output. + + This also translates tabs into 'tabsize' spaces. If 'mark_tabs' is true, + then we indicate the first character of each expanded tab visually. + + Input and output are assumed to be in UTF-8; the computation is done + in Unicode. (Still not good enough if zero-width characters are + present.) If the input is not valid UTF-8, then the encoding is + passed through, potentially breaking up multi-byte characters. + We pass the line through cgi.escape before returning it. + + A trailing newline is always stripped from the input first. + """ + assert tabsize > 0, tabsize + if text.endswith("\n"): + text = text[:-1] + try: + text = unicode(text, "utf-8") + except: + pass + if "\t" in text: + # If mark_tabs is true, we retain one \t character as a marker during + # expansion so that we later replace it with an HTML snippet. + tab_marker = mark_tabs and "\t" or None + rest = text[indent-offset:] + text = text[:indent-offset] + ExpandTabs(rest, tabsize, tab_marker) + # Perform wrapping. + if len(text) > limit - offset: + parts = [] + prefix = "" + i = 0 + j = limit - offset + while i < len(text): + parts.append(prefix + text[i:j]) + i = j + j += limit - indent + prefix = " " * indent + text = "\n".join(parts) + # Colorize tab markers (after calling escape) + text = cgi.escape(text) + text = text.replace("\t", TAB_TAG) + if isinstance(text, unicode): + return text.encode("utf-8", "replace") + return text + + +def CompactBlocks(blocks): + """Compacts adjacent code blocks. + + In many cases 2 adjacent blocks can be merged into one. This allows + to do some further processing on those blocks. + + Args: + blocks: [(offset1, size), ...] + + Returns: + A list with the same structure as the input with adjacent blocks + merged. However, the last block (which is always assumed to have + a zero size) is never merged. For example, the input + [(0, 2), (2, 8), (10, 5), (15, 0)] + will produce the output [(0, 15), (15, 0)]. + """ + if len(blocks) == 1: + return blocks + result = [blocks[0]] + for block in blocks[1:-1]: + last_start, last_len = result[-1] + curr_start, curr_len = block + if last_start + last_len == curr_start: + result[-1] = last_start, last_len + curr_len + else: + result.append(block) + result.append(blocks[-1]) + return result + + +def FilterBlocks(blocks, filter_func): + """Gets rid of any blocks if filter_func evaluates false for them. + + Args: + blocks: [(offset1, offset2, size), ...]; must have at least 1 entry + filter_func: a boolean function taking a single argument of the form + (offset1, offset2, size) + + Returns: + A list with the same structure with entries for which filter_func() + returns false removed. However, the last block is always included. + """ + # We retain the 'special' block at the end. + res = [b for b in blocks[:-1] if filter_func(b)] + res.append(blocks[-1]) + return res + + +def GetDiffParams(expr='b', min_match_ratio=0.6, min_match_size=2, dbg=False): + """Returns a tuple of various parameters which affect intra region diffs. + + Args: + expr: regular expression id to use to identify 'words' in the intra region + diff + min_match_ratio: minimum similarity between regions to qualify for intra + region diff + min_match_size: the smallest matching block size to use. Blocks smaller + than this are ignored. + dbg: to turn on generation of debugging information for the diff + + Returns: + 4 tuple (expr, min_match_ratio, min_match_size, dbg) that can be used to + customize diff. It can be passed to functions like WordDiff and + IntraLineDiff. + """ + assert expr in EXPRS + assert min_match_size in xrange(1,5) + assert min_match_ratio > 0.0 and min_match_ratio < 1.0 + return (expr, min_match_ratio, min_match_size, dbg) + + +def CanDoIRDiff(old_lines, new_lines): + """Tells if it would be worth computing the intra region diff. + + Calculating IR diff is costly and is usually helpful only for small regions. + We use a heuristic that if the total number of characters is more than a + certain threshold then we assume it is not worth computing the IR diff. + + Args: + old_lines: an array of strings containing old text + new_lines: an array of strings containing new text + + Returns: + True if we think it is worth computing IR diff for the region defined + by old_lines and new_lines, False otherwise. + + TODO: Let GetDiffParams handle MAX_TOTAL_LEN param also. + """ + total_chars = (sum(len(line) for line in old_lines) + + sum(len(line) for line in new_lines)) + return total_chars <= MAX_TOTAL_LEN + + +def WordDiff(line1, line2, diff_params): + """Returns blocks with positions indiciating word level diffs. + + Args: + line1: string representing the left part of the diff + line2: string representing the right part of the diff + diff_params: return value of GetDiffParams + + Returns: + A tuple (blocks, ratio) where: + blocks: [(offset1, offset2, size), ...] such that + line1[offset1:offset1+size] == line2[offset2:offset2+size] + and the last block is always (len(line1), len(line2), 0) + ratio: a float giving the diff ratio computed by SequenceMatcher. + """ + match_expr, min_match_ratio, min_match_size, dbg = diff_params + exp = EXPRS[match_expr] + # We want to split at proper character boundaries in UTF8 text. + try: + line1_u = unicode(line1, "utf8") + except: + line1_u = line1 + try: + line2_u = unicode(line2, "utf8") + except: + line2_u = line2 + def _ToUTF8(s): + if isinstance(s, unicode): + return s.encode("utf8") + return s + a = map(_ToUTF8, re.findall(exp, line1_u, re.U)) + b = map(_ToUTF8, re.findall(exp, line2_u, re.U)) + s = difflib.SequenceMatcher(None, a, b) + matching_blocks = s.get_matching_blocks() + ratio = s.ratio() + # Don't show intra region diffs if both lines are too different and there is + # more than one block of difference. If there is only one change then we + # still show the intra region diff regardless of how different the blocks + # are. + # Note: We compare len(matching_blocks) with 3 because one block of change + # results in 2 matching blocks. We add the one special block and we get 3 + # matching blocks per one block of change. + if ratio < min_match_ratio and len(matching_blocks) > 3: + return ([(0, 0, 0)], ratio) + # For now convert to character level blocks because we already have + # the code to deal with folding across lines for character blocks. + # Create arrays lena an lenb which have cumulative word lengths + # corresponding to word positions in a and b + lena = [] + last = 0 + for w in a: + lena.append(last) + last += len(w) + lenb = [] + last = 0 + for w in b: + lenb.append(last) + last += len(w) + lena.append(len(line1)) + lenb.append(len(line2)) + # Convert to character blocks + blocks = [] + for s1, s2, blen in matching_blocks[:-1]: + apos = lena[s1] + bpos = lenb[s2] + block_len = lena[s1+blen] - apos + blocks.append((apos, bpos, block_len)) + # Recreate the special block. + blocks.append((len(line1), len(line2), 0)) + # Filter any matching blocks which are smaller than the desired threshold. + # We don't remove matching blocks with only a newline character as doing so + # results in showing the matching newline character as non matching which + # doesn't look good. + blocks = FilterBlocks(blocks, lambda b: (b[2] >= min_match_size or + line1[b[0]:b[0]+b[2]] == '\n')) + return (blocks, ratio) + + +def IntraLineDiff(line1, line2, diff_params, diff_func=WordDiff): + """Computes intraline diff blocks. + + Args: + line1: string representing the left part of the diff + line2: string representing the right part of the diff + diff_params: return value of GetDiffParams + diff_func: a function whose signature matches that of WordDiff() above + + Returns: + A tuple of (blocks1, blocks2) corresponding to line1 and line2. + Each element of the tuple is an array of (start_pos, length) + tuples denoting a diff block. + """ + blocks, ratio = diff_func(line1, line2, diff_params) + blocks1 = [(start1, length) for (start1, start2, length) in blocks] + blocks2 = [(start2, length) for (start1, start2, length) in blocks] + + return (blocks1, blocks2, ratio) + + +def DumpDiff(blocks, line1, line2): + """Helper function to debug diff related problems. + + Args: + blocks: [(offset1, offset2, size), ...] + line1: string representing the left part of the diff + line2: string representing the right part of the diff + """ + for offset1, offset2, size in blocks: + print offset1, offset2, size + print offset1, size, ": ", line1[offset1:offset1+size] + print offset2, size, ": ", line2[offset2:offset2+size] + + +def RenderIntraLineDiff(blocks, line, tag, dbg_info=None, limit=80, indent=5, + tabsize=8, mark_tabs=False): + """Renders the diff blocks returned by IntraLineDiff function. + + Args: + blocks: [(start_pos, size), ...] + line: line of code on which the blocks are to be rendered. + tag: 'new' or 'old' to control the color scheme. + dbg_info: a string that holds debugging informaion header. Debug + information is rendered only if dbg_info is not None. + limit: folding limit to be passed to the Fold function. + indent: indentation size to be passed to the Fold function. + tabsize: the number of spaces that a tab represents + mark_tabs: if True, mark the first character of each expanded tab visually + + Returns: + A tuple of two elements. First element is the rendered version of + the input 'line'. Second element tells if the line has a matching + newline character. + """ + res = "" + prev_start, prev_len = 0, 0 + has_newline = False + debug_info = dbg_info + if dbg_info: + debug_info += "\nBlock Count: %d\nBlocks: " % (len(blocks) - 1) + for curr_start, curr_len in blocks: + if dbg_info and curr_len > 0: + debug_info += Fold("\n(%d, %d):|%s|" % + (curr_start, curr_len, + line[curr_start:curr_start+curr_len]), + limit, indent, tabsize, mark_tabs) + res += FoldBlock(line, prev_start + prev_len, curr_start, limit, indent, + tag, 'diff', tabsize, mark_tabs) + res += FoldBlock(line, curr_start, curr_start + curr_len, limit, indent, + tag, 'match', tabsize, mark_tabs) + # TODO: This test should be out of loop rather than inside. Once we + # filter out some junk from blocks (e.g. some empty blocks) we should do + # this test only on the last matching block. + if line[curr_start:curr_start+curr_len].endswith('\n'): + has_newline = True + prev_start, prev_len = curr_start, curr_len + return (res, has_newline, debug_info) + + +def FoldBlock(src, start, end, limit, indent, tag, btype, tabsize=8, + mark_tabs=False): + """Folds and renders a block. + + Args: + src: line of code + start: starting position of the block within 'src'. + end: ending position of the block within 'src'. + limit: folding limit + indent: indentation to use for folding. + tag: 'new' or 'old' to control the color scheme. + btype: block type i.e. 'match' or 'diff' to control the color schme. + tabsize: the number of spaces that a tab represents + mark_tabs: if True, mark the first character of each expanded tab visually + + Returns: + A string represeting the rendered block. + """ + text = src[start:end] + # We ignore newlines because we do newline management ourselves. + # Any other new lines with at the end will be stripped off by the Fold + # method. + if start >= end or text == '\n': + return "" + fbegin, lend, nl_plus_indent = GetTags(tag, btype, indent) + # 'bol' is beginning of line + offset_from_bol = start % limit + res = "" + # If this is the first block of the line and this is not the first line then + # insert newline + indent. This special case is not dealt with in the for + # loop below. + if offset_from_bol == 0 and not start == 0: + res = nl_plus_indent + text = Fold(text, limit, 0, offset_from_bol, tabsize, mark_tabs) + folded_lines = text.split("\n") + for (j, l) in enumerate(folded_lines): + if l: + res += (fbegin + l + lend) + # Add new line plus indent except for the last line. + if j < len(folded_lines) - 1: + res += nl_plus_indent + return res + + +def GetTags(tag, btype, indent): + """Returns various tags for rendering diff blocks. + + Args: + tag: a key from COLOR_SCHEME + btype: 'match' or 'diff' + indent: indentation to use + Returns + A 3 tuple (begin_tag, end_tag, formatted_indent_block) + """ + assert tag in COLOR_SCHEME + assert btype in ['match', 'diff'] + fbegin = BEGIN_TAG % COLOR_SCHEME[tag][btype] + bbegin = BEGIN_TAG % COLOR_SCHEME[tag]['bckgrnd'] + lend = END_TAG + nl_plus_indent = '\n' + if indent > 0: + nl_plus_indent += bbegin + cgi.escape(" "*indent) + lend + return fbegin, lend, nl_plus_indent + + +def ConvertToSingleLine(lines): + """Transforms a sequence of strings into a single line. + + Returns the state that can be used to reconstruct the original lines with + the newline separators placed at the original place. + + Args: + lines: sequence of strings + + Returns: + Returns (single_line, state) tuple. 'state' shouldn't be modified by the + caller. It is only used to pass to other functions which will do certain + operations on this state. + + 'state' is an array containing a dictionary for each item in lines. Each + dictionary has two elements 'pos' and 'blocks'. 'pos' is the end position + of each line in the final converted string. 'blocks' is an array of blocks + for each line of code. These blocks are added using MarkBlock function. + """ + state = [] + total_length = 0 + for l in lines: + total_length += len(l) + # TODO: Use a tuple instead. + state.append({ 'pos': total_length, # the line split point + 'blocks': [] # blocks which belong to this line + }) + result = "".join(lines) + assert len(state) == len(lines) + return (result, state) + + +def MarkBlock(state, begin, end): + """Marks a block on a region such that it doesn't cross line boundaries. + + It is an operation that can be performed on the single line which was + returned by the ConvertToSingleLine function. This operation marks arbitrary + block [begin,end) on the text. It also ensures that if [begin,end) crosses + line boundaries in the original region then it splits the section up in 2 or + more blocks such that no block crosses the boundaries. + + Args: + state: the state returned by ConvertToSingleLine function. The state + contained is modified by this function. + begin: Beginning of the block. + end: End of the block (exclusive). + + Returns: + None. + """ + # TODO: Make sure already existing blocks don't overlap + if begin == end: + return + last_pos = 0 + for entry in state: + pos = entry['pos'] + if begin >= last_pos and begin < pos: + if end < pos: + # block doesn't cross any line boundary + entry['blocks'].append((begin, end)) + else: + # block crosses the line boundary + entry['blocks'].append((begin, pos)) + MarkBlock(state, pos, end) + break + last_pos = pos + + +def GetBlocks(state): + """Returns all the blocks corresponding to the lines in the region. + + Args: + state: the state returned by ConvertToSingleLine(). + + Returns: + An array of [(start_pos, length), ..] with an entry for each line in the + region. + """ + result = [] + last_pos = 0 + for entry in state: + pos = entry['pos'] + # Calculate block start points from the beginning of individual lines. + blocks = [(s[0]-last_pos, s[1]-s[0]) for s in entry['blocks']] + # Add one end marker block. + blocks.append((pos-last_pos, 0)) + result.append(blocks) + last_pos = pos + return result + + +def IntraRegionDiff(old_lines, new_lines, diff_params): + """Computes intra region diff. + + Args: + old_lines: array of strings + new_lines: array of strings + diff_params: return value of GetDiffParams + + Returns: + A tuple (old_blocks, new_blocks) containing matching blocks for old and new + lines. + """ + old_line, old_state = ConvertToSingleLine(old_lines) + new_line, new_state = ConvertToSingleLine(new_lines) + old_blocks, new_blocks, ratio = IntraLineDiff(old_line, new_line, diff_params) + for begin, length in old_blocks: + MarkBlock(old_state, begin, begin+length) + old_blocks = GetBlocks(old_state) + + for begin, length in new_blocks: + MarkBlock(new_state, begin, begin+length) + new_blocks = GetBlocks(new_state) + + return (old_blocks, new_blocks, ratio) + + +def NormalizeBlocks(blocks, line): + """Normalizes block representation of an intra line diff. + + One diff can have multiple representations. Some times the diff returned by + the difflib for similar text sections is different even within same region. + For example if 2 already indented lines were indented with one additional + space character, the difflib may return the non matching space character to + be any of the already existing spaces. So one line may show non matching + space character as the first space character and the second line may show it + to be the last space character. This is sometimes confusing. This is the + side effect of the new regular expression we are using in WordDiff for + identifying indvidual words. This regular expression ('b') treats a sequence + of punctuation and whitespace characters as individual characters. It has + some visual advantages for showing a character level punctuation change as + one character change rather than a group of character change. + + Making the normalization too generic can have performance implications. So + this implementation of normalize blocks intends to handle only one case. + Let's say S represents the space character and () marks a matching block. + Then the normalize operation will do following: + + SSSS(SS)(ABCD) => SSSS(SS)(ABCD) + (SS)SSSS(ABCD) => SSSS(SS)(ABCD) + (SSSS)SS(ABCD) => SS(SSSS)(ABCD) + + and so on.. + + Args: + blocks: An array of (offset, len) tuples defined on 'line'. These blocks + mark the matching areas. Anything between these matching blocks is + considered non-matching. + line: The text string on which the blocks are defined. + + Returns: + An array of (offset, len) tuples representing the same diff but in + normalized form. + """ + result = [] + prev_start, prev_len = blocks[0] + for curr_start, curr_len in blocks[1:]: + # Note: nm_ is a prefix for non matching and m_ is a prefix for matching. + m_len, nm_len = prev_len, curr_start - (prev_start+prev_len) + # This if condition checks if matching and non matching parts are greater + # than zero length and are comprised of spaces ONLY. The last condition + # deals with most of the observed cases of strange diffs. + # Note: curr_start - prev_start == m_l + nm_l + # So line[prev_start:curr_start] == matching_part + non_matching_part. + text = line[prev_start:curr_start] + if m_len > 0 and nm_len > 0 and text == ' ' * len(text): + # Move the matching block towards the end i.e. normalize. + result.append((prev_start + nm_len, m_len)) + else: + # Keep the existing matching block. + result.append((prev_start, prev_len)) + prev_start, prev_len = curr_start, curr_len + result.append(blocks[-1]) + assert len(result) == len(blocks) + return result + + +def RenderIntraRegionDiff(lines, diff_blocks, tag, ratio, limit=80, indent=5, + tabsize=8, mark_tabs=False, dbg=False): + """Renders intra region diff for one side. + + Args: + lines: list of strings representing source code in the region + diff_blocks: blocks that were returned for this region by IntraRegionDiff() + tag: 'new' or 'old' + ratio: similarity ratio returned by the diff computing function + limit: folding limit + indent: indentation size + tabsize: the number of spaces that a tab represents + mark_tabs: if True, mark the first character of each expanded tab visually + dbg: indicates if debug information should be rendered + + Returns: + A list of strings representing the rendered version of each item in input + 'lines'. + """ + result = [] + dbg_info = None + if dbg: + dbg_info = 'Ratio: %.1f' % ratio + for line, blocks in zip(lines, diff_blocks): + blocks = NormalizeBlocks(blocks, line) + blocks = CompactBlocks(blocks) + diff = RenderIntraLineDiff(blocks, + line, + tag, + dbg_info=dbg_info, + limit=limit, + indent=indent, + tabsize=tabsize, + mark_tabs=mark_tabs) + result.append(diff) + assert len(result) == len(lines) + return result |