From ba89c8116b2781245eb4ee24638f752af17cf5d0 Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Fri, 2 May 2008 21:03:46 +0000 Subject: [PATCH] Updated to handle the following case: * If a phrase becomes italicized, it should show the non-italic version in red and immediately after, the italic version in green. This required doing a pass on the HTML to merge certain elements together before doing the actual diff. --- controller/Html_differ.py | 80 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 75 insertions(+), 5 deletions(-) diff --git a/controller/Html_differ.py b/controller/Html_differ.py index f50aae5..3566dd7 100644 --- a/controller/Html_differ.py +++ b/controller/Html_differ.py @@ -45,6 +45,8 @@ class Html_differ( HTMLParser ): # used to replace, for instance, "
" with "
" INVALID_TAG_PATTERN = re.compile( "(\S)/>" ) INVALID_TAG_FIX = "\\1 />" + START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" ) + END_TAG_PATTERN = re.compile( "]+)>" ) def diff( self, html_a, html_b ): """ @@ -64,24 +66,92 @@ class Html_differ( HTMLParser ): self.feed( html_b ) b = [ x for x in self.result if x != "" ] + ( a, b ) = self.__prepare_lists( a, b ) return self.__diff_lists( a, b ) + @staticmethod + def __track_open_tags( item, open_tags ): + """ + Add or remove from the open_tags list based on whether the given item is a start or end + tag. + """ + match = Html_differ.START_TAG_PATTERN.search( item ) + if match: + open_tags.append( match.group( 1 ) ) + return + + match = Html_differ.END_TAG_PATTERN.search( item ) + if not match: return + + tag = match.group( 1 ) + if match and tag in open_tags: + open_tags.remove( tag ) + + def __prepare_lists( self, a, b ): + """ + Prepare the two lists for diffing by merging together adjacent elements within + modified/inserted/deleted start and end HTML tags. + """ + matcher = SequenceMatcher( None, a, b ) + result_a = [] + result_b = [] + open_tags = [] # modified start tags + open_del_items = [] # deleted items within modified start and end tags + open_ins_items = [] # inserted items within modified start and end tags + + for ( change_type, i1, i2, j1, j2 ) in matcher.get_opcodes(): + if change_type == "equal": + equal_items = b[ j1:j2 ] + if len( open_tags ) == 0: + result_a.extend( equal_items ) + result_b.extend( equal_items ) + else: + open_del_items.extend( equal_items ) + open_ins_items.extend( equal_items ) + continue + + # go through the altered items looking for start and end tags + for i in range( i1, i2 ): + Html_differ.__track_open_tags( a[ i ], open_tags ) + for j in range( j1, j2 ): + Html_differ.__track_open_tags( b[ j ], open_tags ) + + if change_type == "replace": + open_del_items.extend( a[ i1:i2 ] ) + open_ins_items.extend( b[ j1:j2 ] ) + elif change_type == "delete": + open_del_items.extend( a[ i1:i2 ] ) + elif change_type == "insert": + open_ins_items.extend( b[ j1:j2 ] ) + + if len( open_tags ) == 0: + result_a.append( ''.join( open_del_items ) ) + result_b.append( ''.join( open_ins_items ) ) + open_del_items = [] + open_ins_items = [] + + return ( result_a, result_b ) + def __diff_lists( self, a, b ): + """ + Diff two prepared lists and return the result as an HTML string. + """ matcher = SequenceMatcher( None, a, b ) result = [] + open_tags = [] # inspired by http://www.aaronsw.com/2002/diff/ - for ( tag, i1, i2, j1, j2 ) in matcher.get_opcodes(): - if tag == "replace": + for ( change_type, i1, i2, j1, j2 ) in matcher.get_opcodes(): + if change_type == "replace": result.append( '' + ''.join( a[ i1:i2 ] ) + '' + \ '' + ''.join( b[ j1:j2 ] ) + '' ) - elif tag == "delete": + elif change_type == "delete": result.append( '' + ''.join( a[ i1:i2 ] ) + '' ) - elif tag == "insert": + elif change_type == "insert": result.append( '' + ''.join( b[ j1:j2 ] ) + '' ) - elif tag == "equal": + elif change_type == "equal": result.append( ''.join( b[ j1:j2 ] ) ) return "".join( result )