diff --git a/controller/Html_differ.py b/controller/Html_differ.py index 3566dd7..0777c7f 100644 --- a/controller/Html_differ.py +++ b/controller/Html_differ.py @@ -17,6 +17,7 @@ class Html_differ( HTMLParser ): WORD_AND_WHITESPACE_PATTERN = re.compile( "\S*\s*" ) def handle_data( self, data ): + # this turns "foo bar baz" into [ "foo ", "bar ", "baz" ] and extends the result with it self.result.extend( self.WORD_AND_WHITESPACE_PATTERN.findall( data ) ) def handle_charref( self, ref ): @@ -33,8 +34,7 @@ class Html_differ( HTMLParser ): def handle_endtag( self, tag, attrs ): if tag not in self.requires_no_close: - bracketed = "" % tag - self.result.append( bracketed ) + self.result.append( "" % tag ) def unknown_starttag( self, tag, attr ): self.handle_starttag( tag, None, attr ) @@ -45,35 +45,56 @@ class Html_differ( HTMLParser ): # used to replace, for instance, "<br/>" with "<br />" INVALID_TAG_PATTERN = re.compile( "(\S)/>" ) INVALID_TAG_FIX = "\\1 />" - START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" ) - END_TAG_PATTERN = re.compile( "]+)>" ) + + def convert_html_to_list( self, html ): + """ + Given an HTML string, produce a list of its constituent elements (tags and text). + + @type html: unicode + @param html: HTML string to parse + @rtype: [ unicode, ... ] + @return: parsed list of HTML elements + """ + self.reset() + self.result = [] + html = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html ) + self.feed( html ) + return [ x for x in self.result if x != "" ] def diff( self, html_a, html_b ): """ - Return a composite HTML diff of the given HTML input strings. + Return a composite HTML diff of the given HTML input strings. The returned string contains the + entirety of the input strings, but with deleted/modified text from html_a wrapped in tags, + and inserted/modified text from html_b wrapped in tags. + + @type html_a: unicode + @param html_a: original HTML string + @type html_b: unicode + @param html_b: modified HTML string + @rtype: unicode + @return: composite HTML diff """ - # parse html_a into a list - self.reset() - self.result = [] - html_a = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_a ) - self.feed( html_a ) - a = [ x for x in self.result if x != "" ] + # parse the two html strings into lists + a = self.convert_html_to_list( html_a ) + b = self.convert_html_to_list( html_b ) - # parse html_b into a list - self.reset() - self.result = [] - html_b = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_b ) - self.feed( html_b ) - b = [ x for x in self.result if x != "" ] + # prepare the two lists for diffing, and then diff 'em + ( a, b ) = self.prepare_lists( a, b ) + return self.diff_lists( a, b ) - ( a, b ) = self.__prepare_lists( a, b ) - return self.__diff_lists( a, b ) + START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" ) + END_TAG_PATTERN = re.compile( "]+)>" ) @staticmethod - def __track_open_tags( 
item, open_tags ): + def track_open_tags( item, open_tags ): """ - Add or remove from the open_tags list based on whether the given item is a start or end - tag. + Add or remove from the open_tags list based on whether the given item contains a start or end + tag. If item does not contain any tag, then open_tags remains unchanged. + + @type item: unicode + @param item: chunk of HTML, containing either an HTML tag or just text + @type open_tags: [ unicode, ... ] + @param open_tags: list of open tags """ match = Html_differ.START_TAG_PATTERN.search( item ) if match: @@ -87,10 +108,26 @@ class Html_differ( HTMLParser ): if match and tag in open_tags: open_tags.remove( tag ) - def __prepare_lists( self, a, b ): + def prepare_lists( self, a, b ): """ - Prepare the two lists for diffing by merging together adjacent elements within - modified/inserted/deleted start and end HTML tags. + Prepare the two lists for diffing by merging together adjacent elements that occur within + modified start and end HTML tags. + + For instance, if: + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', '', 'bar ', 'baz', ' ', 'quux' ] + then the returned lists are as follows: + a = [ 'foo ', 'bar baz ', 'quux' ] + b = [ 'foo ', 'bar baz ', 'quux' ] + + Merging these elements together ensures that they're diffed as a single unit. Failing to perform + this step would mean that when a phrase in list a becomes italicized in list b, then it wouldn't + show up as modified in the resulting diff. + + @type a: [ unicode, ... ] + @type b: [ unicode, ... ] + @rtype: ( [ unicode, ... ], [ unicode, ... 
] ) + @return: tuple of resulting list a and list b """ matcher = SequenceMatcher( None, a, b ) result_a = [] @@ -112,9 +149,9 @@ class Html_differ( HTMLParser ): # go through the altered items looking for start and end tags for i in range( i1, i2 ): - Html_differ.__track_open_tags( a[ i ], open_tags ) + Html_differ.track_open_tags( a[ i ], open_tags ) for j in range( j1, j2 ): - Html_differ.__track_open_tags( b[ j ], open_tags ) + Html_differ.track_open_tags( b[ j ], open_tags ) if change_type == "replace": open_del_items.extend( a[ i1:i2 ] ) @@ -125,16 +162,23 @@ class Html_differ( HTMLParser ): open_ins_items.extend( b[ j1:j2 ] ) if len( open_tags ) == 0: - result_a.append( ''.join( open_del_items ) ) - result_b.append( ''.join( open_ins_items ) ) + if len( open_del_items ) > 0: + result_a.append( ''.join( open_del_items ) ) + if len( open_ins_items ) > 0: + result_b.append( ''.join( open_ins_items ) ) open_del_items = [] open_ins_items = [] return ( result_a, result_b ) - def __diff_lists( self, a, b ): + def diff_lists( self, a, b ): """ Diff two prepared lists and return the result as an HTML string. + + @type a: [ unicode, ... ] + @type b: [ unicode, ... 
] + @rtype: unicode + @return: composite HTML diff """ matcher = SequenceMatcher( None, a, b ) result = [] diff --git a/controller/test/Test_html_differ.py b/controller/test/Test_html_differ.py new file mode 100644 index 0000000..254505a --- /dev/null +++ b/controller/test/Test_html_differ.py @@ -0,0 +1,201 @@ +from controller.Html_differ import Html_differ + + +class Test_html_differ( object ): + def setUp( self ): + self.differ = Html_differ() + + def test_convert_html_to_list( self ): + result = self.differ.convert_html_to_list( u"foo bar baz quux" ) + + assert len( result ) == 7 + assert result[ 0 ] == u"foo " + assert result[ 1 ] == u"" + assert result[ 2 ] == u"bar " + assert result[ 3 ] == u"baz" + assert result[ 4 ] == u"" + assert result[ 5 ] == u" " + assert result[ 6 ] == u"quux" + + def test_convert_html_to_list_with_character_ref( self ): + result = self.differ.convert_html_to_list( u"foo # quux" ) + + assert len( result ) == 4 + assert result[ 0 ] == u"foo " + assert result[ 1 ] == u"#" + assert result[ 2 ] == u" " + assert result[ 3 ] == u"quux" + + def test_convert_html_to_list_with_entity_ref( self ): + result = self.differ.convert_html_to_list( u"foo   quux" ) + + assert len( result ) == 4 + assert result[ 0 ] == u"foo " + assert result[ 1 ] == u" " + assert result[ 2 ] == u" " + assert result[ 3 ] == u"quux" + + def test_diff_with_insert( self ): + a = 'foo bar baz quux' + b = 'foo bar whee baz quux' + + result = self.differ.diff( a, b ) + + assert result == 'foo bar whee baz quux' + + def test_diff_with_delete( self ): + a = 'foo bar baz quux' + b = 'foo bar quux' + + result = self.differ.diff( a, b ) + + assert result == 'foo bar baz quux' + + def test_diff_with_replace( self ): + a = 'foo bar baz quux' + b = 'foo bar whee quux' + + result = self.differ.diff( a, b ) + + assert result == 'foo bar baz whee quux' + + def test_diff_with_italics( self ): + a = 'foo bar baz quux' + b = 'foo bar baz quux' + + result = self.differ.diff( a, b ) + + 
assert result == 'foo bar baz bar baz quux' + + def test_diff_with_italics_and_insert( self ): + a = 'foo bar baz quux' + b = 'foo bar whee baz quux' + + result = self.differ.diff( a, b ) + + assert result == 'foo bar baz bar whee baz quux' + + def test_track_open_tags( self ): + open_tags = [] + + self.differ.track_open_tags( u"foo ", open_tags ) + assert open_tags == [] + self.differ.track_open_tags( u"", open_tags ) + assert open_tags == [ u"i" ] + self.differ.track_open_tags( u"bar ", open_tags ) + assert open_tags == [ u"i" ] + self.differ.track_open_tags( u"", open_tags ) + assert open_tags == [ u"i", u"b" ] + self.differ.track_open_tags( u"baz", open_tags ) + assert open_tags == [ u"i", u"b" ] + self.differ.track_open_tags( u"", open_tags ) + assert open_tags == [ u"i" ] + self.differ.track_open_tags( u"", open_tags ) + assert open_tags == [] + self.differ.track_open_tags( u"quux", open_tags ) + assert open_tags == [] + + def test_prepare_lists_with_insert( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', 'bar ', 'whee ', 'baz ', 'quux' ] + + result = self.differ.prepare_lists( a, b ) + + assert len( result ) == 2 + ( new_a, new_b ) = result + + # there should be no change + assert new_a == a + assert new_b == b + + def test_prepare_lists_with_delete( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', 'bar ', 'quux' ] + + result = self.differ.prepare_lists( a, b ) + + assert len( result ) == 2 + ( new_a, new_b ) = result + + # there should be no change + assert new_a == a + assert new_b == b + + def test_prepare_lists_with_replace( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', 'bar ', 'whee ', 'quux' ] + + result = self.differ.prepare_lists( a, b ) + + assert len( result ) == 2 + ( new_a, new_b ) = result + + # there should be no change + assert new_a == a + assert new_b == b + + def test_prepare_lists_with_italics( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', '', 'bar ', 'baz', ' ', 'quux' 
] + + result = self.differ.prepare_lists( a, b ) + + assert len( result ) == 2 + ( new_a, new_b ) = result + + # the elements within italics should be merged + assert new_a == [ 'foo ', 'bar baz ', 'quux' ] + assert new_b == [ 'foo ', 'bar baz ', 'quux' ] + + def test_prepare_lists_with_italics_and_insert( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', '', 'bar ', 'whee ', 'baz', ' ', 'quux' ] + + result = self.differ.prepare_lists( a, b ) + + assert len( result ) == 2 + ( new_a, new_b ) = result + + # the elements within italics should be merged + assert new_a == [ 'foo ', 'bar baz ', 'quux' ] + assert new_b == [ 'foo ', 'bar whee baz ', 'quux' ] + + def test_diff_lists_with_insert( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', 'bar ', 'whee ', 'baz ', 'quux' ] + + result = self.differ.diff_lists( a, b ) + + assert result == 'foo bar whee baz quux' + + def test_diff_lists_with_delete( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', 'bar ', 'quux' ] + + result = self.differ.diff_lists( a, b ) + + assert result == 'foo bar baz quux' + + def test_diff_lists_with_replace( self ): + a = [ 'foo ', 'bar ', 'baz ', 'quux' ] + b = [ 'foo ', 'bar ', 'whee ', 'quux' ] + + result = self.differ.diff_lists( a, b ) + + assert result == 'foo bar baz whee quux' + + def test_diff_lists_with_italics( self ): + a = [ 'foo ', 'bar baz ', 'quux' ] + b = [ 'foo ', 'bar baz ', 'quux' ] + + result = self.differ.diff_lists( a, b ) + + assert result == 'foo bar baz bar baz quux' + + def test_diff_lists_with_italics_and_insert( self ): + a = [ 'foo ', 'bar baz ', 'quux' ] + b = [ 'foo ', 'bar whee baz ', 'quux' ] + + result = self.differ.diff_lists( a, b ) + + assert result == 'foo bar baz bar whee baz quux'