diff --git a/controller/Html_differ.py b/controller/Html_differ.py
index 3566dd7..0777c7f 100644
--- a/controller/Html_differ.py
+++ b/controller/Html_differ.py
@@ -17,6 +17,7 @@ class Html_differ( HTMLParser ):
WORD_AND_WHITESPACE_PATTERN = re.compile( "\S*\s*" )
def handle_data( self, data ):
+ # this turns "foo bar baz" into [ "foo ", "bar ", "baz", "" ] and extends the result with it (empty strings are filtered out later by convert_html_to_list)
self.result.extend( self.WORD_AND_WHITESPACE_PATTERN.findall( data ) )
def handle_charref( self, ref ):
@@ -33,8 +34,7 @@ class Html_differ( HTMLParser ):
def handle_endtag( self, tag, attrs ):
if tag not in self.requires_no_close:
- bracketed = "%s>" % tag
- self.result.append( bracketed )
+ self.result.append( "%s>" % tag )
def unknown_starttag( self, tag, attr ):
self.handle_starttag( tag, None, attr )
@@ -45,35 +45,56 @@ class Html_differ( HTMLParser ):
# used to replace, for instance, "<br/>" with "<br />"
INVALID_TAG_PATTERN = re.compile( "(\S)/>" )
INVALID_TAG_FIX = "\\1 />"
- START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" )
- END_TAG_PATTERN = re.compile( "([^>]+)>" )
+
+ def convert_html_to_list( self, html ):
+ """
+ Given an HTML string, produce a list of its constituent elements (tags and text).
+
+ @type html: unicode
+ @param html: HTML string to parse
+ @rtype: [ unicode, ... ]
+ @return: parsed list of HTML elements
+ """
+ self.reset()
+ self.result = []
+ html = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html )
+ self.feed( html )
+ return [ x for x in self.result if x != "" ]
def diff( self, html_a, html_b ):
"""
- Return a composite HTML diff of the given HTML input strings.
+ Return a composite HTML diff of the given HTML input strings. The returned string contains the
+ entirety of the input strings, but with deleted/modified text from html_a wrapped in <del> tags,
+ and inserted/modified text from html_b wrapped in <ins> tags.
+
+ @type html_a: unicode
+ @param html_a: original HTML string
+ @type html_b: unicode
+ @param html_b: modified HTML string
+ @rtype: unicode
+ @return: composite HTML diff
"""
- # parse html_a into a list
- self.reset()
- self.result = []
- html_a = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_a )
- self.feed( html_a )
- a = [ x for x in self.result if x != "" ]
+ # parse the two html strings into lists
+ a = self.convert_html_to_list( html_a )
+ b = self.convert_html_to_list( html_b )
- # parse html_b into a list
- self.reset()
- self.result = []
- html_b = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_b )
- self.feed( html_b )
- b = [ x for x in self.result if x != "" ]
+ # prepare the two lists for diffing, and then diff 'em
+ ( a, b ) = self.prepare_lists( a, b )
+ return self.diff_lists( a, b )
- ( a, b ) = self.__prepare_lists( a, b )
- return self.__diff_lists( a, b )
+ START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" )
+ END_TAG_PATTERN = re.compile( "([^>]+)>" )
@staticmethod
- def __track_open_tags( item, open_tags ):
+ def track_open_tags( item, open_tags ):
"""
- Add or remove from the open_tags list based on whether the given item is a start or end
- tag.
+ Add or remove from the open_tags list based on whether the given item contains a start or end
+ tag. If item does not contain any tag, then open_tags remains unchanged.
+
+ @type item: unicode
+ @param item: chunk of HTML, containing either an HTML tag or just text
+ @type open_tags: [ unicode, ... ]
+ @param open_tags: list of open tags
"""
match = Html_differ.START_TAG_PATTERN.search( item )
if match:
@@ -87,10 +108,26 @@ class Html_differ( HTMLParser ):
if match and tag in open_tags:
open_tags.remove( tag )
- def __prepare_lists( self, a, b ):
+ def prepare_lists( self, a, b ):
"""
- Prepare the two lists for diffing by merging together adjacent elements within
- modified/inserted/deleted start and end HTML tags.
+ Prepare the two lists for diffing by merging together adjacent elements that occur within
+ modified start and end HTML tags.
+
+ For instance, if:
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', '<i>', 'bar ', 'baz</i>', ' ', 'quux' ]
+ then the returned lists are as follows:
+ a = [ 'foo ', 'bar baz ', 'quux' ]
+ b = [ 'foo ', '<i>bar baz</i> ', 'quux' ]
+
+ Merging these elements together ensures that they're diffed as a single unit. Failing to perform
+ this step would mean that when a phrase in list a becomes italicized in list b, then it wouldn't
+ show up as modified in the resulting diff.
+
+ @type a: [ unicode, ... ]
+ @type b: [ unicode, ... ]
+ @rtype: ( [ unicode, ... ], [ unicode, ... ] )
+ @return: tuple of resulting list a and list b
"""
matcher = SequenceMatcher( None, a, b )
result_a = []
@@ -112,9 +149,9 @@ class Html_differ( HTMLParser ):
# go through the altered items looking for start and end tags
for i in range( i1, i2 ):
- Html_differ.__track_open_tags( a[ i ], open_tags )
+ Html_differ.track_open_tags( a[ i ], open_tags )
for j in range( j1, j2 ):
- Html_differ.__track_open_tags( b[ j ], open_tags )
+ Html_differ.track_open_tags( b[ j ], open_tags )
if change_type == "replace":
open_del_items.extend( a[ i1:i2 ] )
@@ -125,16 +162,23 @@ class Html_differ( HTMLParser ):
open_ins_items.extend( b[ j1:j2 ] )
if len( open_tags ) == 0:
- result_a.append( ''.join( open_del_items ) )
- result_b.append( ''.join( open_ins_items ) )
+ if len( open_del_items ) > 0:
+ result_a.append( ''.join( open_del_items ) )
+ if len( open_ins_items ) > 0:
+ result_b.append( ''.join( open_ins_items ) )
open_del_items = []
open_ins_items = []
return ( result_a, result_b )
- def __diff_lists( self, a, b ):
+ def diff_lists( self, a, b ):
"""
Diff two prepared lists and return the result as an HTML string.
+
+ @type a: [ unicode, ... ]
+ @type b: [ unicode, ... ]
+ @rtype: unicode
+ @return: composite HTML diff
"""
matcher = SequenceMatcher( None, a, b )
result = []
diff --git a/controller/test/Test_html_differ.py b/controller/test/Test_html_differ.py
new file mode 100644
index 0000000..254505a
--- /dev/null
+++ b/controller/test/Test_html_differ.py
@@ -0,0 +1,201 @@
+from controller.Html_differ import Html_differ
+
+
+class Test_html_differ( object ):
+ def setUp( self ):
+ self.differ = Html_differ()
+
+ def test_convert_html_to_list( self ):
+ result = self.differ.convert_html_to_list( u"foo bar baz quux" )
+
+ assert len( result ) == 7
+ assert result[ 0 ] == u"foo "
+ assert result[ 1 ] == u""
+ assert result[ 2 ] == u"bar "
+ assert result[ 3 ] == u"baz"
+ assert result[ 4 ] == u""
+ assert result[ 5 ] == u" "
+ assert result[ 6 ] == u"quux"
+
+ def test_convert_html_to_list_with_character_ref( self ):
+ result = self.differ.convert_html_to_list( u"foo # quux" )
+
+ assert len( result ) == 4
+ assert result[ 0 ] == u"foo "
+ assert result[ 1 ] == u"#"
+ assert result[ 2 ] == u" "
+ assert result[ 3 ] == u"quux"
+
+ def test_convert_html_to_list_with_entity_ref( self ):
+ result = self.differ.convert_html_to_list( u"foo quux" )
+
+ assert len( result ) == 4
+ assert result[ 0 ] == u"foo "
+ assert result[ 1 ] == u" "
+ assert result[ 2 ] == u" "
+ assert result[ 3 ] == u"quux"
+
+ def test_diff_with_insert( self ):
+ a = 'foo bar baz quux'
+ b = 'foo bar whee baz quux'
+
+ result = self.differ.diff( a, b )
+
+ assert result == 'foo bar whee baz quux'
+
+ def test_diff_with_delete( self ):
+ a = 'foo bar baz quux'
+ b = 'foo bar quux'
+
+ result = self.differ.diff( a, b )
+
+ assert result == 'foo bar baz quux'
+
+ def test_diff_with_replace( self ):
+ a = 'foo bar baz quux'
+ b = 'foo bar whee quux'
+
+ result = self.differ.diff( a, b )
+
+ assert result == 'foo bar baz whee quux'
+
+ def test_diff_with_italics( self ):
+ a = 'foo bar baz quux'
+ b = 'foo bar baz quux'
+
+ result = self.differ.diff( a, b )
+
+ assert result == 'foo bar baz bar baz quux'
+
+ def test_diff_with_italics_and_insert( self ):
+ a = 'foo bar baz quux'
+ b = 'foo bar whee baz quux'
+
+ result = self.differ.diff( a, b )
+
+ assert result == 'foo bar baz bar whee baz quux'
+
+ def test_track_open_tags( self ):
+ open_tags = []
+
+ self.differ.track_open_tags( u"foo ", open_tags )
+ assert open_tags == []
+ self.differ.track_open_tags( u"", open_tags )
+ assert open_tags == [ u"i" ]
+ self.differ.track_open_tags( u"bar ", open_tags )
+ assert open_tags == [ u"i" ]
+ self.differ.track_open_tags( u"", open_tags )
+ assert open_tags == [ u"i", u"b" ]
+ self.differ.track_open_tags( u"baz", open_tags )
+ assert open_tags == [ u"i", u"b" ]
+ self.differ.track_open_tags( u"", open_tags )
+ assert open_tags == [ u"i" ]
+ self.differ.track_open_tags( u"", open_tags )
+ assert open_tags == []
+ self.differ.track_open_tags( u"quux", open_tags )
+ assert open_tags == []
+
+ def test_prepare_lists_with_insert( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', 'bar ', 'whee ', 'baz ', 'quux' ]
+
+ result = self.differ.prepare_lists( a, b )
+
+ assert len( result ) == 2
+ ( new_a, new_b ) = result
+
+ # there should be no change
+ assert new_a == a
+ assert new_b == b
+
+ def test_prepare_lists_with_delete( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', 'bar ', 'quux' ]
+
+ result = self.differ.prepare_lists( a, b )
+
+ assert len( result ) == 2
+ ( new_a, new_b ) = result
+
+ # there should be no change
+ assert new_a == a
+ assert new_b == b
+
+ def test_prepare_lists_with_replace( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', 'bar ', 'whee ', 'quux' ]
+
+ result = self.differ.prepare_lists( a, b )
+
+ assert len( result ) == 2
+ ( new_a, new_b ) = result
+
+ # there should be no change
+ assert new_a == a
+ assert new_b == b
+
+ def test_prepare_lists_with_italics( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', '', 'bar ', 'baz', ' ', 'quux' ]
+
+ result = self.differ.prepare_lists( a, b )
+
+ assert len( result ) == 2
+ ( new_a, new_b ) = result
+
+ # the elements within italics should be merged
+ assert new_a == [ 'foo ', 'bar baz ', 'quux' ]
+ assert new_b == [ 'foo ', 'bar baz ', 'quux' ]
+
+ def test_prepare_lists_with_italics_and_insert( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', '', 'bar ', 'whee ', 'baz', ' ', 'quux' ]
+
+ result = self.differ.prepare_lists( a, b )
+
+ assert len( result ) == 2
+ ( new_a, new_b ) = result
+
+ # the elements within italics should be merged
+ assert new_a == [ 'foo ', 'bar baz ', 'quux' ]
+ assert new_b == [ 'foo ', 'bar whee baz ', 'quux' ]
+
+ def test_diff_lists_with_insert( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', 'bar ', 'whee ', 'baz ', 'quux' ]
+
+ result = self.differ.diff_lists( a, b )
+
+ assert result == 'foo bar whee baz quux'
+
+ def test_diff_lists_with_delete( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', 'bar ', 'quux' ]
+
+ result = self.differ.diff_lists( a, b )
+
+ assert result == 'foo bar baz quux'
+
+ def test_diff_lists_with_replace( self ):
+ a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
+ b = [ 'foo ', 'bar ', 'whee ', 'quux' ]
+
+ result = self.differ.diff_lists( a, b )
+
+ assert result == 'foo bar baz whee quux'
+
+ def test_diff_lists_with_italics( self ):
+ a = [ 'foo ', 'bar baz ', 'quux' ]
+ b = [ 'foo ', 'bar baz ', 'quux' ]
+
+ result = self.differ.diff_lists( a, b )
+
+ assert result == 'foo bar baz bar baz quux'
+
+ def test_diff_lists_with_italics_and_insert( self ):
+ a = [ 'foo ', 'bar baz ', 'quux' ]
+ b = [ 'foo ', 'bar whee baz ', 'quux' ]
+
+ result = self.differ.diff_lists( a, b )
+
+ assert result == 'foo bar baz bar whee baz quux'