Completed Html_differ unit tests.
This commit is contained in:
parent
ba89c8116b
commit
42ae3e0ba1
|
@ -17,6 +17,7 @@ class Html_differ( HTMLParser ):
|
||||||
WORD_AND_WHITESPACE_PATTERN = re.compile( "\S*\s*" )
|
WORD_AND_WHITESPACE_PATTERN = re.compile( "\S*\s*" )
|
||||||
|
|
||||||
def handle_data( self, data ):
|
def handle_data( self, data ):
|
||||||
|
# this turns "foo bar baz" into [ "foo ", "bar ", "baz" ] and extends the result with it
|
||||||
self.result.extend( self.WORD_AND_WHITESPACE_PATTERN.findall( data ) )
|
self.result.extend( self.WORD_AND_WHITESPACE_PATTERN.findall( data ) )
|
||||||
|
|
||||||
def handle_charref( self, ref ):
|
def handle_charref( self, ref ):
|
||||||
|
@ -33,8 +34,7 @@ class Html_differ( HTMLParser ):
|
||||||
|
|
||||||
def handle_endtag( self, tag, attrs ):
|
def handle_endtag( self, tag, attrs ):
|
||||||
if tag not in self.requires_no_close:
|
if tag not in self.requires_no_close:
|
||||||
bracketed = "</%s>" % tag
|
self.result.append( "</%s>" % tag )
|
||||||
self.result.append( bracketed )
|
|
||||||
|
|
||||||
def unknown_starttag( self, tag, attr ):
|
def unknown_starttag( self, tag, attr ):
|
||||||
self.handle_starttag( tag, None, attr )
|
self.handle_starttag( tag, None, attr )
|
||||||
|
@ -45,35 +45,56 @@ class Html_differ( HTMLParser ):
|
||||||
# used to replace, for instance, "<br/>" with "<br />"
|
# used to replace, for instance, "<br/>" with "<br />"
|
||||||
INVALID_TAG_PATTERN = re.compile( "(\S)/>" )
|
INVALID_TAG_PATTERN = re.compile( "(\S)/>" )
|
||||||
INVALID_TAG_FIX = "\\1 />"
|
INVALID_TAG_FIX = "\\1 />"
|
||||||
START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" )
|
|
||||||
END_TAG_PATTERN = re.compile( "</([^>]+)>" )
|
def convert_html_to_list( self, html ):
|
||||||
|
"""
|
||||||
|
Given an HTML string, produce a list of its constituent elements (tags and text).
|
||||||
|
|
||||||
|
@type html: unicode
|
||||||
|
@param html: HTML string to parse
|
||||||
|
@rtype: [ unicode, ... ]
|
||||||
|
@return: parsed list of HTML elements
|
||||||
|
"""
|
||||||
|
self.reset()
|
||||||
|
self.result = []
|
||||||
|
html = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html )
|
||||||
|
self.feed( html )
|
||||||
|
return [ x for x in self.result if x != "" ]
|
||||||
|
|
||||||
def diff( self, html_a, html_b ):
|
def diff( self, html_a, html_b ):
|
||||||
"""
|
"""
|
||||||
Return a composite HTML diff of the given HTML input strings.
|
Return a composite HTML diff of the given HTML input strings. The returned string contains the
|
||||||
|
entirety of the input strings, but with deleted/modified text from html_a wrapped in <del> tags,
|
||||||
|
and inserted/modified text from html_b wrapped in <ins> tags.
|
||||||
|
|
||||||
|
@type html_a: unicode
|
||||||
|
@param html_a: original HTML string
|
||||||
|
@type html_b: unicode
|
||||||
|
@param html_b: modified HTML string
|
||||||
|
@rtype: unicode
|
||||||
|
@return: composite HTML diff
|
||||||
"""
|
"""
|
||||||
# parse html_a into a list
|
# parse the two html strings into lists
|
||||||
self.reset()
|
a = self.convert_html_to_list( html_a )
|
||||||
self.result = []
|
b = self.convert_html_to_list( html_b )
|
||||||
html_a = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_a )
|
|
||||||
self.feed( html_a )
|
|
||||||
a = [ x for x in self.result if x != "" ]
|
|
||||||
|
|
||||||
# parse html_b into a list
|
# prepare the two lists for diffing, and then diff 'em
|
||||||
self.reset()
|
( a, b ) = self.prepare_lists( a, b )
|
||||||
self.result = []
|
return self.diff_lists( a, b )
|
||||||
html_b = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_b )
|
|
||||||
self.feed( html_b )
|
|
||||||
b = [ x for x in self.result if x != "" ]
|
|
||||||
|
|
||||||
( a, b ) = self.__prepare_lists( a, b )
|
START_TAG_PATTERN = re.compile( "<([^/][^>]*)>" )
|
||||||
return self.__diff_lists( a, b )
|
END_TAG_PATTERN = re.compile( "</([^>]+)>" )
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def __track_open_tags( item, open_tags ):
|
def track_open_tags( item, open_tags ):
|
||||||
"""
|
"""
|
||||||
Add or remove from the open_tags list based on whether the given item is a start or end
|
Add or remove from the open_tags list based on whether the given item contains a start or end
|
||||||
tag.
|
tag. If item does not contain any tag, then open_tags remains unchanged.
|
||||||
|
|
||||||
|
@type item: unicode
|
||||||
|
@param item: chunk of HTML, containing either an HTML tag or just text
|
||||||
|
@type open_tags: [ unicode, ... ]
|
||||||
|
@param open_tags: list of open tags
|
||||||
"""
|
"""
|
||||||
match = Html_differ.START_TAG_PATTERN.search( item )
|
match = Html_differ.START_TAG_PATTERN.search( item )
|
||||||
if match:
|
if match:
|
||||||
|
@ -87,10 +108,26 @@ class Html_differ( HTMLParser ):
|
||||||
if match and tag in open_tags:
|
if match and tag in open_tags:
|
||||||
open_tags.remove( tag )
|
open_tags.remove( tag )
|
||||||
|
|
||||||
def __prepare_lists( self, a, b ):
|
def prepare_lists( self, a, b ):
|
||||||
"""
|
"""
|
||||||
Prepare the two lists for diffing by merging together adjacent elements within
|
Prepare the two lists for diffing by merging together adjacent elements that occur within
|
||||||
modified/inserted/deleted start and end HTML tags.
|
modified start and end HTML tags.
|
||||||
|
|
||||||
|
For instance, if:
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', '<i>', 'bar ', 'baz', '</i> ', 'quux' ]
|
||||||
|
then the returned lists are as follows:
|
||||||
|
a = [ 'foo ', 'bar baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', '<i>bar baz</i> ', 'quux' ]
|
||||||
|
|
||||||
|
Merging these elements together ensures that they're diffed as a single unit. Failing to perform
|
||||||
|
this step would mean that when a phrase in list a becomes italicized in list b, then it wouldn't
|
||||||
|
show up as modified in the resulting diff.
|
||||||
|
|
||||||
|
@type a: [ unicode, ... ]
|
||||||
|
@type b: [ unicode, ... ]
|
||||||
|
@rtype: ( [ unicode, ... ], [ unicode, ... ] )
|
||||||
|
@return: tuple of resulting list a and list b
|
||||||
"""
|
"""
|
||||||
matcher = SequenceMatcher( None, a, b )
|
matcher = SequenceMatcher( None, a, b )
|
||||||
result_a = []
|
result_a = []
|
||||||
|
@ -112,9 +149,9 @@ class Html_differ( HTMLParser ):
|
||||||
|
|
||||||
# go through the altered items looking for start and end tags
|
# go through the altered items looking for start and end tags
|
||||||
for i in range( i1, i2 ):
|
for i in range( i1, i2 ):
|
||||||
Html_differ.__track_open_tags( a[ i ], open_tags )
|
Html_differ.track_open_tags( a[ i ], open_tags )
|
||||||
for j in range( j1, j2 ):
|
for j in range( j1, j2 ):
|
||||||
Html_differ.__track_open_tags( b[ j ], open_tags )
|
Html_differ.track_open_tags( b[ j ], open_tags )
|
||||||
|
|
||||||
if change_type == "replace":
|
if change_type == "replace":
|
||||||
open_del_items.extend( a[ i1:i2 ] )
|
open_del_items.extend( a[ i1:i2 ] )
|
||||||
|
@ -125,16 +162,23 @@ class Html_differ( HTMLParser ):
|
||||||
open_ins_items.extend( b[ j1:j2 ] )
|
open_ins_items.extend( b[ j1:j2 ] )
|
||||||
|
|
||||||
if len( open_tags ) == 0:
|
if len( open_tags ) == 0:
|
||||||
result_a.append( ''.join( open_del_items ) )
|
if len( open_del_items ) > 0:
|
||||||
result_b.append( ''.join( open_ins_items ) )
|
result_a.append( ''.join( open_del_items ) )
|
||||||
|
if len( open_ins_items ) > 0:
|
||||||
|
result_b.append( ''.join( open_ins_items ) )
|
||||||
open_del_items = []
|
open_del_items = []
|
||||||
open_ins_items = []
|
open_ins_items = []
|
||||||
|
|
||||||
return ( result_a, result_b )
|
return ( result_a, result_b )
|
||||||
|
|
||||||
def __diff_lists( self, a, b ):
|
def diff_lists( self, a, b ):
|
||||||
"""
|
"""
|
||||||
Diff two prepared lists and return the result as an HTML string.
|
Diff two prepared lists and return the result as an HTML string.
|
||||||
|
|
||||||
|
@type a: [ unicode, ... ]
|
||||||
|
@type b: [ unicode, ... ]
|
||||||
|
@rtype: unicode
|
||||||
|
@return: composite HTML diff
|
||||||
"""
|
"""
|
||||||
matcher = SequenceMatcher( None, a, b )
|
matcher = SequenceMatcher( None, a, b )
|
||||||
result = []
|
result = []
|
||||||
|
|
|
@ -0,0 +1,201 @@
|
||||||
|
from controller.Html_differ import Html_differ
|
||||||
|
|
||||||
|
|
||||||
|
class Test_html_differ( object ):
|
||||||
|
def setUp( self ):
|
||||||
|
self.differ = Html_differ()
|
||||||
|
|
||||||
|
def test_convert_html_to_list( self ):
|
||||||
|
result = self.differ.convert_html_to_list( u"foo <i>bar baz</i> quux" )
|
||||||
|
|
||||||
|
assert len( result ) == 7
|
||||||
|
assert result[ 0 ] == u"foo "
|
||||||
|
assert result[ 1 ] == u"<i>"
|
||||||
|
assert result[ 2 ] == u"bar "
|
||||||
|
assert result[ 3 ] == u"baz"
|
||||||
|
assert result[ 4 ] == u"</i>"
|
||||||
|
assert result[ 5 ] == u" "
|
||||||
|
assert result[ 6 ] == u"quux"
|
||||||
|
|
||||||
|
def test_convert_html_to_list_with_character_ref( self ):
|
||||||
|
result = self.differ.convert_html_to_list( u"foo # quux" )
|
||||||
|
|
||||||
|
assert len( result ) == 4
|
||||||
|
assert result[ 0 ] == u"foo "
|
||||||
|
assert result[ 1 ] == u"#"
|
||||||
|
assert result[ 2 ] == u" "
|
||||||
|
assert result[ 3 ] == u"quux"
|
||||||
|
|
||||||
|
def test_convert_html_to_list_with_entity_ref( self ):
|
||||||
|
result = self.differ.convert_html_to_list( u"foo quux" )
|
||||||
|
|
||||||
|
assert len( result ) == 4
|
||||||
|
assert result[ 0 ] == u"foo "
|
||||||
|
assert result[ 1 ] == u" "
|
||||||
|
assert result[ 2 ] == u" "
|
||||||
|
assert result[ 3 ] == u"quux"
|
||||||
|
|
||||||
|
def test_diff_with_insert( self ):
|
||||||
|
a = 'foo bar baz quux'
|
||||||
|
b = 'foo bar whee baz quux'
|
||||||
|
|
||||||
|
result = self.differ.diff( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo bar <ins class="diff">whee </ins>baz quux'
|
||||||
|
|
||||||
|
def test_diff_with_delete( self ):
|
||||||
|
a = 'foo bar baz quux'
|
||||||
|
b = 'foo bar quux'
|
||||||
|
|
||||||
|
result = self.differ.diff( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo bar <del class="diff">baz </del>quux'
|
||||||
|
|
||||||
|
def test_diff_with_replace( self ):
|
||||||
|
a = 'foo bar baz quux'
|
||||||
|
b = 'foo bar whee quux'
|
||||||
|
|
||||||
|
result = self.differ.diff( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo bar <del class="diff modified">baz </del><ins class="diff modified">whee </ins>quux'
|
||||||
|
|
||||||
|
def test_diff_with_italics( self ):
|
||||||
|
a = 'foo bar baz quux'
|
||||||
|
b = 'foo <i>bar baz</i> quux'
|
||||||
|
|
||||||
|
result = self.differ.diff( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo <del class="diff modified">bar baz </del><ins class="diff modified"><i>bar baz</i> </ins>quux'
|
||||||
|
|
||||||
|
def test_diff_with_italics_and_insert( self ):
|
||||||
|
a = 'foo bar baz quux'
|
||||||
|
b = 'foo <i>bar whee baz</i> quux'
|
||||||
|
|
||||||
|
result = self.differ.diff( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo <del class="diff modified">bar baz </del><ins class="diff modified"><i>bar whee baz</i> </ins>quux'
|
||||||
|
|
||||||
|
def test_track_open_tags( self ):
|
||||||
|
open_tags = []
|
||||||
|
|
||||||
|
self.differ.track_open_tags( u"foo ", open_tags )
|
||||||
|
assert open_tags == []
|
||||||
|
self.differ.track_open_tags( u"<i>", open_tags )
|
||||||
|
assert open_tags == [ u"i" ]
|
||||||
|
self.differ.track_open_tags( u"bar ", open_tags )
|
||||||
|
assert open_tags == [ u"i" ]
|
||||||
|
self.differ.track_open_tags( u"<b>", open_tags )
|
||||||
|
assert open_tags == [ u"i", u"b" ]
|
||||||
|
self.differ.track_open_tags( u"baz", open_tags )
|
||||||
|
assert open_tags == [ u"i", u"b" ]
|
||||||
|
self.differ.track_open_tags( u"</b>", open_tags )
|
||||||
|
assert open_tags == [ u"i" ]
|
||||||
|
self.differ.track_open_tags( u"</i>", open_tags )
|
||||||
|
assert open_tags == []
|
||||||
|
self.differ.track_open_tags( u"quux", open_tags )
|
||||||
|
assert open_tags == []
|
||||||
|
|
||||||
|
def test_prepare_lists_with_insert( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', 'bar ', 'whee ', 'baz ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.prepare_lists( a, b )
|
||||||
|
|
||||||
|
assert len( result ) == 2
|
||||||
|
( new_a, new_b ) = result
|
||||||
|
|
||||||
|
# there should be no change
|
||||||
|
assert new_a == a
|
||||||
|
assert new_b == b
|
||||||
|
|
||||||
|
def test_prepare_lists_with_delete( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', 'bar ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.prepare_lists( a, b )
|
||||||
|
|
||||||
|
assert len( result ) == 2
|
||||||
|
( new_a, new_b ) = result
|
||||||
|
|
||||||
|
# there should be no change
|
||||||
|
assert new_a == a
|
||||||
|
assert new_b == b
|
||||||
|
|
||||||
|
def test_prepare_lists_with_replace( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', 'bar ', 'whee ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.prepare_lists( a, b )
|
||||||
|
|
||||||
|
assert len( result ) == 2
|
||||||
|
( new_a, new_b ) = result
|
||||||
|
|
||||||
|
# there should be no change
|
||||||
|
assert new_a == a
|
||||||
|
assert new_b == b
|
||||||
|
|
||||||
|
def test_prepare_lists_with_italics( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', '<i>', 'bar ', 'baz', '</i> ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.prepare_lists( a, b )
|
||||||
|
|
||||||
|
assert len( result ) == 2
|
||||||
|
( new_a, new_b ) = result
|
||||||
|
|
||||||
|
# the elements within italics should be merged
|
||||||
|
assert new_a == [ 'foo ', 'bar baz ', 'quux' ]
|
||||||
|
assert new_b == [ 'foo ', '<i>bar baz</i> ', 'quux' ]
|
||||||
|
|
||||||
|
def test_prepare_lists_with_italics_and_insert( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', '<i>', 'bar ', 'whee ', 'baz', '</i> ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.prepare_lists( a, b )
|
||||||
|
|
||||||
|
assert len( result ) == 2
|
||||||
|
( new_a, new_b ) = result
|
||||||
|
|
||||||
|
# the elements within italics should be merged
|
||||||
|
assert new_a == [ 'foo ', 'bar baz ', 'quux' ]
|
||||||
|
assert new_b == [ 'foo ', '<i>bar whee baz</i> ', 'quux' ]
|
||||||
|
|
||||||
|
def test_diff_lists_with_insert( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', 'bar ', 'whee ', 'baz ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.diff_lists( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo bar <ins class="diff">whee </ins>baz quux'
|
||||||
|
|
||||||
|
def test_diff_lists_with_delete( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', 'bar ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.diff_lists( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo bar <del class="diff">baz </del>quux'
|
||||||
|
|
||||||
|
def test_diff_lists_with_replace( self ):
|
||||||
|
a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', 'bar ', 'whee ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.diff_lists( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo bar <del class="diff modified">baz </del><ins class="diff modified">whee </ins>quux'
|
||||||
|
|
||||||
|
def test_diff_lists_with_italics( self ):
|
||||||
|
a = [ 'foo ', 'bar baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', '<i>bar baz</i> ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.diff_lists( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo <del class="diff modified">bar baz </del><ins class="diff modified"><i>bar baz</i> </ins>quux'
|
||||||
|
|
||||||
|
def test_diff_lists_with_italics_and_insert( self ):
|
||||||
|
a = [ 'foo ', 'bar baz ', 'quux' ]
|
||||||
|
b = [ 'foo ', '<i>bar whee baz</i> ', 'quux' ]
|
||||||
|
|
||||||
|
result = self.differ.diff_lists( a, b )
|
||||||
|
|
||||||
|
assert result == 'foo <del class="diff modified">bar baz </del><ins class="diff modified"><i>bar whee baz</i> </ins>quux'
|
Reference in New Issue