diff --git a/controller/Html_differ.py b/controller/Html_differ.py new file mode 100644 index 0000000..f50aae5 --- /dev/null +++ b/controller/Html_differ.py @@ -0,0 +1,87 @@ +import re +from difflib import SequenceMatcher +from htmllib import HTMLParser +from formatter import AbstractFormatter, NullWriter +from xml.sax.saxutils import quoteattr + + +class Html_differ( HTMLParser ): + """ + Generates an HTML diff for two HTML strings. It assumed that the input HTML is already cleaned. + """ + def __init__( self ): + HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) ) + self.result = [] + self.requires_no_close = [ 'img', 'br' ] + + WORD_AND_WHITESPACE_PATTERN = re.compile( "\S*\s*" ) + + def handle_data( self, data ): + self.result.extend( self.WORD_AND_WHITESPACE_PATTERN.findall( data ) ) + + def handle_charref( self, ref ): + self.result.append( '&#%s;' % ref ) + + def handle_entityref( self, ref ): + self.result.append( '&%s;' % ref ) + + def handle_comment( self, comment ): + pass # ignore comments + + def handle_starttag( self, tag, method, attrs ): + self.result.append( self.get_starttag_text() ) + + def handle_endtag( self, tag, attrs ): + if tag not in self.requires_no_close: + bracketed = "" % tag + self.result.append( bracketed ) + + def unknown_starttag( self, tag, attr ): + self.handle_starttag( tag, None, attr ) + + def unknown_endtag( self, tag ): + self.handle_endtag( tag, None ) + + # used to replace, for instance, "
" with "
" + INVALID_TAG_PATTERN = re.compile( "(\S)/>" ) + INVALID_TAG_FIX = "\\1 />" + + def diff( self, html_a, html_b ): + """ + Return a composite HTML diff of the given HTML input strings. + """ + # parse html_a into a list + self.reset() + self.result = [] + html_a = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_a ) + self.feed( html_a ) + a = [ x for x in self.result if x != "" ] + + # parse html_b into a list + self.reset() + self.result = [] + html_b = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html_b ) + self.feed( html_b ) + b = [ x for x in self.result if x != "" ] + + return self.__diff_lists( a, b ) + + def __diff_lists( self, a, b ): + matcher = SequenceMatcher( None, a, b ) + result = [] + + # inspired by http://www.aaronsw.com/2002/diff/ + for ( tag, i1, i2, j1, j2 ) in matcher.get_opcodes(): + if tag == "replace": + result.append( + '' + ''.join( a[ i1:i2 ] ) + '' + \ + '' + ''.join( b[ j1:j2 ] ) + '' + ) + elif tag == "delete": + result.append( '' + ''.join( a[ i1:i2 ] ) + '' ) + elif tag == "insert": + result.append( '' + ''.join( b[ j1:j2 ] ) + '' ) + elif tag == "equal": + result.append( ''.join( b[ j1:j2 ] ) ) + + return "".join( result )