Personal wiki notebook (not under development)

Html_differ.py 7.5KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. import re
  2. from difflib import SequenceMatcher
  3. from htmllib import HTMLParser
  4. from formatter import AbstractFormatter, NullWriter
  5. from xml.sax.saxutils import quoteattr
  6. class Html_differ( HTMLParser ):
  7. """
  8. Generates an HTML diff for two HTML strings. It assumed that the input HTML is already cleaned.
  9. """
  10. def __init__( self ):
  11. HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )
  12. self.result = []
  13. self.requires_no_close = [ 'img', 'br' ]
  14. WORD_AND_WHITESPACE_PATTERN = re.compile( "\S*\s*" )
  15. def handle_data( self, data ):
  16. # this turns "foo bar baz" into [ "foo ", "bar ", "baz" ] and extends the result with it
  17. self.result.extend( self.WORD_AND_WHITESPACE_PATTERN.findall( data ) )
  18. def handle_charref( self, ref ):
  19. self.result.append( '&#%s;' % ref )
  20. def handle_entityref( self, ref ):
  21. self.result.append( '&%s;' % ref )
  22. def handle_comment( self, comment ):
  23. pass # ignore comments
  24. def handle_starttag( self, tag, method, attrs ):
  25. self.result.append( self.get_starttag_text() )
  26. def handle_endtag( self, tag, attrs ):
  27. if tag not in self.requires_no_close:
  28. self.result.append( "</%s>" % tag )
  29. def unknown_starttag( self, tag, attr ):
  30. self.handle_starttag( tag, None, attr )
  31. def unknown_endtag( self, tag ):
  32. self.handle_endtag( tag, None )
  33. # used to replace, for instance, "<br/>" with "<br />"
  34. INVALID_TAG_PATTERN = re.compile( "(\S)/>" )
  35. INVALID_TAG_FIX = "\\1 />"
  36. def convert_html_to_list( self, html ):
  37. """
  38. Given an HTML string, produce a list of its constituent elements (tags and text).
  39. @type html: unicode
  40. @param html: HTML string to parse
  41. @rtype: [ unicode, ... ]
  42. @return: parsed list of HTML elements
  43. """
  44. self.reset()
  45. self.result = []
  46. html = self.INVALID_TAG_PATTERN.sub( self.INVALID_TAG_FIX, html )
  47. self.feed( html )
  48. return [ x for x in self.result if x != "" ]
  49. def diff( self, html_a, html_b ):
  50. """
  51. Return a composite HTML diff of the given HTML input strings. The returned string contains the
  52. entirety of the input strings, but with deleted/modified text from html_a wrapped in <del> tags,
  53. and inserted/modified text from html_b wrapped in <ins> tags.
  54. @type html_a: unicode
  55. @param html_a: original HTML string
  56. @type html_b: unicode
  57. @param html-b: modified HTML string
  58. @rtype: unicode
  59. @return: composite HTML diff
  60. """
  61. # parse the two html strings into lists
  62. a = self.convert_html_to_list( html_a )
  63. b = self.convert_html_to_list( html_b )
  64. # prepare the two lists for diffing, and then diff 'em
  65. ( a, b ) = self.prepare_lists( a, b )
  66. return self.diff_lists( a, b )
  67. SINGLE_TAG_PATTERN = re.compile( "<(\w+).*/>" ) # e.g. '<br/>' or '<br />' or '<img src="foo" />'
  68. START_TAG_PATTERN = re.compile( "<(\w+).*>" ) # e.g. '<i>' or '<a href="foo">'
  69. END_TAG_PATTERN = re.compile( "</(\w+)>" ) # e.g. '</i>' or '</a>'
  70. @staticmethod
  71. def track_open_tags( item, open_tags ):
  72. """
  73. Add or remove from the open_tags list based on whether the given item contains a start or end
  74. tag. If item does not contain any tag, then open_tags remains unchanged.
  75. @type item: unicode
  76. @param item: chunk of HTML, containing either an HTML tag or just text
  77. @type open_tags: [ unicode, ... ]
  78. @param open_tags: list of open tags
  79. """
  80. match = Html_differ.SINGLE_TAG_PATTERN.search( item )
  81. if match: return
  82. match = Html_differ.START_TAG_PATTERN.search( item )
  83. if match:
  84. open_tags.append( match.group( 1 ) )
  85. return
  86. match = Html_differ.END_TAG_PATTERN.search( item )
  87. if not match: return
  88. tag = match.group( 1 )
  89. if match and tag in open_tags:
  90. open_tags.remove( tag )
  91. def prepare_lists( self, a, b ):
  92. """
  93. Prepare the two lists for diffing by merging together adjacent elements that occur within
  94. modified start and end HTML tags.
  95. For instance, if:
  96. a = [ 'foo ', 'bar ', 'baz ', 'quux' ]
  97. b = [ 'foo ', '<i>', 'bar ', 'baz', '</i> ', 'quux' ]
  98. then the returned lists are as follows:
  99. a = [ 'foo ', 'bar baz ', 'quux' ]
  100. b = [ 'foo ', '<i>bar baz</i> ', 'quux' ]
  101. Merging these elements together ensures that they're diffed as a single unit. Failing to perform
  102. this step would mean that when a phrase in list a becomes italicized in list b, then it wouldn't
  103. show up as modified in the resulting diff.
  104. @type a: [ unicode, ... ]
  105. @type b: [ unicode, ... ]
  106. @rtype: ( [ unicode, ... ], [ unicode, ... ] )
  107. @return: tuple of resulting list a and list b
  108. """
  109. matcher = SequenceMatcher( None, a, b )
  110. result_a = []
  111. result_b = []
  112. open_tags = [] # modified start tags
  113. open_del_items = [] # deleted items within modified start and end tags
  114. open_ins_items = [] # inserted items within modified start and end tags
  115. for ( change_type, i1, i2, j1, j2 ) in matcher.get_opcodes():
  116. if change_type == "equal":
  117. equal_items = b[ j1:j2 ]
  118. if len( open_tags ) == 0:
  119. result_a.extend( equal_items )
  120. result_b.extend( equal_items )
  121. else:
  122. open_del_items.extend( equal_items )
  123. open_ins_items.extend( equal_items )
  124. continue
  125. # go through the altered items looking for start and end tags
  126. orig_len_open_tags = len( open_tags )
  127. for i in range( i1, i2 ):
  128. Html_differ.track_open_tags( a[ i ], open_tags )
  129. for j in range( j1, j2 ):
  130. Html_differ.track_open_tags( b[ j ], open_tags )
  131. all_tags_got_closed = ( orig_len_open_tags > 0 and len( open_tags ) == 0 )
  132. if change_type == "replace":
  133. open_del_items.extend( a[ i1:i2 ] )
  134. open_ins_items.extend( b[ j1:j2 ] )
  135. elif change_type == "delete":
  136. open_del_items.extend( a[ i1:i2 ] )
  137. elif change_type == "insert":
  138. open_ins_items.extend( b[ j1:j2 ] )
  139. if all_tags_got_closed:
  140. # if all tags were just closed, then merge the items that were in those tags
  141. if len( open_del_items ) > 0:
  142. result_a.append( ''.join( open_del_items ) )
  143. if len( open_ins_items ) > 0:
  144. result_b.append( ''.join( open_ins_items ) )
  145. open_del_items = []
  146. open_ins_items = []
  147. elif len( open_tags ) == 0:
  148. result_a.extend( open_del_items )
  149. result_b.extend( open_ins_items )
  150. open_del_items = []
  151. open_ins_items = []
  152. if len( open_del_items ):
  153. result_a.extend( open_del_items )
  154. if len( open_ins_items ):
  155. result_b.extend( open_ins_items )
  156. return ( result_a, result_b )
  157. def diff_lists( self, a, b ):
  158. """
  159. Diff two prepared lists and return the result as an HTML string.
  160. @type a: [ unicode, ... ]
  161. @type b: [ unicode, ... ]
  162. @rtype: unicode
  163. @return: composite HTML diff
  164. """
  165. matcher = SequenceMatcher( None, a, b )
  166. result = []
  167. open_tags = []
  168. # inspired by http://www.aaronsw.com/2002/diff/
  169. for ( change_type, i1, i2, j1, j2 ) in matcher.get_opcodes():
  170. if change_type == "replace":
  171. result.append(
  172. '<del class="diff modified">' + ''.join( a[ i1:i2 ] ) + '</del>' + \
  173. '<ins class="diff modified">' + ''.join( b[ j1:j2 ] ) + '</ins>'
  174. )
  175. elif change_type == "delete":
  176. result.append( '<del class="diff">' + ''.join( a[ i1:i2 ] ) + '</del>' )
  177. elif change_type == "insert":
  178. result.append( '<ins class="diff">' + ''.join( b[ j1:j2 ] ) + '</ins>' )
  179. elif change_type == "equal":
  180. result.append( ''.join( b[ j1:j2 ] ) )
  181. return "".join( result )