Personal wiki notebook (not under development)

Html_nuker.py 1.6KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. from htmllib import HTMLParser
  2. from formatter import AbstractFormatter, NullWriter
  3. class Html_nuker( HTMLParser ):
  4. """
  5. Nukes HTML of all tags, and optionally all entity/characters references.
  6. """
  7. def __init__( self, allow_refs = False ):
  8. HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )
  9. self.result = []
  10. self.allow_refs = allow_refs
  11. def handle_data( self, data ):
  12. if data and "<" not in data and ">" not in data:
  13. self.result.append( data )
  14. def handle_charref( self, ref ):
  15. ref = int( ref )
  16. if self.allow_refs:
  17. self.result.append( "&#%s;" % ref )
  18. # convert ascii references to their character equivalents
  19. elif ref >= 32 and ref < 128:
  20. self.result.append( chr( ref ) )
  21. def handle_entityref( self, ref ):
  22. if self.allow_refs:
  23. if ref == "nbsp":
  24. self.result.append( " " )
  25. else:
  26. self.result.append( "&%s;" % ref )
  27. else:
  28. self.result.append( {
  29. "amp": "&",
  30. "lt": "<",
  31. "gt": ">",
  32. "quot": '"',
  33. "nbsp": " ",
  34. }.get ( ref, "" ) )
  35. def handle_comment( self, comment ):
  36. pass
  37. def handle_starttag( self, tag, method, attrs ):
  38. pass
  39. def handle_endtag( self, tag, attrs ):
  40. pass
  41. def unknown_starttag( self, tag, attributes ):
  42. pass
  43. def unknown_endtag( self, tag ):
  44. pass
  45. def nuke( self, rawstring ):
  46. """
  47. Nukes the given string of all HTML tags.
  48. """
  49. if rawstring is None:
  50. return u""
  51. self.reset()
  52. self.result = []
  53. self.feed( rawstring )
  54. result = u"".join( self.result ).strip()
  55. return result