70 lines
1.6 KiB
Python
70 lines
1.6 KiB
Python
from htmllib import HTMLParser
|
|
from formatter import AbstractFormatter, NullWriter
|
|
|
|
|
|
class Html_nuker(HTMLParser):
    """
    Nukes HTML of all tags, and optionally all entity/character references.

    Text is collected in ``self.result`` by the handler methods while the
    base parser walks the input; nuke() is the public entry point and returns
    the collected pieces joined into one string.

    When ``allow_refs`` is true, references are kept in the output in their
    escaped form (``&#NN;`` / ``&name;``), except ``&nbsp;`` which becomes a
    plain space. When it is false, a handful of common references are
    converted to the literal character and all others are dropped.
    """

    # Named entities that are replaced by a literal character when references
    # are not allowed. Any entity not listed here is replaced by "".
    _LITERAL_ENTITIES = {
        "amp": "&",
        "lt": "<",
        "gt": ">",
        "quot": '"',
        "nbsp": " ",
    }

    def __init__(self, allow_refs=False):
        """
        Create a nuker.

        allow_refs -- if true, keep entity/character references in the output
                      instead of converting or dropping them.
        """
        # The formatter writes to a NullWriter: only the handlers below
        # contribute to the result.
        HTMLParser.__init__(self, AbstractFormatter(NullWriter()))
        self.result = []
        self.allow_refs = allow_refs

    def handle_data(self, data):
        """
        Collect a run of text.

        Empty data, and any data still containing "<" or ">", is discarded so
        that no angle bracket from the raw input reaches the output.
        """
        if not data:
            return
        if "<" in data or ">" in data:
            return
        self.result.append(data)

    def handle_charref(self, ref):
        """
        Handle a numeric character reference such as ``&#65;``.

        ref -- the decimal number of the reference, as a string.

        With allow_refs the reference is re-emitted in ``&#NN;`` form (the
        number is normalised through int()). Otherwise only printable ASCII
        (32..127) is converted to its character; anything else is dropped.
        """
        code = int(ref)
        if self.allow_refs:
            self.result.append("&#%s;" % code)
        # convert ascii references to their character equivalents
        elif 32 <= code < 128:
            self.result.append(chr(code))

    def handle_entityref(self, ref):
        """
        Handle a named entity reference such as ``&amp;``.

        ref -- the entity name without the leading "&" and trailing ";".
        """
        if not self.allow_refs:
            # Unknown entities map to the empty string, i.e. they vanish.
            self.result.append(self._LITERAL_ENTITIES.get(ref, ""))
            return

        if ref == "nbsp":
            self.result.append(" ")
        else:
            self.result.append("&%s;" % ref)

    def handle_comment(self, comment):
        """Discard comments; nothing is emitted for them."""
        pass

    def handle_starttag(self, tag, method, attrs):
        """Discard start tags; nothing is emitted for them."""
        pass

    def handle_endtag(self, tag, attrs):
        """
        Discard end tags; nothing is emitted for them.

        NOTE(review): the sgmllib documentation describes the second argument
        as the bound end_*() method rather than attributes; the parameter
        name is kept as-is for compatibility -- confirm before renaming.
        """
        pass

    def unknown_starttag(self, tag, attributes):
        """Discard start tags that have no dedicated handler."""
        pass

    def unknown_endtag(self, tag):
        """Discard end tags that have no dedicated handler."""
        pass

    def nuke(self, rawstring):
        """
        Nukes the given string of all HTML tags.

        rawstring -- the markup to clean, or None.

        Returns the collected text with leading/trailing whitespace stripped;
        None yields an empty unicode string.
        """
        if rawstring is None:
            return u""

        # Start from a clean parser state so that an instance can be reused
        # for several strings.
        self.reset()
        self.result = []
        self.feed(rawstring)
        # feed() keeps back input it cannot classify yet (for example a
        # trailing "&T" in "AT&T", which might be the start of an entity).
        # close() tells the parser that no more input follows, so the held
        # back tail is flushed through handle_data() instead of being lost.
        self.close()

        return u"".join(self.result).strip()
|