witten
/
luminotes
Archived
1
0
Fork 0
This repository has been archived on 2023-12-16. You can view files and clone it, but cannot push or open issues or pull requests.
luminotes/controller/Html_nuker.py

70 lines
1.6 KiB
Python

from htmllib import HTMLParser
from formatter import AbstractFormatter, NullWriter
class Html_nuker( HTMLParser ):
"""
Nukes HTML of all tags, and optionally all entity/characters references.
"""
def __init__( self, allow_refs = False ):
HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )
self.result = []
self.allow_refs = allow_refs
def handle_data( self, data ):
if data and "<" not in data and ">" not in data:
self.result.append( data )
def handle_charref( self, ref ):
ref = int( ref )
if self.allow_refs:
self.result.append( "&#%s;" % ref )
# convert ascii references to their character equivalents
elif ref >= 32 and ref < 128:
self.result.append( chr( ref ) )
def handle_entityref( self, ref ):
if self.allow_refs:
if ref == "nbsp":
self.result.append( " " )
else:
self.result.append( "&%s;" % ref )
else:
self.result.append( {
"amp": "&",
"lt": "<",
"gt": ">",
"quot": '"',
"nbsp": " ",
}.get ( ref, "" ) )
def handle_comment( self, comment ):
pass
def handle_starttag( self, tag, method, attrs ):
pass
def handle_endtag( self, tag, attrs ):
pass
def unknown_starttag( self, tag, attributes ):
pass
def unknown_endtag( self, tag ):
pass
def nuke( self, rawstring ):
"""
Nukes the given string of all HTML tags.
"""
if rawstring is None:
return u""
self.reset()
self.result = []
self.feed( rawstring )
result = u"".join( self.result ).strip()
return result