witten
/
luminotes
Archived
1
0
Fork 0
This repository has been archived on 2023-12-16. You can view files and clone it, but cannot push or open issues or pull requests.
luminotes/controller/Html_cleaner.py

147 lines
4.6 KiB
Python

# originally from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942
from htmllib import HTMLParser
from cgi import escape
from urlparse import urlparse
from formatter import AbstractFormatter, NullWriter
from htmlentitydefs import entitydefs
from xml.sax.saxutils import quoteattr
def xssescape(text):
    """Return text made safe for literal display inside an HTML page.

    The characters &, <, > and the double quote are replaced by their HTML
    entities, and every colon is additionally rewritten as &#58; so the
    result cannot spell out a URL scheme such as "javascript:".
    """
    entity_escaped = escape(text, quote=True)
    # escape() never produces a colon itself, so doing this second is safe.
    return entity_escaped.replace(':', '&#58;')
class Html_cleaner(HTMLParser):
    """
    Cleans HTML of any tags not matching a whitelist.

    Call strip() with untrusted markup. Tags in permitted_tags are kept (with
    only the attributes listed for them in allowed_attributes), tags in
    stripped_tags are removed, and every other tag is escaped so that it shows
    up as literal text. Text, comments and unrecognized character/entity
    references are escaped as well.
    """

    def __init__(self):
        # The base parser requires a formatter; its output is never used, so
        # it is pointed at a writer that discards everything.
        HTMLParser.__init__(self, AbstractFormatter(NullWriter()))

        # Output fragments collected while parsing; joined by strip().
        self.result = []
        # Permitted tags that have been opened but not yet closed, most
        # recently opened first.
        self.open_tags = []

        # A list of the only tags allowed. Be careful adding to this. Adding
        # "script," for example, would not be smart. 'img' is out by default
        # because of the danger of IMG embedded commands, and/or web bugs.
        self.permitted_tags = [
            'a',
            'b',
            'br',
            'em',
            'h3',
            'i',
            'li',
            'ol',
            'ul',
            'p',
            'strong',
        ]

        # A list of tags that are forcibly removed from the input. Tags that
        # are not in permitted_tags and not in stripped_tags are simply
        # escaped.
        self.stripped_tags = [
            'span',
        ]

        # A list of tags that require no closing tag.
        self.requires_no_close = ['img', 'br']

        # A dictionary showing the only attributes allowed for particular
        # tags. If a tag is not listed here, it is allowed no attributes.
        # Adding "on" tags, like "onhover," would not be smart. Also be very
        # careful of "background" and "style."
        self.allowed_attributes = {
            'a': ['href'],
        }

        # The only schemes allowed in URLs (for href and src attributes).
        # Adding "javascript" or "vbscript" to this list would not be smart.
        # The empty string admits scheme-less (relative) URLs.
        self.allowed_schemes = ['http', 'https', 'ftp', '']

    def handle_data(self, data):
        """Append escaped character data to the output."""
        if data:
            self.result.append(xssescape(data))

    def handle_charref(self, ref):
        """Pass short decimal character references through; escape the rest."""
        if len(ref) < 7 and ref.isdigit():
            self.result.append('&#%s;' % ref)
        else:
            self.result.append(xssescape('&#%s' % ref))

    def handle_entityref(self, ref):
        """Pass known named entities through; escape anything unknown."""
        if ref in entitydefs:
            self.result.append('&%s;' % ref)
        else:
            self.result.append(xssescape('&%s' % ref))

    def handle_comment(self, comment):
        """Emit a non-empty comment as escaped, visible text."""
        if comment:
            self.result.append(xssescape("<!--%s-->" % comment))

    def handle_starttag(self, tag, method, attrs):
        """Emit an opening tag according to the whitelist.

        tag    -- tag name
        method -- ignored; present only to match the parser's callback
                  signature
        attrs  -- sequence of (name, value) pairs

        Tags outside permitted_tags are escaped, or dropped if they are in
        stripped_tags. For permitted tags only non-empty, whitelisted
        attributes are kept, and URL-bearing attributes must additionally
        pass url_is_acceptable(). An 'a' or 'img' tag left without any
        attribute is dropped altogether.
        """
        if tag not in self.permitted_tags:
            if tag not in self.stripped_tags:
                self.result.append(xssescape("<%s>" % tag))
            return

        pieces = ["<" + tag]
        supplied = dict(attrs)
        for name in self.allowed_attributes.get(tag, ()):
            value = supplied.get(name)
            if not value:
                continue
            if name in ('href', 'src', 'background') \
                    and not self.url_is_acceptable(value):
                continue
            # quoteattr() picks the quoting and escapes &, <, > and quotes,
            # so a value containing '"' cannot terminate the attribute early
            # and smuggle in further attributes.
            pieces.append(' %s=%s' % (xssescape(name), quoteattr(value)))

        # A link or image with nothing left on it is useless; drop it. Its
        # end tag is then dropped too because the tag is never recorded as
        # open.
        if tag in ('a', 'img') and len(pieces) == 1:
            return

        if tag in self.requires_no_close:
            # Self-closing: nothing to balance later, so it is not tracked.
            pieces.append("/>")
        else:
            pieces.append(">")
            self.open_tags.insert(0, tag)
        self.result.append("".join(pieces))

    def handle_endtag(self, tag, attrs):
        """Emit a closing tag according to the whitelist.

        The second argument is ignored. A permitted closing tag is emitted
        only when a matching tag is currently open; otherwise it is dropped.
        Non-permitted closing tags are escaped unless listed in
        stripped_tags.
        """
        bracketed = "</%s>" % tag
        if tag not in self.permitted_tags:
            if tag not in self.stripped_tags:
                self.result.append(xssescape(bracketed))
        elif tag in self.open_tags:
            self.result.append(bracketed)
            self.open_tags.remove(tag)

    def unknown_starttag(self, tag, attributes):
        """Route tags the base parser does not know to handle_starttag()."""
        self.handle_starttag(tag, None, attributes)

    def unknown_endtag(self, tag):
        """Route end tags the base parser does not know to handle_endtag()."""
        self.handle_endtag(tag, None)

    def url_is_acceptable(self, url):
        """Return True if the URL's scheme is one of allowed_schemes.

        Browsers disregard leading and trailing whitespace/control characters
        and embedded tab, newline and carriage-return characters when they
        resolve a URL, so those are removed before the scheme is examined.
        Otherwise a value such as " javascript:..." or "java\\tscript:..."
        would look scheme-less here and be accepted.
        """
        edge_chars = ''.join([chr(code) for code in range(0x21)])
        normalized = url.strip(edge_chars)
        for ignored in ('\t', '\n', '\r'):
            normalized = normalized.replace(ignored, '')
        return urlparse(normalized)[0] in self.allowed_schemes

    def strip(self, rawstring):
        """Returns the argument stripped of potentially harmful HTML or Javascript code"""
        self.reset()
        self.result = []
        # Start from a clean slate so tags left open by a previous call on
        # this instance are not closed again in this call's output.
        self.open_tags = []

        self.feed(rawstring)
        # Flush whatever the parser is still buffering (for example a
        # trailing incomplete tag or entity); it arrives via handle_data()
        # and is therefore escaped rather than silently lost.
        self.close()

        # Balance any permitted tags the input left open.
        for endtag in self.open_tags:
            if endtag not in self.requires_no_close:
                self.result.append("</%s>" % endtag)
        return "".join(self.result)

    def xtags(self):
        """Returns a printable string informing the user which tags are allowed"""
        listing = []
        # sorted() leaves self.permitted_tags itself in its original order.
        for tag in sorted(self.permitted_tags):
            attribute_text = "".join(
                [' %s=""' % name
                 for name in self.allowed_attributes.get(tag, ())]
            )
            listing.append("<%s%s>" % (tag, attribute_text))
        return xssescape(" ".join(listing))