# originally from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942 import re import urlparse from htmllib import HTMLParser from cgi import escape from formatter import AbstractFormatter, NullWriter from htmlentitydefs import entitydefs from xml.sax.saxutils import quoteattr def xssescape(text): """Gets rid of < and > and & and, for good measure""" return escape(text, quote=True) class Html_cleaner(HTMLParser): """ Cleans HTML of any tags not matching a whitelist. """ NOTE_LINK_URL_PATTERN = re.compile( '[^"]*/notebooks/\w+\?note_id=\w+', re.IGNORECASE ) def __init__( self, require_link_target = False ): HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) ) self.result = [] self.open_tags = [] # A list of the only tags allowed. Be careful adding to this. Adding # "script," for example, would not be smart. 'img' is out by default # because of the danger of IMG embedded commands, and/or web bugs. self.permitted_tags = [ 'a', 'b', 'br', 'em', 'h3', 'i', 'li', 'ol', 'ul', 'p', 'strong', 'u', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'blockquote', 'q', 'cite', 'code', 'samp', 'kbd', 'var', 'dfn', 'address', 'big', 'small', 'ins', 'del', 'acronym', 'abbr', 'strike', 's', 'sub', 'sup', 'tt', 'pre', 'center', 'font', 'basefont', 'multicol', 'spacer', 'layer', 'ilayer', 'nolayer', 'img', 'map', 'area', 'param', 'hr', 'nobr', 'wbr', 'ul', 'ol', 'li', 'dl', 'dt', 'dd', 'menu', 'dir', 'form', 'input', 'button', 'label', 'select', 'option', 'optgroup', 'textarea', 'fieldset', 'legend', 'table', 'tr', 'td', 'th', 'tbody', 'tfoot', 'thead', 'caption', 'col', 'colgroup', ] # A list of tags that are forcibly removed from the input. Tags that # are not in permitted_tags and not in stripped_tags are simply # escaped. self.stripped_tags = [ 'span', 'blink', 'marquee', 'bgsound', 'meta', 'object', 'iframe', 'script', 'noscript', 'applet', 'embed', 'style', 'link', 'html', 'title', 'head', 'body', ] # A list of tags that require no closing tag. self.requires_no_close = [ 'img', 'br' ] # A dictionary showing the only attributes allowed for particular tags. # If a tag is not listed here, it is allowed no attributes. Adding # "on" tags, like "onhover," would not be smart. Also be very careful # of "background" and "style." self.allowed_attributes = { 'a': [ 'href', 'target' ], 'p': [ 'align' ], 'img': [ 'src', 'alt', 'border', 'title', "class" ], 'table': [ 'cellpadding', 'cellspacing', 'border', 'width', 'height' ], 'font': [ 'color', 'size', 'face' ], 'td': [ 'rowspan', 'colspan', 'width', 'height' ], 'th': [ 'rowspan', 'colspan', 'width', 'height' ], } # The only schemes allowed in URLs (for href and src attributes). # Adding "javascript" or "vbscript" to this list would not be smart. self.allowed_schemes = ['http','https','ftp', 'irc', ''] # Boolean indicating whether links need to have a target attribute. self.require_link_target = require_link_target def handle_data(self, data): if data: self.result.append( xssescape(data) ) def handle_charref(self, ref): if len(ref) < 7 and ref.isdigit(): self.result.append( '&#%s;' % ref ) else: self.result.append( xssescape('&#%s' % ref) ) def handle_entityref(self, ref): if ref in entitydefs: self.result.append( '&%s;' % ref ) else: self.result.append( xssescape('&%s' % ref) ) def handle_comment(self, comment): pass # strip comments def handle_starttag(self, tag, method, attrs): if tag not in self.permitted_tags: if tag not in self.stripped_tags: self.result.append( xssescape("<%s>" % tag) ) else: bt = "<" + tag if tag in self.allowed_attributes: attrs = dict(attrs) self.allowed_attributes_here = \ [x for x in self.allowed_attributes[tag] if x in attrs \ and len(attrs[x]) > 0] for attribute in self.allowed_attributes_here: if attribute in ['href', 'src', 'background']: if self.url_is_acceptable(attrs[attribute]): bt += ' %s="%s"' % (attribute, attrs[attribute]) else: bt += ' %s=%s' % \ (xssescape(attribute), quoteattr(attrs[attribute])) if self.require_link_target and tag == "a" and not attrs.get( 'target' ) and \ ( not attrs.get( 'href' ) or not self.NOTE_LINK_URL_PATTERN.search( attrs.get( 'href' ) ) ): bt += ' target="_new"' if bt == "" % endtag ) return "".join( self.result ) def xtags(self): """Returns a printable string informing the user which tags are allowed""" self.permitted_tags.sort() tg = "" for x in self.permitted_tags: tg += "<" + x if x in self.allowed_attributes: for y in self.allowed_attributes[x]: tg += ' %s=""' % y tg += "> " return xssescape(tg.strip())