luminotes/controller/Html_cleaner.py

# originally from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942

import urlparse
from htmllib import HTMLParser
from cgi import escape
from formatter import AbstractFormatter, NullWriter
from htmlentitydefs import entitydefs
from xml.sax.saxutils import quoteattr


def xssescape(text):
  """
  Returns text with its HTML-significant characters replaced by entities.

  Delegates to cgi.escape() with quoting enabled, so &, < and > are
  converted and, for good measure, the double-quote character as well.
  """
  escaped = escape(text, True)
  return escaped


class Html_cleaner(HTMLParser):
  """
  Cleans HTML of any tags not matching a whitelist.

  Permitted tags are re-emitted with only their whitelisted attributes,
  tags listed in stripped_tags are dropped silently, and every other tag,
  comment, and piece of text is escaped so that it displays literally.
  Call strip() to clean a string; the parser callbacks below accumulate
  the output fragments in self.result.
  """

  # Attributes whose values are URLs and therefore must pass
  # url_is_acceptable() before being emitted.
  _URL_ATTRIBUTES = ( 'href', 'src', 'background' )

  # Space plus the C0 control characters. Browsers discard these from both
  # ends of a URL, so they must not be allowed to hide a scheme.
  _URL_EDGE_JUNK = "".join( map( chr, range( 33 ) ) )

  def __init__( self ):
    HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )

    # Output fragments for the document being cleaned; joined by strip().
    self.result = []

    # Permitted non-void tags that have been emitted but not yet closed,
    # most recently opened first.
    self.open_tags = []

    # A list of the only tags allowed.  Be careful adding to this.  Adding
    # "script," for example, would not be smart.  'img' is out by default
    # because of the danger of IMG embedded commands, and/or web bugs.
    self.permitted_tags = [
      'a',
      'b',
      'br',
      'em',
      'h3',
      'i',
      'li',
      'ol',
      'ul',
      'p',
      'strong',
      'u',
    ]

    # A list of tags that are forcibly removed from the input. Tags that
    # are not in permitted_tags and not in stripped_tags are simply
    # escaped.
    self.stripped_tags = [
      'span',
    ]

    # A list of tags that require no closing tag.
    self.requires_no_close = [ 'img', 'br' ]

    # A dictionary showing the only attributes allowed for particular tags.
    # If a tag is not listed here, it is allowed no attributes.  Adding
    # event-handler attributes, like "onmouseover," would not be smart.
    # Also be very careful of "background" and "style."
    self.allowed_attributes = {
      'a': [ 'href', 'target' ],
    }

    # The only schemes allowed in URLs (for href and src attributes).
    # Adding "javascript" or "vbscript" to this list would not be smart.
    # The empty string permits scheme-less (relative) URLs.
    self.allowed_schemes = ['http','https','ftp', '']

  def handle_data(self, data):
    """Appends text content to the output, escaped."""
    if data:
      self.result.append( xssescape(data) )

  def handle_charref(self, ref):
    """
    Passes short decimal character references through unchanged; anything
    else (overlong or non-decimal) is escaped and shown literally.
    """
    if len(ref) < 7 and ref.isdigit():
      self.result.append( '&#%s;' % ref )
    else:
      self.result.append( xssescape('&#%s' % ref) )

  def handle_entityref(self, ref):
    """
    Passes entity references found in entitydefs through unchanged;
    unknown ones are escaped and shown literally.
    """
    if ref in entitydefs:
      self.result.append( '&%s;' % ref )
    else:
      self.result.append( xssescape('&%s' % ref) )

  def handle_comment(self, comment):
    """Appends a non-empty comment to the output in escaped, visible form."""
    if comment:
      self.result.append( xssescape("<!--%s-->" % comment) )

  def handle_starttag(self, tag, method, attrs):
    """
    Emits a start tag according to the whitelist.

    tag: the tag name
    method: unused here; unknown_starttag() passes None
    attrs: sequence of (name, value) pairs for the tag's attributes

    Non-permitted tags are escaped (or dropped if listed in stripped_tags).
    Permitted tags keep only their whitelisted, non-empty attributes, and
    URL attributes are kept only if url_is_acceptable() approves the value.
    """
    if tag not in self.permitted_tags:
      if tag not in self.stripped_tags:
        self.result.append( xssescape("<%s>" %  tag) )
      return

    bt = "<" + tag
    if tag in self.allowed_attributes:
      attrs = dict(attrs)
      # Whitelisted attributes actually present with a non-empty value.
      present = [ name for name in self.allowed_attributes[tag]
                  if attrs.get( name ) ]
      for name in present:
        value = attrs[name]
        if name in self._URL_ATTRIBUTES:
          # The value must be quoted and escaped just like any other
          # attribute; emitting it raw would let a double quote inside the
          # URL terminate the attribute and inject new ones.
          if self.url_is_acceptable( value ):
            bt += ' %s=%s' % ( name, quoteattr( value ) )
        else:
          bt += ' %s=%s' % ( xssescape( name ), quoteattr( value ) )

    # An <a> or <img> left with no attributes at all is dropped entirely.
    if bt == "<a" or bt == "<img":
      return

    if tag in self.requires_no_close:
      # Self-closing tags are never tracked as open: nothing will close
      # them, and a stray end tag for them should not be emitted.
      self.result.append( bt + "/>" )
      return

    self.result.append( bt + ">" )
    self.open_tags.insert(0, tag)

  def handle_endtag(self, tag, attrs):
    """
    Emits an end tag according to the whitelist.

    tag: the tag name
    attrs: unused here; unknown_endtag() passes None

    A permitted end tag is emitted only if a matching start tag is
    currently open; non-permitted end tags are escaped (or dropped if
    listed in stripped_tags).
    """
    bracketed = "</%s>" % tag
    if tag not in self.permitted_tags:
      if tag not in self.stripped_tags:
        self.result.append( xssescape(bracketed) )
    elif tag in self.open_tags:
      self.result.append( bracketed )
      # Removes the most recently opened occurrence (the list is newest-first).
      self.open_tags.remove(tag)

  def unknown_starttag(self, tag, attributes):
    """Routes start tags the base parser has no handler for through the whitelist."""
    self.handle_starttag(tag, None, attributes)

  def unknown_endtag(self, tag):
    """Routes end tags the base parser has no handler for through the whitelist."""
    self.handle_endtag(tag, None)

  def url_is_acceptable(self,url):
    """
    Returns True if the URL's scheme is one of self.allowed_schemes.

    The scheme is determined the way a browser would see it: spaces and
    control characters at either end, and tabs or newlines anywhere, are
    discarded first. Without this, " javascript:..." or "java\\tscript:..."
    would parse as scheme-less and slip through the whitelist.
    """
    candidate = url.strip( self._URL_EDGE_JUNK )
    for ignored in ( "\t", "\n", "\r" ):
      candidate = candidate.replace( ignored, "" )

    parsed = urlparse.urlparse( candidate )

    # Work-around a nasty bug. urlparse() caches parsed results and returns them on future calls,
    # and if the cache isn't cleared here, then a unicode string gets added to the cache, which
    # freaks out cherrypy when it independently calls urlparse() with the same URL later.
    urlparse.clear_cache()

    return parsed[0] in self.allowed_schemes

  def strip(self, rawstring):
    """Returns the argument stripped of potentially harmful HTML or JavaScript code"""
    self.reset()
    self.result = []
    # Forget tags left open by a previous call so that a reused cleaner
    # does not append stale closing tags to this document.
    self.open_tags = []
    self.feed(rawstring)

    # Close whatever is still open, innermost first.
    for endtag in self.open_tags:
      if endtag not in self.requires_no_close:
        self.result.append( "</%s>" % endtag )
    return "".join( self.result )

  def xtags(self):
    """Returns a printable string informing the user which tags are allowed"""
    pieces = []
    # sorted() rather than list.sort() so the cleaner's own configuration
    # is not reordered as a side effect of describing it.
    for tag in sorted( self.permitted_tags ):
      piece = "<" + tag
      for attribute in self.allowed_attributes.get( tag, [] ):
        piece += ' %s=""' % attribute
      pieces.append( piece + ">" )
    return xssescape( " ".join( pieces ) )
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`# originally from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942`

* Can now click on revision timestamps to open up the contents of previous note revisions with a small timestamp at the top. * Revisions can be opened either in the current page or in a new window/tab. * Added ability for a read-write notebook to contain read-only notes. This supports showing read-only revisions. * Fixed updatedb.py to properly load the anonymous user. * Updated initdb.py and updatedb.py to deal with new-style /notebooks/notebookid?note_id=noteid wiki links. * Made Persistent copy the revisions_list on each revision update so different revisions don't share lists. * Prevented Note from updating its revision twice upon construction. Now it's only updated once. * Work-around for nasty urlparse() caching bug related to unicode strings that cherrypy barfs on. * Added optional revision flag to various controller.Notebooks methods to allow opening of a notebook with a particular note revision displayed. 2007-07-31 22:53:57 +00:00			`import urlparse`
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`from htmllib import HTMLParser`
			`from cgi import escape`
			`from formatter import AbstractFormatter, NullWriter`
			`from htmlentitydefs import entitydefs`
			`from xml.sax.saxutils import quoteattr`

More docstrings. 2007-08-09 19:44:26 +00:00
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`def xssescape(text):`
More docstrings. 2007-08-09 19:44:26 +00:00			`"""Gets rid of < and > and & and, for good measure"""`
Html_cleaner no longer converts ":" to ":". 2007-07-19 19:23:59 +00:00			`return escape(text, quote=True)`
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00
More docstrings. 2007-08-09 19:44:26 +00:00
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`class Html_cleaner(HTMLParser):`
			`"""`
			`Cleans HTML of any tags not matching a whitelist.`
			`"""`
			`def __init__( self ):`
			`HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )`
			`self.result = []`
			`self.open_tags = []`
			`# A list of the only tags allowed. Be careful adding to this. Adding`
			`# "script," for example, would not be smart. 'img' is out by default`
			`# because of the danger of IMG embedded commands, and/or web bugs.`
			`self.permitted_tags = [`
			`'a',`
			`'b',`
			`'br',`
			`'em',`
			`'h3',`
			`'i',`
			`'li',`
			`'ol',`
			`'ul',`
			`'p',`
			`'strong',`
Added underline button to toolbar. Useful for things like book titles. 2007-08-27 21:01:42 +00:00			`'u',`
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`]`

			`# A list of tags that are forcibly removed from the input. Tags that`
			`# are not in permitted_tags and not in stripped_tags are simply`
			`# escaped.`
			`self.stripped_tags = [`
			`'span',`
			`]`

			`# A list of tags that require no closing tag.`
			`self.requires_no_close = [ 'img', 'br' ]`

			`# A dictionary showing the only attributes allowed for particular tags.`
			`# If a tag is not listed here, it is allowed no attributes. Adding`
			`# "on" tags, like "onhover," would not be smart. Also be very careful`
			`# of "background" and "style."`
			`self.allowed_attributes = {`
Got external links working! * Altered Html_cleaner to allow link targets. * Modified Wiki.load_editor(), Wiki.resolve_link(), and Link_pulldown() to support external links with "_new" targets. * Modified Editor.mouse_clicked() to explicitly open a new window for external links clicked in read-write editors, because the browser won't open clicked link by itself unless they're in read-only iframes. * Removed a duplicate Editor.contents() function. * Increased Link_pulldown title field size to 30 characters. 2007-08-16 22:27:58 +00:00			`'a': [ 'href', 'target' ],`
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`}`

			`# The only schemes allowed in URLs (for href and src attributes).`
			`# Adding "javascript" or "vbscript" to this list would not be smart.`
			`self.allowed_schemes = ['http','https','ftp', '']`

			`def handle_data(self, data):`
			`if data:`
			`self.result.append( xssescape(data) )`

			`def handle_charref(self, ref):`
			`if len(ref) < 7 and ref.isdigit():`
			`self.result.append( '&#%s;' % ref )`
			`else:`
			`self.result.append( xssescape('&#%s' % ref) )`

			`def handle_entityref(self, ref):`
			`if ref in entitydefs:`
			`self.result.append( '&%s;' % ref )`
			`else:`
			`self.result.append( xssescape('&%s' % ref) )`

			`def handle_comment(self, comment):`
			`if comment:`
			`self.result.append( xssescape("<!--%s-->" % comment) )`

			`def handle_starttag(self, tag, method, attrs):`
			`if tag not in self.permitted_tags:`
			`if tag not in self.stripped_tags:`
			`self.result.append( xssescape("<%s>" % tag) )`
			`else:`
			`bt = "<" + tag`
			`if tag in self.allowed_attributes:`
			`attrs = dict(attrs)`
			`self.allowed_attributes_here = \`
			`[x for x in self.allowed_attributes[tag] if x in attrs \`
			`and len(attrs[x]) > 0]`
			`for attribute in self.allowed_attributes_here:`
			`if attribute in ['href', 'src', 'background']:`
			`if self.url_is_acceptable(attrs[attribute]):`
			`bt += ' %s="%s"' % (attribute, attrs[attribute])`
			`else:`
			`bt += ' %s=%s' % \`
			`(xssescape(attribute), quoteattr(attrs[attribute]))`
			`if bt == "<a" or bt == "<img":`
			`return`
			`if tag in self.requires_no_close:`
			`bt += "/"`
			`bt += ">"`
			`self.result.append( bt )`
			`self.open_tags.insert(0, tag)`

			`def handle_endtag(self, tag, attrs):`
			`bracketed = "</%s>" % tag`
			`if tag not in self.permitted_tags:`
			`if tag not in self.stripped_tags:`
			`self.result.append( xssescape(bracketed) )`
			`elif tag in self.open_tags:`
			`self.result.append( bracketed )`
			`self.open_tags.remove(tag)`

			`def unknown_starttag(self, tag, attributes):`
			`self.handle_starttag(tag, None, attributes)`

			`def unknown_endtag(self, tag):`
			`self.handle_endtag(tag, None)`

			`def url_is_acceptable(self,url):`
* Can now click on revision timestamps to open up the contents of previous note revisions with a small timestamp at the top. * Revisions can be opened either in the current page or in a new window/tab. * Added ability for a read-write notebook to contain read-only notes. This supports showing read-only revisions. * Fixed updatedb.py to properly load the anonymous user. * Updated initdb.py and updatedb.py to deal with new-style /notebooks/notebookid?note_id=noteid wiki links. * Made Persistent copy the revisions_list on each revision update so different revisions don't share lists. * Prevented Note from updating its revision twice upon construction. Now it's only updated once. * Work-around for nasty urlparse() caching bug related to unicode strings that cherrypy barfs on. * Added optional revision flag to various controller.Notebooks methods to allow opening of a notebook with a particular note revision displayed. 2007-07-31 22:53:57 +00:00			`parsed = urlparse.urlparse(url)`

			`# Work-around a nasty bug. urlparse() caches parsed results and returns them on future calls,`
			`# and if the cache isn't cleared here, then a unicode string gets added to the cache, which`
			`# freaks out cherrypy when it independently calls urlparse() with the same URL later.`
			`urlparse.clear_cache()`

Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`return parsed[0] in self.allowed_schemes`

			`def strip(self, rawstring):`
Learned to spell JavaScript. 2007-09-10 19:43:51 +00:00			`"""Returns the argument stripped of potentially harmful HTML or JavaScript code"""`
Renaming repository to new name: luminotes 2007-07-16 20:22:38 +00:00			`self.reset()`
			`self.result = []`
			`self.feed(rawstring)`
			`for endtag in self.open_tags:`
			`if endtag not in self.requires_no_close:`
			`self.result.append( "</%s>" % endtag )`
			`return "".join( self.result )`

			`def xtags(self):`
			`"""Returns a printable string informing the user which tags are allowed"""`
			`self.permitted_tags.sort()`
			`tg = ""`
			`for x in self.permitted_tags:`
			`tg += "<" + x`
			`if x in self.allowed_attributes:`
			`for y in self.allowed_attributes[x]:`
			`tg += ' %s=""' % y`
			`tg += "> "`
			`return xssescape(tg.strip())`