Personal wiki notebook (not under development)
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

Html_cleaner.py 6.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254
  1. # originally from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496942
  2. import re
  3. import urlparse
  4. from htmllib import HTMLParser
  5. from cgi import escape
  6. from formatter import AbstractFormatter, NullWriter
  7. from htmlentitydefs import entitydefs
  8. from xml.sax.saxutils import quoteattr
  9. def xssescape(text):
  10. """Gets rid of < and > and & and, for good measure"""
  11. return escape(text, quote=True)
  12. class Html_cleaner(HTMLParser):
  13. """
  14. Cleans HTML of any tags not matching a whitelist.
  15. """
  16. NOTE_LINK_URL_PATTERN = re.compile( '[^"]*/notebooks/\w+\?[^"]*note_id=\w+', re.IGNORECASE )
  17. COLOR_RGB_PATTERN = re.compile( "^rgb(\s*\d{1,3}\s*,\s*\d{1,3}\s*,\s*\d{1,3}\s*)$" )
  18. COLOR_HEX_PATTERN = re.compile( "^#\d{6}$" )
  19. def __init__( self, require_link_target = False ):
  20. HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) )
  21. self.result = []
  22. self.open_tags = []
  23. # A list of the only tags allowed. Be careful adding to this. Adding
  24. # "script," for example, would not be smart. 'img' is out by default
  25. # because of the danger of IMG embedded commands, and/or web bugs.
  26. self.permitted_tags = [
  27. 'a',
  28. 'b',
  29. 'br',
  30. 'em',
  31. 'i',
  32. 'li',
  33. 'ol',
  34. 'ul',
  35. 'p',
  36. 'strong',
  37. 'u',
  38. 'div',
  39. 'h1',
  40. 'h2',
  41. 'h3',
  42. 'h4',
  43. 'h5',
  44. 'h6',
  45. 'blockquote',
  46. 'q',
  47. 'cite',
  48. 'code',
  49. 'samp',
  50. 'kbd',
  51. 'var',
  52. 'dfn',
  53. 'address',
  54. 'big',
  55. 'small',
  56. 'ins',
  57. 'del',
  58. 'acronym',
  59. 'abbr',
  60. 'strike',
  61. 's',
  62. 'sub',
  63. 'sup',
  64. 'tt',
  65. 'pre',
  66. 'center',
  67. 'font',
  68. 'basefont',
  69. 'multicol',
  70. 'spacer',
  71. 'layer',
  72. 'ilayer',
  73. 'nolayer',
  74. 'img',
  75. 'map',
  76. 'area',
  77. 'param',
  78. 'hr',
  79. 'nobr',
  80. 'wbr',
  81. 'ul',
  82. 'ol',
  83. 'li',
  84. 'dl',
  85. 'dt',
  86. 'dd',
  87. 'menu',
  88. 'dir',
  89. 'form',
  90. 'input',
  91. 'button',
  92. 'label',
  93. 'select',
  94. 'option',
  95. 'optgroup',
  96. 'textarea',
  97. 'fieldset',
  98. 'legend',
  99. 'table',
  100. 'tr',
  101. 'td',
  102. 'th',
  103. 'tbody',
  104. 'tfoot',
  105. 'thead',
  106. 'caption',
  107. 'col',
  108. 'colgroup',
  109. 'span',
  110. ]
  111. # A list of tags that require no closing tag.
  112. self.requires_no_close = [ 'img', 'br' ]
  113. # A dictionary showing the only attributes allowed for particular tags.
  114. # If a tag is not listed here, it is allowed no attributes. Adding
  115. # "on" tags, like "onhover," would not be smart. Also be very careful
  116. # of "background" and "style."
  117. self.allowed_attributes = {
  118. 'a': [ 'href', 'target', 'rel' ],
  119. 'p': [ 'align' ],
  120. 'img': [ 'src', 'alt', 'border', 'title', "class" ],
  121. 'table': [ 'cellpadding', 'cellspacing', 'border', 'width', 'height' ],
  122. 'font': [ 'size', 'face', 'color', 'style', 'class' ],
  123. 'span': [ 'style' ],
  124. 'h3': [ 'style' ],
  125. 'td': [ 'rowspan', 'colspan', 'width', 'height' ],
  126. 'th': [ 'rowspan', 'colspan', 'width', 'height' ],
  127. }
  128. # The only schemes allowed in URLs (for href and src attributes).
  129. # Adding "javascript" or "vbscript" to this list would not be smart.
  130. self.allowed_schemes = ['http','https','ftp', 'irc','mailto','']
  131. # Boolean indicating whether links need to have a target attribute.
  132. self.require_link_target = require_link_target
  133. def handle_data(self, data):
  134. if data:
  135. self.result.append( xssescape(data) )
  136. def handle_charref(self, ref):
  137. if len(ref) < 7 and ref.isdigit():
  138. self.result.append( '&#%s;' % ref )
  139. def handle_entityref(self, ref):
  140. if ref in entitydefs:
  141. self.result.append( '&%s;' % ref )
  142. def handle_comment(self, comment):
  143. pass # strip comments
  144. def handle_starttag(self, tag, method, attrs):
  145. if tag not in self.permitted_tags:
  146. return
  147. bt = "<" + tag
  148. if tag in self.allowed_attributes:
  149. attrs = dict(attrs)
  150. self.allowed_attributes_here = \
  151. [x for x in self.allowed_attributes[tag] if x in attrs \
  152. and len(attrs[x]) > 0]
  153. for attribute in self.allowed_attributes_here:
  154. if attribute in ['href', 'src', 'background']:
  155. if self.url_is_acceptable(attrs[attribute]):
  156. bt += ' %s="%s"' % (attribute, attrs[attribute])
  157. else:
  158. bt += ' %s=%s' % \
  159. (xssescape(attribute), quoteattr(attrs[attribute]))
  160. if attribute == 'style':
  161. if self.style_is_acceptable( attrs[ attribute ] ):
  162. bt += ' %s="%s"' % (attribute, attrs[attribute])
  163. else:
  164. bt += ' %s=%s' % \
  165. (xssescape(attribute), quoteattr(attrs[attribute]))
  166. if tag == "a" and \
  167. ( not attrs.get( 'href' ) or not self.NOTE_LINK_URL_PATTERN.search( attrs.get( 'href' ) ) ):
  168. if self.require_link_target and not attrs.get( 'target' ):
  169. bt += ' target="_new"'
  170. rel = attrs.get( 'rel' )
  171. if not rel or rel != "nofollow":
  172. bt += ' rel="nofollow"'
  173. if bt == "<a" or bt == "<img":
  174. return
  175. if tag in self.requires_no_close:
  176. bt += " /"
  177. bt += ">"
  178. self.result.append( bt )
  179. self.open_tags.insert(0, tag)
  180. def handle_endtag(self, tag, attrs):
  181. tag = tag.split( ":" )[ 0 ]
  182. bracketed = "</%s>" % tag
  183. if tag not in self.permitted_tags:
  184. return
  185. if tag in self.open_tags:
  186. self.result.append( bracketed )
  187. self.open_tags.remove(tag)
  188. def unknown_starttag(self, tag, attributes):
  189. self.handle_starttag(tag, None, attributes)
  190. def unknown_endtag(self, tag):
  191. self.handle_endtag(tag, None)
  192. def url_is_acceptable(self,url):
  193. parsed = urlparse.urlparse(url)
  194. # Work-around a nasty bug. urlparse() caches parsed results and returns them on future calls,
  195. # and if the cache isn't cleared here, then a unicode string gets added to the cache, which
  196. # freaks out cherrypy when it independently calls urlparse() with the same URL later.
  197. urlparse.clear_cache()
  198. return parsed[0] in self.allowed_schemes
  199. def style_is_acceptable(self, style):
  200. pieces = style.split( ";" )
  201. for piece in pieces:
  202. piece = piece.strip()
  203. if piece == "":
  204. continue
  205. param_and_value = piece.split( ":" )
  206. if len( param_and_value ) != 2:
  207. return False
  208. ( param, value ) = param_and_value
  209. value = value.strip()
  210. if param.strip().lower() not in ( "color", "background-color" ):
  211. return False
  212. if not self.COLOR_RGB_PATTERN.search( value ) and \
  213. not self.COLOR_HEX_PATTERN.search( value ):
  214. return False
  215. return True
  216. def strip(self, rawstring):
  217. """Returns the argument stripped of potentially harmful HTML or JavaScript code"""
  218. self.reset()
  219. self.result = []
  220. self.feed(rawstring)
  221. for endtag in self.open_tags:
  222. if endtag not in self.requires_no_close:
  223. self.result.append( "</%s>" % endtag )
  224. return "".join( self.result )