From 970bc77defe6b725559eb51e3ec0d2c154cd75ef Mon Sep 17 00:00:00 2001
From: Dan Helfman <dan@luminotes.com>
Date: Wed, 25 Feb 2009 19:12:14 -0800
Subject: [PATCH] Html_cleaner now strips out all unknown HTML tags instead of
 just escaping them.

---
 NEWS                              |   3 +
 controller/Html_cleaner.py        | 106 +++++++++---------------------
 controller/test/Test_notebooks.py |  84 +++++++++++++++++++++++
 3 files changed, 119 insertions(+), 74 deletions(-)

diff --git a/NEWS b/NEWS
index 9198310..8009a4d 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,9 @@
  * Changed the order of exported HTML and CSV notebooks so that after all the
    "startup" notes are included, the remaining notes are included in
    alphabetical order (instead of reverse chronological order).
+ * Instead of converting unsupported HTML tags to plain text when a note is
+   saved, Luminotes now simply strips out all unsupported tags. This further
+   improves copy and pasting text from programs like MS Word.
  * Fixed a compatibility problem with newer versions of SQLite. (Luminotes
    was using a reserved keyword as an identifier.) This only affected those
    people who installed Luminotes Server themselves.
diff --git a/controller/Html_cleaner.py b/controller/Html_cleaner.py
index 192fe76..b562083 100644
--- a/controller/Html_cleaner.py
+++ b/controller/Html_cleaner.py
@@ -112,31 +112,6 @@ class Html_cleaner(HTMLParser):
       'colgroup',
     ]
 
-    # A list of tags that are forcibly removed from the input. Tags that
-    # are not in permitted_tags and not in stripped_tags are simply
-    # escaped.
-    self.stripped_tags = [
-      'span',
-      'blink',
-      'marquee',
-      'bgsound',
-      'meta',
-      'object',
-      'iframe',
-      'script',
-      'noscript',
-      'applet',
-      'embed',
-      'style',
-      'link',
-      'html',
-      'title',
-      'head',
-      'body',
-      'o',
-      'm',
-    ]
-
     # A list of tags that require no closing tag.
     self.requires_no_close = [ 'img', 'br' ]
 
@@ -168,58 +143,53 @@ class Html_cleaner(HTMLParser):
   def handle_charref(self, ref):
     if len(ref) < 7 and ref.isdigit():
       self.result.append( '&#%s;' % ref )
-    else:
-      self.result.append( xssescape('&#%s' % ref) )
 
   def handle_entityref(self, ref):
     if ref in entitydefs:
       self.result.append( '&%s;' % ref )
-    else:
-      self.result.append( xssescape('&%s' % ref) )
 
   def handle_comment(self, comment):
     pass # strip comments
 
   def handle_starttag(self, tag, method, attrs):
     if tag not in self.permitted_tags:
-      if tag not in self.stripped_tags:
-        self.result.append( xssescape("<%s>" %  tag) )
-    else:
-      bt = "<" + tag
-      if tag in self.allowed_attributes:
-        attrs = dict(attrs)
-        self.allowed_attributes_here = \
-          [x for x in self.allowed_attributes[tag] if x in attrs \
-           and len(attrs[x]) > 0]
-        for attribute in self.allowed_attributes_here:
-          if attribute in ['href', 'src', 'background']:
-            if self.url_is_acceptable(attrs[attribute]):
-              bt += ' %s="%s"' % (attribute, attrs[attribute])
-          else:
-            bt += ' %s=%s' % \
-               (xssescape(attribute), quoteattr(attrs[attribute]))
-        if tag == "a" and \
-           ( not attrs.get( 'href' ) or not self.NOTE_LINK_URL_PATTERN.search( attrs.get( 'href' ) ) ):
-          if self.require_link_target and not attrs.get( 'target' ):
-            bt += ' target="_new"'
-          rel = attrs.get( 'rel' )
-          if not rel or rel != "nofollow":
-            bt += ' rel="nofollow"'
-      if bt == "<a" or bt == "<img":
-        return
-      if tag in self.requires_no_close:
-        bt += " /"
-      bt += ">"           
-      self.result.append( bt )
-      self.open_tags.insert(0, tag)
+      return
+
+    bt = "<" + tag
+    if tag in self.allowed_attributes:
+      attrs = dict(attrs)
+      self.allowed_attributes_here = \
+        [x for x in self.allowed_attributes[tag] if x in attrs \
+         and len(attrs[x]) > 0]
+      for attribute in self.allowed_attributes_here:
+        if attribute in ['href', 'src', 'background']:
+          if self.url_is_acceptable(attrs[attribute]):
+            bt += ' %s="%s"' % (attribute, attrs[attribute])
+        else:
+          bt += ' %s=%s' % \
+             (xssescape(attribute), quoteattr(attrs[attribute]))
+      if tag == "a" and \
+         ( not attrs.get( 'href' ) or not self.NOTE_LINK_URL_PATTERN.search( attrs.get( 'href' ) ) ):
+        if self.require_link_target and not attrs.get( 'target' ):
+          bt += ' target="_new"'
+        rel = attrs.get( 'rel' )
+        if not rel or rel != "nofollow":
+          bt += ' rel="nofollow"'
+    if bt == "<a" or bt == "<img":
+      return
+    if tag in self.requires_no_close:
+      bt += " /"
+    bt += ">"           
+    self.result.append( bt )
+    self.open_tags.insert(0, tag)
       
   def handle_endtag(self, tag, attrs):
     tag = tag.split( ":" )[ 0 ]
     bracketed = "</%s>" % tag
     if tag not in self.permitted_tags:
-      if tag not in self.stripped_tags:
-        self.result.append( xssescape(bracketed) )
-    elif tag in self.open_tags:
+      return
+
+    if tag in self.open_tags:
       self.result.append( bracketed )
       self.open_tags.remove(tag)
       
@@ -248,15 +218,3 @@ class Html_cleaner(HTMLParser):
       if endtag not in self.requires_no_close:
         self.result.append( "</%s>" % endtag )
     return "".join( self.result )
-
-  def xtags(self):
-    """Returns a printable string informing the user which tags are allowed"""
-    self.permitted_tags.sort()
-    tg = ""
-    for x in self.permitted_tags:
-      tg += "<" + x
-      if x in self.allowed_attributes:
-        for y in self.allowed_attributes[x]:
-          tg += ' %s=""' % y
-      tg += "> "
-    return xssescape(tg.strip())
diff --git a/controller/test/Test_notebooks.py b/controller/test/Test_notebooks.py
index 8958655..bc7c554 100644
--- a/controller/test/Test_notebooks.py
+++ b/controller/test/Test_notebooks.py
@@ -3154,6 +3154,48 @@ class Test_notebooks( Test_controller ):
     # before_position should be ignored for such notebooks
     self.test_save_new_note_in_notebook_with_read_write_for_own_notes( after_note_id, before_note_id )
 
+  def test_save_new_note_with_allowed_tags( self ):
+    self.login()
+
+    # save a completely new note
+    title_with_tags = u"<h3>my funny title</h3>"
+    body = u"<p>this is a <b>note</b></p>"
+    new_note = Note.create( "55", title_with_tags + body )
+    previous_revision = new_note.revision
+
+    result = self.http_post( "/notebooks/save_note/", dict(
+      notebook_id = self.notebook.object_id,
+      note_id = new_note.object_id,
+      contents = new_note.contents,
+      startup = False,
+      previous_revision = None,
+    ), session_id = self.session_id )
+
+    assert result[ "new_revision" ]
+    assert result[ "new_revision" ] != previous_revision
+    assert result[ "new_revision" ].user_id == self.user.object_id
+    assert result[ "new_revision" ].username == self.username
+    assert result[ "previous_revision" ] == None
+    user = self.database.load( User, self.user.object_id )
+    assert user.storage_bytes > 0
+    assert result[ "storage_bytes" ] == user.storage_bytes
+    assert result[ "rank" ] == 0.0
+
+    # make sure the new title is now loadable
+    result = self.http_post( "/notebooks/load_note_by_title/", dict(
+      notebook_id = self.notebook.object_id,
+      note_title = new_note.title,
+    ), session_id = self.session_id )
+
+    note = result[ "note" ]
+
+    expected_contents = title_with_tags + body
+
+    assert note.object_id == new_note.object_id
+    assert note.title == new_note.title
+    assert note.contents == expected_contents
+    assert note.user_id == self.user.object_id
+
   def test_save_new_note_with_disallowed_tags( self ):
     self.login()
 
@@ -3240,6 +3282,48 @@ class Test_notebooks( Test_controller ):
     assert note.contents == expected_contents
     assert note.user_id == self.user.object_id
 
+  def test_save_new_note_with_unknown_tags( self ):
+    self.login()
+
+    # save a completely new note
+    title_with_tags = u"<h3>my funny title</h3>"
+    junk = u"foo<whee>blah</whee>bar"
+    new_note = Note.create( "55", title_with_tags + junk )
+    previous_revision = new_note.revision
+
+    result = self.http_post( "/notebooks/save_note/", dict(
+      notebook_id = self.notebook.object_id,
+      note_id = new_note.object_id,
+      contents = new_note.contents,
+      startup = False,
+      previous_revision = None,
+    ), session_id = self.session_id )
+
+    assert result[ "new_revision" ]
+    assert result[ "new_revision" ] != previous_revision
+    assert result[ "new_revision" ].user_id == self.user.object_id
+    assert result[ "new_revision" ].username == self.username
+    assert result[ "previous_revision" ] == None
+    user = self.database.load( User, self.user.object_id )
+    assert user.storage_bytes > 0
+    assert result[ "storage_bytes" ] == user.storage_bytes
+    assert result[ "rank" ] == 0.0
+
+    # make sure the new title is now loadable
+    result = self.http_post( "/notebooks/load_note_by_title/", dict(
+      notebook_id = self.notebook.object_id,
+      note_title = new_note.title,
+    ), session_id = self.session_id )
+
+    note = result[ "note" ]
+
+    expected_contents = title_with_tags + u"fooblahbar"
+
+    assert note.object_id == new_note.object_id
+    assert note.title == new_note.title
+    assert note.contents == expected_contents
+    assert note.user_id == self.user.object_id
+
   def test_save_new_note_with_bad_characters( self ):
     self.login()