From 24a7205d3aa63fa0be73ba1770a3e325d9feecfa Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Thu, 19 Jul 2007 19:03:40 +0000 Subject: [PATCH] Html_nuker now converts some character/entity refs to their ascii equivalents, which allows searching for things that contain non-alphanumeric characters encoded as char/entity refs. --- controller/Html_nuker.py | 13 ++++++++++++- controller/test/Test_notebooks.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/controller/Html_nuker.py b/controller/Html_nuker.py index 5b3677e..225f3e9 100644 --- a/controller/Html_nuker.py +++ b/controller/Html_nuker.py @@ -4,7 +4,7 @@ from formatter import AbstractFormatter, NullWriter class Html_nuker( HTMLParser ): """ - Nukes HTML of all tags. + Nukes HTML of all tags, and optionally all entity/characters references. """ def __init__( self, allow_refs = False ): HTMLParser.__init__( self, AbstractFormatter( NullWriter() ) ) @@ -16,12 +16,23 @@ class Html_nuker( HTMLParser ): self.result.append( data ) def handle_charref( self, ref ): + ref = int( ref ) if self.allow_refs: self.result.append( "&#%s;" % ref ) + # convert ascii references to their character equivalents + elif ref >= 32 and ref < 128: + self.result.append( chr( ref ) ) def handle_entityref( self, ref ): if self.allow_refs: self.result.append( "&%s;" % ref ) + else: + self.result.append( { + "amp": "&", + "lt": "<", + "gt": ">", + "quot": '"', + }.get ( ref ) ) def handle_comment( self, comment ): pass diff --git a/controller/test/Test_notebooks.py b/controller/test/Test_notebooks.py index b70a2fb..c2859f9 100644 --- a/controller/test/Test_notebooks.py +++ b/controller/test/Test_notebooks.py @@ -635,6 +635,24 @@ class Test_notebooks( Test_controller ): assert len( notes ) == 0 + def test_search_character_refs( self ): + self.login() + + note3 = Note( "55", u"<h3>foo&#58; bar</h3>baz" ) + self.notebook.add_note( note3 ) + + search_text = "oo: b" + + result = self.http_post( "/notebooks/search/", dict( + notebook_id = self.notebook.object_id, + search_text = search_text, + ), session_id = self.session_id ) + + notes = result.get( "notes" ) + + assert len( notes ) == 1 + assert notes[ 0 ].object_id == note3.object_id + def test_recent_notes( self ): self.login()