From 37e886f27c718afeb29b4ef0260b0fc931831ddd Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Fri, 2 Nov 2007 20:19:53 +0000 Subject: [PATCH] Revamped searching to use PostgreSQL's tsearch2 full-text searching. --- INSTALL | 3 ++- controller/Notebooks.py | 29 ++++++++------------- controller/test/Test_notebooks.py | 32 ++++++++++++----------- model/Note.py | 4 +-- model/Notebook.py | 42 +++++++++++++++++++++++-------- static/js/Wiki.js | 41 ++++++++---------------------- view/Search_form.py | 2 +- 7 files changed, 75 insertions(+), 78 deletions(-) diff --git a/INSTALL b/INSTALL index 733f506..ba1a007 100644 --- a/INSTALL +++ b/INSTALL @@ -14,7 +14,8 @@ First, install the prerequisites: In Debian GNU/Linux, you can issue the following command to install these packages: - apt-get install python2.4 python-cherrypy postgresql-8.1 python-psycopg2 python-simplejson python-tz + apt-get install python2.4 python-cherrypy postgresql-8.1 \ + postgresql-contrib-8.1 python-psycopg2 python-simplejson python-tz development mode diff --git a/controller/Notebooks.py b/controller/Notebooks.py index 233962c..adae9ad 100644 --- a/controller/Notebooks.py +++ b/controller/Notebooks.py @@ -576,14 +576,15 @@ class Notebooks( object ): @grab_user_id @validate( notebook_id = Valid_id(), - search_text = Valid_string( min = 0, max = 100 ), + search_text = unicode, user_id = Valid_id( none_okay = True ), ) def search( self, notebook_id, search_text, user_id ): """ Search the notes within a particular notebook for the given search text. Note that the search - is case-insensitive, and all HTML tags are ignored. The matching notes are returned with title - matches first, followed by all other matches. + is case-insensitive, and all HTML tags are ignored. Notes with title matches are generally + ranked higher than matches that are only in the note contents. The returned notes have their + normal contents replaced with summary contents with the search terms highlighted. @type notebook_id: unicode @param notebook_id: id of notebook to search @@ -595,6 +596,7 @@ class Notebooks( object ): @return: { 'notes': [ matching notes ] } @raise Access_error: the current user doesn't have access to the given notebook @raise Validation_error: one of the arguments is invalid + @raise Search_error: the provided search_text is invalid """ if not self.__users.check_access( user_id, notebook_id ): raise Access_error() @@ -604,26 +606,17 @@ class Notebooks( object ): if not notebook: raise Access_error() - search_text = search_text.lower() - if len( search_text ) == 0: - return dict( notes = [] ) + MAX_SEARCH_TEXT_LENGTH = 256 + if len( search_text ) > MAX_SEARCH_TEXT_LENGTH: + raise Validation_error( u"search_text", None, unicode, message = u"is too long" ) - title_matches = [] - content_matches = [] - nuker = Html_nuker() + if len( search_text ) == 0: + raise Validation_error( u"search_text", None, unicode, message = u"is missing" ) notes = self.__database.select_many( Note, notebook.sql_search_notes( search_text ) ) - # further narrow the search results by making sure notes still match after all HTML tags are - # stripped out - for note in notes: - if search_text in nuker.nuke( note.title ).lower(): - title_matches.append( note ) - elif search_text in nuker.nuke( note.contents ).lower(): - content_matches.append( note ) - return dict( - notes = title_matches + content_matches, + notes = notes, ) @expose( view = Json ) diff --git a/controller/test/Test_notebooks.py b/controller/test/Test_notebooks.py index 3453610..76a40eb 100644 --- a/controller/test/Test_notebooks.py +++ b/controller/test/Test_notebooks.py @@ -1475,7 +1475,23 @@ class Test_notebooks( Test_controller ): notes = result.get( "notes" ) - assert len( notes ) == 0 + assert result[ "error" ] + assert u"missing" in result[ "error" ] + + def test_long_search( self ): + self.login() + + search_text = "w" * 257 + + result = self.http_post( "/notebooks/search/", dict( + notebook_id = self.notebook.object_id, + search_text = search_text, + ), session_id = self.session_id ) + + notes = result.get( "notes" ) + + assert result[ "error" ] + assert u"too long" in result[ "error" ] def test_search_with_no_results( self ): self.login() @@ -1512,20 +1528,6 @@ class Test_notebooks( Test_controller ): assert notes[ 0 ].object_id == note3.object_id assert notes[ 1 ].object_id == self.note.object_id - def test_search_html_tags( self ): - self.login() - - search_text = "h3" - - result = self.http_post( "/notebooks/search/", dict( - notebook_id = self.notebook.object_id, - search_text = search_text, - ), session_id = self.session_id ) - - notes = result.get( "notes" ) - - assert len( notes ) == 0 - def test_search_character_refs( self ): self.login() diff --git a/model/Note.py b/model/Note.py index 0d7d495..26789ca 100644 --- a/model/Note.py +++ b/model/Note.py @@ -105,9 +105,9 @@ class Note( Persistent ): @staticmethod def sql_load( object_id, revision = None ): if revision: - return "select * from note where id = %s and revision = %s;" % ( quote( object_id ), quote( revision ) ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note where id = %s and revision = %s;" % ( quote( object_id ), quote( revision ) ) - return "select * from note_current where id = %s;" % quote( object_id ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note_current where id = %s;" % quote( object_id ) @staticmethod def sql_id_exists( object_id, revision = None ): diff --git a/model/Notebook.py b/model/Notebook.py index 287800e..237db69 100644 --- a/model/Notebook.py +++ b/model/Notebook.py @@ -1,3 +1,4 @@ +import re from copy import copy from Note import Note from Persistent import Persistent, quote @@ -7,6 +8,10 @@ class Notebook( Persistent ): """ A collection of wiki notes. """ + + WHITESPACE_PATTERN = re.compile( r"\s+" ) + SEARCH_OPERATORS = re.compile( r"[&|!()]" ) + def __init__( self, object_id, revision = None, name = None, trash_id = None, read_write = True ): """ Create a new notebook with the given id and name. @@ -78,19 +83,19 @@ class Notebook( Persistent ): """ Return a SQL string to load a list of all the notes within this notebook. """ - return "select * from note_current where notebook_id = %s order by revision desc;" % quote( self.object_id ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note_current where notebook_id = %s order by revision desc;" % quote( self.object_id ) def sql_load_non_startup_notes( self ): """ Return a SQL string to load a list of the non-startup notes within this notebook. """ - return "select * from note_current where notebook_id = %s and startup = 'f' order by revision desc;" % quote( self.object_id ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note_current where notebook_id = %s and startup = 'f' order by revision desc;" % quote( self.object_id ) def sql_load_startup_notes( self ): """ Return a SQL string to load a list of the startup notes within this notebook. """ - return "select * from note_current where notebook_id = %s and startup = 't' order by rank;" % quote( self.object_id ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note_current where notebook_id = %s and startup = 't' order by rank;" % quote( self.object_id ) def sql_load_recent_notes( self, start = 0, count = 10 ): """ @@ -104,7 +109,9 @@ class Notebook( Persistent ): return \ """ select - note_current.*, note_creation.revision as creation + note_current.id, note_current.revision, note_current.title, note_current.contents, + note_current.notebook_id, note_current.startup, note_current.deleted_from_id, + note_current.rank, note_creation.revision as creation from note_current, ( select id, min( revision ) as revision from note where notebook_id = %s group by id ) as note_creation @@ -120,7 +127,7 @@ class Notebook( Persistent ): @type note_id: unicode @param note_id: id of note to load """ - return "select * from note_current where notebook_id = %s and id = %s;" % ( quote( self.object_id ), quote( note_id ) ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note_current where notebook_id = %s and id = %s;" % ( quote( self.object_id ), quote( note_id ) ) def sql_load_note_by_title( self, title ): """ @@ -129,19 +136,34 @@ class Notebook( Persistent ): @type note_id: unicode @param note_id: title of note to load """ - return "select * from note_current where notebook_id = %s and title = %s;" % ( quote( self.object_id ), quote( title ) ) + return "select id, revision, title, contents, notebook_id, startup, deleted_from_id, rank from note_current where notebook_id = %s and title = %s;" % ( quote( self.object_id ), quote( title ) ) def sql_search_notes( self, search_text ): """ - Return a SQL string to search for notes whose contents contain the given search_text. This - is a case-insensitive search. + Return a SQL string to perform a full-text search for notes whose contents contain the given + search_text. This is a case-insensitive search. @type search_text: unicode @param search_text: text to search for within the notes """ + # strip out all search operators + search_text = self.SEARCH_OPERATORS.sub( u"", search_text ).strip() + + # join all words with boolean "and" operator + search_text = u"&".join( self.WHITESPACE_PATTERN.split( search_text ) ) + + print search_text return \ - "select * from note_current where notebook_id = %s and contents ilike %s;" % \ - ( quote( self.object_id ), quote( "%" + search_text + "%" ) ) + """ + select id, revision, title, headline( drop_html_tags( contents ), query ), notebook_id, startup, deleted_from_id from ( + select + id, revision, title, contents, notebook_id, startup, deleted_from_id, query, rank_cd( search, query ) as rank + from + note_current, to_tsquery( 'default', %s ) query + where + notebook_id = %s and query @@ search order by rank desc limit 20 + ) as sub; + """ % ( quote( search_text ), quote( self.object_id ) ) def sql_highest_rank( self ): """ diff --git a/static/js/Wiki.js b/static/js/Wiki.js index 51310a0..81615e1 100644 --- a/static/js/Wiki.js +++ b/static/js/Wiki.js @@ -989,25 +989,6 @@ Wiki.prototype.display_search_results = function ( result ) { return; } - // TODO: highlight the search term within the search results, idealy showing - // a section of the note contents including the search term - - // if there's only one search result, automatically feel lucky^Wfortunate - if ( result.notes.length == 1 ) { - var note = result.notes[ 0 ] - - // if the editor is already open, highlight it and bail - var iframe = getElement( "note_" + note.object_id ); - if ( iframe ) { - iframe.editor.highlight(); - return; - } - - // otherwise, create an editor for the one note - this.create_editor( note.object_id, note.contents, note.deleted_from_id, note.revision, undefined, this.notebook.read_write, true, true ); - return; - } - // otherwise, there are multiple search results, so create a "magic" search results note. but // first close any open search results notes if ( this.search_results_editor ) @@ -1018,26 +999,24 @@ Wiki.prototype.display_search_results = function ( result ) { var note = result.notes[ i ] if ( !note.title ) continue; - var contents_node = createDOM( "span", {} ); - contents_node.innerHTML = note.contents; - contents = strip( scrapeText( contents_node ) ); - - // remove the title from the scraped contents text - if ( contents.indexOf( note.title ) == 0 ) - contents = contents.substr( note.title.length ); - - if ( contents.length == 0 ) { + if ( note.contents.length == 0 ) { var preview = "empty note"; } else { - var max_preview_length = 160; - var preview = contents.substr( 0, max_preview_length ) + ( ( contents.length > max_preview_length ) ? "..." : "" ); + var preview = note.contents; + + // if the preview appears not to end with a complete sentence, add "..." + if ( !/[?!.]\s*$/.test( preview ) ) + preview = preview + " ..."; } + var preview_span = createDOM( "span" ); + preview_span.innerHTML = preview; + appendChildNodes( list, createDOM( "p", {}, createDOM( "a", { "href": "/notebooks/" + this.notebook_id + "?note_id=" + note.object_id }, note.title ), createDOM( "br" ), - createDOM( "span", {}, preview ) + preview_span ) ); } diff --git a/view/Search_form.py b/view/Search_form.py index 19212d5..b727346 100644 --- a/view/Search_form.py +++ b/view/Search_form.py @@ -8,6 +8,6 @@ class Search_form( Form ): Form.__init__( self, Strong( u"search: " ), - Input( type = u"text", name = u"search_text", id = u"search_text", size = 30, maxlength = 100 ), + Input( type = u"text", name = u"search_text", id = u"search_text", size = 30, maxlength = 512 ), id = u"search_form", )