From fc3849b8bee5bb513333dec83482d2cfbb9e6408 Mon Sep 17 00:00:00 2001 From: Dan Helfman Date: Fri, 21 Nov 2008 14:07:23 -0800 Subject: [PATCH] HTML entities/characters like "&quot;" are now stripped from notebook names before turning them into friendly ids. This means that blog posts now have better URLs if the post titles contain HTML entities/characters. --- model/Notebook.py | 8 ++++++-- model/delta/1.5.8.sql | 15 +++++++++++++++ model/schema.sql | 13 ++++++++++++- model/test/Test_notebook.py | 8 ++++++++ 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 model/delta/1.5.8.sql diff --git a/model/Notebook.py b/model/Notebook.py index 58b5a18..5b3839b 100644 --- a/model/Notebook.py +++ b/model/Notebook.py @@ -360,11 +360,15 @@ class Notebook( Persistent ): self.__name = name self.update_revision() + HTML_REFERENCE_PATTERN = re.compile( "&[a-zA-Z]+;|&#\d+;" ) FRIENDLY_ID_STRIP_PATTERN = re.compile( "[^a-zA-Z0-9\-]+" ) def __friendly_id( self ): - friendly_id = self.WHITESPACE_PATTERN.sub( u"-", self.__name.lower() ) - return self.FRIENDLY_ID_STRIP_PATTERN.sub( u"", friendly_id ) + # convert to lowercase, remove HTML character/entity refs, collapse whitespace to dashes, strip + # other punctuation. 
strip leading/trailing dashes + friendly_id = self.HTML_REFERENCE_PATTERN.sub( u" ", self.__name.lower() ) + friendly_id = self.WHITESPACE_PATTERN.sub( u"-", friendly_id ) + return self.FRIENDLY_ID_STRIP_PATTERN.sub( u"", friendly_id ).strip( "-" ) def __set_read_write( self, read_write ): # The read_write member isn't actually saved to the database, so setting it doesn't need to diff --git a/model/delta/1.5.8.sql b/model/delta/1.5.8.sql new file mode 100644 index 0000000..a0de041 --- /dev/null +++ b/model/delta/1.5.8.sql @@ -0,0 +1,15 @@ +CREATE OR REPLACE FUNCTION friendly_id(text) RETURNS text + AS $_$select trim( both '-' from + regexp_replace( + regexp_replace( + regexp_replace( + lower( $1 ), + '&[a-zA-Z]+;|&#\\d+;', ' ', 'g' + ), + '\\s+', '-', 'g' + ), + '[^a-zA-Z0-9\\-]', '', 'g' + ) + );$_$ + LANGUAGE sql IMMUTABLE; +reindex index notebook_friendly_id_index; diff --git a/model/schema.sql b/model/schema.sql index 2e7543a..8072ab9 100644 --- a/model/schema.sql +++ b/model/schema.sql @@ -26,7 +26,18 @@ create function log_note_revision() returns trigger as $_$ $_$ language plpgsql; ALTER FUNCTION public.log_note_revision() OWNER TO luminotes; CREATE FUNCTION friendly_id(text) RETURNS text - AS $_$select regexp_replace( regexp_replace( lower( $1 ), '\\s+', '-', 'g' ), '[^a-zA-Z0-9\\-]', '', 'g' );$_$ + AS $_$select trim( both '-' from + regexp_replace( + regexp_replace( + regexp_replace( + lower( $1 ), + '&[a-zA-Z]+;|&#\\d+;', ' ', 'g' + ), + '\\s+', '-', 'g' + ), + '[^a-zA-Z0-9\\-]', '', 'g' + ) + );$_$ LANGUAGE sql IMMUTABLE; ALTER FUNCTION public.friendly_id(text) OWNER TO luminotes; CREATE TABLE file ( diff --git a/model/test/Test_notebook.py b/model/test/Test_notebook.py index 994160a..1cbb076 100644 --- a/model/test/Test_notebook.py +++ b/model/test/Test_notebook.py @@ -177,6 +177,14 @@ class Test_notebook( object ): self.notebook.name = u"This is Bob's notebook!" 
assert self.notebook.friendly_id == u"this-is-bobs-notebook" + def test_friendly_id_with_html_entity_reference( self ): + self.notebook.name = u"This is Bob's &quot;notebook&quot;!" + assert self.notebook.friendly_id == u"this-is-bobs-notebook" + + def test_friendly_id_with_html_character_reference( self ): + self.notebook.name = u"This is Bob's &#165; notebook!" + assert self.notebook.friendly_id == u"this-is-bobs-notebook" + def test_set_read_write( self ): original_revision = self.notebook.revision self.notebook.read_write = Notebook.READ_WRITE_FOR_OWN_NOTES