Fixed a bug in which the character encoding of uploaded CSV files was not properly detected and used to decode the file.

Note that this fix introduces a new dependency: the Python Universal Encoding Detector (http://chardet.feedparser.org/), available in Debian GNU/Linux as the python-chardet package.
2008-08-11 22:53:01 -07:00 · 2008-08-11 22:53:01 -07:00 · c6dbeffc10
parent 5ef02e5c0c
commit c6dbeffc10
4 changed files with 22 additions and 4 deletions
--- a/3
+++ b/3
@@ -11,13 +11,14 @@ First, install the prerequisites:
 * simplejson 1.3
 * pytz 2006p
 * Python Imaging Library 1.1
+ * Python Universal Encoding Detector 1.0

 In Debian GNU/Linux, you can issue the following command to install these
 packages:

  apt-get install python2.4 python-cherrypy postgresql-8.1 \
          postgresql-contrib-8.1 python-psycopg2 python-simplejson \
-          python-tz python-imaging
+          python-tz python-imaging python-chardet


 database setup
--- a/6
+++ b/6
@@ -1,3 +1,9 @@
+1.4.25: August 11, 2008:
+ * Fixed a bug in which the character encoding of uploaded CSV files was not
+   properly detected and used to decode the file. Note that this fix
+   introduces a new dependency: http://chardet.feedparser.org/ found in the
+   python-chardet package.
+
 1.4.24: August 11, 2008:
 * Added a light gray line under note title text to make it clearer that it's
   a title as opposed to just bold text.
--- a/controller/Files.py
+++ b/controller/Files.py
@@ -8,6 +8,7 @@ import cherrypy
 from PIL import Image
 from cStringIO import StringIO
 from threading import Lock, Event
+from chardet.universaldetector import UniversalDetector
 from Expose import expose
 from Validate import validate, Valid_int, Valid_bool, Validation_error
 from Database import Valid_id, end_transaction
@@ -786,7 +787,7 @@ class Files( object ):
    @return: rows of data from the parsed file. each row is a list of elements
    @raise Parse_error: there was an error in parsing the given file
    """
-    APPROX_SNIFF_SAMPLE_SIZE_BYTES = 1024 * 1024
+    APPROX_SNIFF_SAMPLE_SIZE_BYTES = 1024 * 50

    try:
      import csv
@@ -800,6 +801,15 @@ class Files( object ):

      has_header = sniffer.has_header( sniff_sample )

+      # attempt to determine the file's character encoding
+      detector = UniversalDetector()
+      for line in lines:
+        detector.feed( line )
+        if detector.done: break
+
+      detector.close()
+      encoding = detector.result.get( "encoding" )
+
      table_file.seek( 0 )
      reader = csv.reader( table_file )

@@ -820,7 +830,7 @@ class Files( object ):
        else:
          expected_row_length = current_row_length

-        yield row
+        yield [ element.decode( encoding ) for element in row ]
    except ( csv.Error, IOError, TypeError ):
      raise Parse_error()

--- a/static/js/Wiki.js
+++ b/static/js/Wiki.js
@@ -2247,7 +2247,8 @@ Wiki.prototype.display_import_notebook = function ( result ) {

  var div = createDOM( "div", {},
    createDOM( "p", {}, "Almost done. I just need a little information about your file before I can complete the import and create a new notebook." ),
-    form
+    form,
+    createDOM( "p", {}, "Once you begin the import, it may take several seconds to complete." )
  );
  
  this.create_editor( "import", "<h3>import a notebook</h3>" + div.innerHTML, undefined, undefined, undefined, false, true, true, undefined );