Add US-ASCII-WITH-FALLBACK for files that are almost entirely ASCII

The US-ASCII reader for vtkDelimitedTextReader fails with an exception if it encounters even one character with its 8th bit set (that is, any byte above 0x7f). This leaves us with no way to read documents from non-UTF code pages that contain internationalized characters. I added a new "character set", US-ASCII-WITH-FALLBACK, that lets us read those documents anyway. The reader has a new ivar, ReplacementCharacter, which is inserted in place of any character beyond 0x7f (the same ones that cause US-ASCII to choke).

Add US-ASCII-WITH-FALLBACK for files that are almost entirely ASCII
8e0f7838 · Andy Wilson · 4be10e17 · 8e0f7838 · 8e0f7838
Commit 8e0f7838 authored 14 years ago by Andy Wilson
--- a/Infovis/vtkDelimitedTextReader.cxx
+++ b/Infovis/vtkDelimitedTextReader.cxx
@@ -356,13 +356,22 @@ private:
 // ascii_to_unicode

 template<typename OctetIteratorT, typename OutputIteratorT>
-void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output)
+void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output, bool replace_8bit_characters, vtkTypeUInt32 fallback_character = 'x')
 {
  while(begin != end)
    {
-    const vtkTypeUInt32 code_point = *begin++;
+    vtkTypeUInt32 code_point = *begin++;
    if(code_point > 0x7f)
-      throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII.");
+      {
+      if (replace_8bit_characters)
+        {
+        code_point = fallback_character;
+        }
+      else
+        {
+        throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII.");
+        }
+      }

    *output++ = code_point;
    }
@@ -428,7 +437,8 @@ vtkDelimitedTextReader::vtkDelimitedTextReader() :
  UnicodeStringDelimiters(vtkUnicodeString::from_utf8("\"")),
  UnicodeWhitespace(vtkUnicodeString::from_utf8(" \t\r\n\v\f")),
  UnicodeEscapeCharacter(vtkUnicodeString::from_utf8("\\")),
-  HaveHeaders(false)
+  HaveHeaders(false),
+  ReplacementCharacter('x')
 {
  this->SetNumberOfInputPorts(0);
  this->SetNumberOfOutputPorts(1);
@@ -633,7 +643,11 @@ int vtkDelimitedTextReader::RequestData(

    if("US-ASCII" == character_set || character_set.empty())
      {
-      ascii_to_unicode(content.begin(), content.end(), iterator);
+      ascii_to_unicode(content.begin(), content.end(), iterator, false, 'x');
+      }
+    else if("US-ASCII-WITH-FALLBACK" == character_set)
+      {
+      ascii_to_unicode(content.begin(), content.end(), iterator, true, this->ReplacementCharacter);
      }
    else if("UTF-8" == character_set)
      {

--- a/Infovis/vtkDelimitedTextReader.h
+++ b/Infovis/vtkDelimitedTextReader.h
@@ -32,14 +32,25 @@
 // vtkDelimitedTextReader is an interface for pulling in data from a
 // flat, delimited ascii or unicode text file (delimiter can be any character).
 //
-// The behavior of the reader with respect to ascii or unicode input is controlled
-// by the SetUnicodeCharacterSet() method.  By default (without calling SetUnicodeCharacterSet()),
-// the reader will expect to read ascii text and will output vtkStdString columns.  Use
-// the Set and Get methods to set delimiters that do not contain UTF8 in the name when operating
-// the reader in default ascii mode.  If the SetUnicodeCharacterSet() method is called, the reader
-// will output vtkUnicodeString columns in the output table.  In addition, it is necessary to use
-// the Set and Get methods that contain UTF8 in the name to specify delimiters when operating in
-// unicode mode.
+// The behavior of the reader with respect to ascii or unicode input
+// is controlled by the SetUnicodeCharacterSet() method.  By default
+// (without calling SetUnicodeCharacterSet()), the reader will expect
+// to read ascii text and will output vtkStdString columns.  Use the
+// Set and Get methods to set delimiters that do not contain UTF8 in
+// the name when operating the reader in default ascii mode.  If the
+// SetUnicodeCharacterSet() method is called, the reader will output
+// vtkUnicodeString columns in the output table.  In addition, it is
+// necessary to use the Set and Get methods that contain UTF8 in the
+// name to specify delimiters when operating in unicode mode.
+//
+// There is also a special character set US-ASCII-WITH-FALLBACK that
+// will treat the input text as ASCII no matter what.  If and when it
+// encounters a character with its 8th bit set it will replace that
+// character with the code point ReplacementCharacter.  You may use
+// this if you have text that belongs to a code page like LATIN9 or
+// ISO-8859-1 or friends: mostly ASCII but not entirely.  Eventually
+// this class will acquire the ability to gracefully read text from
+// any code page, making this option obsolete.
 //
 // This class emits ProgressEvent for every 100 lines it reads.
 //
@@ -175,6 +186,13 @@ public:
  // after calling Update().
  vtkStdString GetLastError();

+  // Description:
+  // Fallback character for use in the US-ASCII-WITH-FALLBACK
+  // character set.  Any characters that have their 8th bit set will
+  // be replaced with this code point.  Defaults to 'x'.
+  vtkSetMacro(ReplacementCharacter, vtkTypeUInt32);
+  vtkGetMacro(ReplacementCharacter, vtkTypeUInt32);
+
 //BTX
 protected:
  vtkDelimitedTextReader();
@@ -204,6 +222,7 @@ protected:
  bool GeneratePedigreeIds;
  bool OutputPedigreeIds;
  vtkStdString LastError;
+  vtkTypeUInt32 ReplacementCharacter;

 private:
  vtkDelimitedTextReader(const vtkDelimitedTextReader&); // Not implemented