From 8e0f7838c195fa14416ce98097b3e5d2006c2078 Mon Sep 17 00:00:00 2001
From: Andy Wilson <atwilso@sandia.gov>
Date: Wed, 13 Oct 2010 17:38:00 -0600
Subject: [PATCH] Add US-ASCII-WITH-FALLBACK for files that are almost entirely
 ASCII

The US-ASCII reader for vtkDelimitedTextReader throws an exception as
soon as it encounters a single character with its 8th bit set.  This
leaves us with no way to read documents from non-UTF code pages that
contain internationalized characters in the text.

I added a new "character set" US-ASCII-WITH-FALLBACK that lets us
read those anyway.  The reader has a new ivar ReplacementCharacter
that will be inserted in place of any characters beyond 0x7f (the
same ones that cause US-ASCII to choke).
---
 Infovis/vtkDelimitedTextReader.cxx | 24 +++++++++++++++-----
 Infovis/vtkDelimitedTextReader.h   | 35 +++++++++++++++++++++++-------
 2 files changed, 46 insertions(+), 13 deletions(-)

diff --git a/Infovis/vtkDelimitedTextReader.cxx b/Infovis/vtkDelimitedTextReader.cxx
index f6dc3119a61..a1dd1b77b1b 100644
--- a/Infovis/vtkDelimitedTextReader.cxx
+++ b/Infovis/vtkDelimitedTextReader.cxx
@@ -356,13 +356,22 @@ private:
 // ascii_to_unicode
 
 template<typename OctetIteratorT, typename OutputIteratorT>
-void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output)
+void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output, bool replace_8bit_characters, vtkTypeUInt32 fallback_character = 'x')
 {
   while(begin != end)
     {
-    const vtkTypeUInt32 code_point = *begin++;
+    vtkTypeUInt32 code_point = *begin++;
     if(code_point > 0x7f)
-      throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII.");
+      {
+      if (replace_8bit_characters)
+        {
+        code_point = fallback_character;
+        }
+      else
+        {
+        throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII.");
+        }
+      }
 
     *output++ = code_point;
     }
@@ -428,7 +437,8 @@ vtkDelimitedTextReader::vtkDelimitedTextReader() :
   UnicodeStringDelimiters(vtkUnicodeString::from_utf8("\"")),
   UnicodeWhitespace(vtkUnicodeString::from_utf8(" \t\r\n\v\f")),
   UnicodeEscapeCharacter(vtkUnicodeString::from_utf8("\\")),
-  HaveHeaders(false)
+  HaveHeaders(false),
+  ReplacementCharacter('x')
 {
   this->SetNumberOfInputPorts(0);
   this->SetNumberOfOutputPorts(1);
@@ -633,7 +643,11 @@ int vtkDelimitedTextReader::RequestData(
 
     if("US-ASCII" == character_set || character_set.empty())
       {
-      ascii_to_unicode(content.begin(), content.end(), iterator);
+      ascii_to_unicode(content.begin(), content.end(), iterator, false, 'x');
+      }
+    else if("US-ASCII-WITH-FALLBACK" == character_set)
+      {
+      ascii_to_unicode(content.begin(), content.end(), iterator, true, this->ReplacementCharacter);
       }
     else if("UTF-8" == character_set)
       {
diff --git a/Infovis/vtkDelimitedTextReader.h b/Infovis/vtkDelimitedTextReader.h
index c07a6164f21..9fe3e44d95a 100644
--- a/Infovis/vtkDelimitedTextReader.h
+++ b/Infovis/vtkDelimitedTextReader.h
@@ -32,14 +32,25 @@
 // vtkDelimitedTextReader is an interface for pulling in data from a
 // flat, delimited ascii or unicode text file (delimiter can be any character).
 //
-// The behavior of the reader with respect to ascii or unicode input is controlled
-// by the SetUnicodeCharacterSet() method.  By default (without calling SetUnicodeCharacterSet()),
-// the reader will expect to read ascii text and will output vtkStdString columns.  Use
-// the Set and Get methods to set delimiters that do not contain UTF8 in the name when operating
-// the reader in default ascii mode.  If the SetUnicodeCharacterSet() method is called, the reader
-// will output vtkUnicodeString columns in the output table.  In addition, it is necessary to use
-// the Set and Get methods that contain UTF8 in the name to specify delimiters when operating in
-// unicode mode.
+// The behavior of the reader with respect to ascii or unicode input
+// is controlled by the SetUnicodeCharacterSet() method.  By default
+// (without calling SetUnicodeCharacterSet()), the reader will expect
+// to read ascii text and will output vtkStdString columns.  Use the
+// Set and Get methods to set delimiters that do not contain UTF8 in
+// the name when operating the reader in default ascii mode.  If the
+// SetUnicodeCharacterSet() method is called, the reader will output
+// vtkUnicodeString columns in the output table.  In addition, it is
+// necessary to use the Set and Get methods that contain UTF8 in the
+// name to specify delimiters when operating in unicode mode.
+//
+// There is also a special character set US-ASCII-WITH-FALLBACK that
+// will treat the input text as ASCII no matter what.  If and when it
+// encounters a character with its 8th bit set it will replace that
+// character with the code point ReplacementCharacter.  You may use
+// this if you have text that belongs to a code page like LATIN9 or
+// ISO-8859-1 or friends: mostly ASCII but not entirely.  Eventually
+// this class will acquire the ability to gracefully read text from
+// any code page, making this option obsolete.
 //
 // This class emits ProgressEvent for every 100 lines it reads.
 //
@@ -175,6 +186,13 @@ public:
   // after calling Update().
   vtkStdString GetLastError();
 
+  // Description:
+  // Fallback character for use in the US-ASCII-WITH-FALLBACK
+  // character set.  Any characters that have their 8th bit set will
+  // be replaced with this code point.  Defaults to 'x'.
+  vtkSetMacro(ReplacementCharacter, vtkTypeUInt32);
+  vtkGetMacro(ReplacementCharacter, vtkTypeUInt32);
+
 //BTX
 protected:
   vtkDelimitedTextReader();
@@ -204,6 +222,7 @@ protected:
   bool GeneratePedigreeIds;
   bool OutputPedigreeIds;
   vtkStdString LastError;
+  vtkTypeUInt32 ReplacementCharacter;
 
 private:
   vtkDelimitedTextReader(const vtkDelimitedTextReader&); // Not implemented
-- 
GitLab