From 8e0f7838c195fa14416ce98097b3e5d2006c2078 Mon Sep 17 00:00:00 2001 From: Andy Wilson <atwilso@sandia.gov> Date: Wed, 13 Oct 2010 17:38:00 -0600 Subject: [PATCH] Add US-ASCII-WITH-FALLBACK for files that are almost entirely ASCII The US-ASCII reader for vtkDelimitedTextReader will die if it encounters even one character with its 8th bit set. This leaves us with no way to read documents from non-UTF code pages that have internationalized characters in the text. I added a new "character set" US-ASCII-WITH-FALLBACK that lets us read those anyway. The reader has a new ivar ReplacementCharacter that will be inserted in place of any characters beyond 0x7f (the same ones that cause US-ASCII to choke). --- Infovis/vtkDelimitedTextReader.cxx | 24 +++++++++++++++----- Infovis/vtkDelimitedTextReader.h | 35 +++++++++++++++++++++++------- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/Infovis/vtkDelimitedTextReader.cxx b/Infovis/vtkDelimitedTextReader.cxx index f6dc3119a61..a1dd1b77b1b 100644 --- a/Infovis/vtkDelimitedTextReader.cxx +++ b/Infovis/vtkDelimitedTextReader.cxx @@ -356,13 +356,22 @@ private: // ascii_to_unicode template<typename OctetIteratorT, typename OutputIteratorT> -void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output) +void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output, bool replace_8bit_characters, vtkTypeUInt32 fallback_character = 'x') { while(begin != end) { - const vtkTypeUInt32 code_point = *begin++; + vtkTypeUInt32 code_point = *begin++; if(code_point > 0x7f) - throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII."); + { + if (replace_8bit_characters) + { + code_point = fallback_character; + } + else + { + throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII."); + } + } *output++ = code_point; } @@ -428,7 +437,8 @@ vtkDelimitedTextReader::vtkDelimitedTextReader() : 
UnicodeStringDelimiters(vtkUnicodeString::from_utf8("\"")), UnicodeWhitespace(vtkUnicodeString::from_utf8(" \t\r\n\v\f")), UnicodeEscapeCharacter(vtkUnicodeString::from_utf8("\\")), - HaveHeaders(false) + HaveHeaders(false), + ReplacementCharacter('x') { this->SetNumberOfInputPorts(0); this->SetNumberOfOutputPorts(1); @@ -633,7 +643,11 @@ int vtkDelimitedTextReader::RequestData( if("US-ASCII" == character_set || character_set.empty()) { - ascii_to_unicode(content.begin(), content.end(), iterator); + ascii_to_unicode(content.begin(), content.end(), iterator, false, 'x'); + } + else if("US-ASCII-WITH-FALLBACK" == character_set) + { + ascii_to_unicode(content.begin(), content.end(), iterator, true, this->ReplacementCharacter); } else if("UTF-8" == character_set) { diff --git a/Infovis/vtkDelimitedTextReader.h b/Infovis/vtkDelimitedTextReader.h index c07a6164f21..9fe3e44d95a 100644 --- a/Infovis/vtkDelimitedTextReader.h +++ b/Infovis/vtkDelimitedTextReader.h @@ -32,14 +32,25 @@ // vtkDelimitedTextReader is an interface for pulling in data from a // flat, delimited ascii or unicode text file (delimiter can be any character). // -// The behavior of the reader with respect to ascii or unicode input is controlled -// by the SetUnicodeCharacterSet() method. By default (without calling SetUnicodeCharacterSet()), -// the reader will expect to read ascii text and will output vtkStdString columns. Use -// the Set and Get methods to set delimiters that do not contain UTF8 in the name when operating -// the reader in default ascii mode. If the SetUnicodeCharacterSet() method is called, the reader -// will output vtkUnicodeString columns in the output table. In addition, it is necessary to use -// the Set and Get methods that contain UTF8 in the name to specify delimiters when operating in -// unicode mode. +// The behavior of the reader with respect to ascii or unicode input +// is controlled by the SetUnicodeCharacterSet() method. 
By default +// (without calling SetUnicodeCharacterSet()), the reader will expect +// to read ascii text and will output vtkStdString columns. Use the +// Set and Get methods to set delimiters that do not contain UTF8 in +// the name when operating the reader in default ascii mode. If the +// SetUnicodeCharacterSet() method is called, the reader will output +// vtkUnicodeString columns in the output table. In addition, it is +// necessary to use the Set and Get methods that contain UTF8 in the +// name to specify delimiters when operating in unicode mode. +// +// There is also a special character set US-ASCII-WITH-FALLBACK that +// will treat the input text as ASCII no matter what. If and when it +// encounters a character with its 8th bit set, it will replace that +// character with the code point ReplacementCharacter. You may use +// this if you have text that belongs to a code page like LATIN9 or +// ISO-8859-1 or friends: mostly ASCII but not entirely. Eventually +// this class will acquire the ability to gracefully read text from +// any code page, making this option obsolete. // // This class emits ProgressEvent for every 100 lines it reads. // @@ -175,6 +186,13 @@ public: // after calling Update(). vtkStdString GetLastError(); + // Description: + // Fallback character for use in the US-ASCII-WITH-FALLBACK + // character set. Any characters that have their 8th bit set will + // be replaced with this code point. Defaults to 'x'. + vtkSetMacro(ReplacementCharacter, vtkTypeUInt32); + vtkGetMacro(ReplacementCharacter, vtkTypeUInt32); + //BTX protected: vtkDelimitedTextReader(); @@ -204,6 +222,7 @@ protected: bool GeneratePedigreeIds; bool OutputPedigreeIds; vtkStdString LastError; + vtkTypeUInt32 ReplacementCharacter; private: vtkDelimitedTextReader(const vtkDelimitedTextReader&); // Not implemented -- GitLab