diff --git a/Infovis/vtkDelimitedTextReader.cxx b/Infovis/vtkDelimitedTextReader.cxx index f6dc3119a6115a0850578e78ef491b4910860a73..a1dd1b77b1bcf11d6e024f79ca51f7a8833ef7f7 100644 --- a/Infovis/vtkDelimitedTextReader.cxx +++ b/Infovis/vtkDelimitedTextReader.cxx @@ -356,13 +356,22 @@ private: // ascii_to_unicode template<typename OctetIteratorT, typename OutputIteratorT> -void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output) +void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output, bool replace_8bit_characters, vtkTypeUInt32 fallback_character = 'x') { while(begin != end) { - const vtkTypeUInt32 code_point = *begin++; + vtkTypeUInt32 code_point = *begin++; if(code_point > 0x7f) - throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII."); + { + if (replace_8bit_characters) + { + code_point = fallback_character; + } + else + { + throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII."); + } + } *output++ = code_point; } @@ -428,7 +437,8 @@ vtkDelimitedTextReader::vtkDelimitedTextReader() : UnicodeStringDelimiters(vtkUnicodeString::from_utf8("\"")), UnicodeWhitespace(vtkUnicodeString::from_utf8(" \t\r\n\v\f")), UnicodeEscapeCharacter(vtkUnicodeString::from_utf8("\\")), - HaveHeaders(false) + HaveHeaders(false), + ReplacementCharacter('x') { this->SetNumberOfInputPorts(0); this->SetNumberOfOutputPorts(1); @@ -633,7 +643,11 @@ int vtkDelimitedTextReader::RequestData( if("US-ASCII" == character_set || character_set.empty()) { - ascii_to_unicode(content.begin(), content.end(), iterator); + ascii_to_unicode(content.begin(), content.end(), iterator, false, 'x'); + } + else if("US-ASCII-WITH-FALLBACK" == character_set) + { + ascii_to_unicode(content.begin(), content.end(), iterator, true, this->ReplacementCharacter); } else if("UTF-8" == character_set) { diff --git a/Infovis/vtkDelimitedTextReader.h b/Infovis/vtkDelimitedTextReader.h index 
c07a6164f21a9152a4ed9ef0e4df5d4436df88f7..9fe3e44d95a7e032d1d001fbc580ceb40ed29763 100644 --- a/Infovis/vtkDelimitedTextReader.h +++ b/Infovis/vtkDelimitedTextReader.h @@ -32,14 +32,25 @@ // vtkDelimitedTextReader is an interface for pulling in data from a // flat, delimited ascii or unicode text file (delimiter can be any character). // -// The behavior of the reader with respect to ascii or unicode input is controlled -// by the SetUnicodeCharacterSet() method. By default (without calling SetUnicodeCharacterSet()), -// the reader will expect to read ascii text and will output vtkStdString columns. Use -// the Set and Get methods to set delimiters that do not contain UTF8 in the name when operating -// the reader in default ascii mode. If the SetUnicodeCharacterSet() method is called, the reader -// will output vtkUnicodeString columns in the output table. In addition, it is necessary to use -// the Set and Get methods that contain UTF8 in the name to specify delimiters when operating in -// unicode mode. +// The behavior of the reader with respect to ascii or unicode input +// is controlled by the SetUnicodeCharacterSet() method. By default +// (without calling SetUnicodeCharacterSet()), the reader will expect +// to read ascii text and will output vtkStdString columns. Use the +// Set and Get methods to set delimiters that do not contain UTF8 in +// the name when operating the reader in default ascii mode. If the +// SetUnicodeCharacterSet() method is called, the reader will output +// vtkUnicodeString columns in the output table. In addition, it is +// necessary to use the Set and Get methods that contain UTF8 in the +// name to specify delimiters when operating in unicode mode. +// +// There is also a special character set US-ASCII-WITH-FALLBACK that +// will treat the input text as ASCII no matter what. If and when it +// encounters a character with its 8th bit set it will replace that +// character with the code point ReplacementCharacter. 
You may use +// this if you have text that belongs to a code page like LATIN9 or +// ISO-8859-1 or friends: mostly ASCII but not entirely. Eventually +// this class will acquire the ability to gracefully read text from +// any code page, making this option obsolete. // // This class emits ProgressEvent for every 100 lines it reads. // @@ -175,6 +186,13 @@ public: // after calling Update(). vtkStdString GetLastError(); + // Description: + // Fallback character for use in the US-ASCII-WITH-FALLBACK + // character set. Any characters that have their 8th bit set will + // be replaced with this code point. Defaults to 'x'. + vtkSetMacro(ReplacementCharacter, vtkTypeUInt32); + vtkGetMacro(ReplacementCharacter, vtkTypeUInt32); + //BTX protected: vtkDelimitedTextReader(); @@ -204,6 +222,7 @@ protected: bool GeneratePedigreeIds; bool OutputPedigreeIds; vtkStdString LastError; + vtkTypeUInt32 ReplacementCharacter; private: vtkDelimitedTextReader(const vtkDelimitedTextReader&); // Not implemented