Skip to content
Snippets Groups Projects
Commit 8e0f7838 authored by Andy Wilson
Browse files

Add US-ASCII-WITH-FALLBACK for files that are almost entirely ASCII

The US-ASCII reader for vtkDelimitedTextReader will die if it
encounters even one character with its 8th bit set.  This leaves us
with no way to read documents from non-UTF code pages that have
internationalized characters in the text.

I added a new "character set" US-ASCII-WITH-FALLBACK that lets us
read those anyway.  The reader has a new ivar ReplacementCharacter
that will be inserted in place of any characters beyond 0x7f (the
same ones that cause US-ASCII to choke).
parent 4be10e17
No related branches found
No related tags found
No related merge requests found
......@@ -356,13 +356,22 @@ private:
// ascii_to_unicode
template<typename OctetIteratorT, typename OutputIteratorT>
void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output)
void ascii_to_unicode(OctetIteratorT begin, OctetIteratorT end, OutputIteratorT output, bool replace_8bit_characters, vtkTypeUInt32 fallback_character = 'x')
{
while(begin != end)
{
const vtkTypeUInt32 code_point = *begin++;
vtkTypeUInt32 code_point = *begin++;
if(code_point > 0x7f)
throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII.");
{
if (replace_8bit_characters)
{
code_point = fallback_character;
}
else
{
throw vtkstd::runtime_error("Detected a character that isn't valid US-ASCII.");
}
}
*output++ = code_point;
}
......@@ -428,7 +437,8 @@ vtkDelimitedTextReader::vtkDelimitedTextReader() :
UnicodeStringDelimiters(vtkUnicodeString::from_utf8("\"")),
UnicodeWhitespace(vtkUnicodeString::from_utf8(" \t\r\n\v\f")),
UnicodeEscapeCharacter(vtkUnicodeString::from_utf8("\\")),
HaveHeaders(false)
HaveHeaders(false),
ReplacementCharacter('x')
{
this->SetNumberOfInputPorts(0);
this->SetNumberOfOutputPorts(1);
......@@ -633,7 +643,11 @@ int vtkDelimitedTextReader::RequestData(
if("US-ASCII" == character_set || character_set.empty())
{
ascii_to_unicode(content.begin(), content.end(), iterator);
ascii_to_unicode(content.begin(), content.end(), iterator, false, 'x');
}
else if("US-ASCII-WITH-FALLBACK" == character_set)
{
ascii_to_unicode(content.begin(), content.end(), iterator, true, this->ReplacementCharacter);
}
else if("UTF-8" == character_set)
{
......
......@@ -32,14 +32,25 @@
// vtkDelimitedTextReader is an interface for pulling in data from a
// flat, delimited ascii or unicode text file (delimiter can be any character).
//
// The behavior of the reader with respect to ascii or unicode input is controlled
// by the SetUnicodeCharacterSet() method. By default (without calling SetUnicodeCharacterSet()),
// the reader will expect to read ascii text and will output vtkStdString columns. Use
// the Set and Get methods to set delimiters that do not contain UTF8 in the name when operating
// the reader in default ascii mode. If the SetUnicodeCharacterSet() method is called, the reader
// will output vtkUnicodeString columns in the output table. In addition, it is necessary to use
// the Set and Get methods that contain UTF8 in the name to specify delimiters when operating in
// unicode mode.
// The behavior of the reader with respect to ascii or unicode input
// is controlled by the SetUnicodeCharacterSet() method. By default
// (without calling SetUnicodeCharacterSet()), the reader will expect
// to read ascii text and will output vtkStdString columns. Use the
// Set and Get methods to set delimiters that do not contain UTF8 in
// the name when operating the reader in default ascii mode. If the
// SetUnicodeCharacterSet() method is called, the reader will output
// vtkUnicodeString columns in the output table. In addition, it is
// necessary to use the Set and Get methods that contain UTF8 in the
// name to specify delimiters when operating in unicode mode.
//
// There is also a special character set US-ASCII-WITH-FALLBACK that
// will treat the input text as ASCII no matter what. If and when it
// encounters a character with its 8th bit set it will replace that
// character with the code point ReplacementCharacter. You may use
// this if you have text that belongs to a code page like LATIN9 or
// ISO-8859-1 or friends: mostly ASCII but not entirely. Eventually
// this class will acquire the ability to gracefully read text from
// any code page, making this option obsolete.
//
// This class emits ProgressEvent for every 100 lines it reads.
//
......@@ -175,6 +186,13 @@ public:
// after calling Update().
vtkStdString GetLastError();
// Description:
// Fallback character for use in the US-ASCII-WITH-FALLBACK
// character set. Any characters that have their 8th bit set will
// be replaced with this code point. Defaults to 'x'.
vtkSetMacro(ReplacementCharacter, vtkTypeUInt32);
vtkGetMacro(ReplacementCharacter, vtkTypeUInt32);
//BTX
protected:
vtkDelimitedTextReader();
......@@ -204,6 +222,7 @@ protected:
bool GeneratePedigreeIds;
bool OutputPedigreeIds;
vtkStdString LastError;
vtkTypeUInt32 ReplacementCharacter;
private:
vtkDelimitedTextReader(const vtkDelimitedTextReader&); // Not implemented
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment