From b3b20cc03ff10d452a5a86c908c7e0801402c5f1 Mon Sep 17 00:00:00 2001 From: Clinton Stimpson <clinton@elemtech.com> Date: Thu, 30 Jan 2014 21:34:30 -0700 Subject: [PATCH] FStream: Add ability to detect BOM. Change-Id: I4d06782730fca0fd68fb62c418b4e2d95f550625 --- FStream.cxx | 76 ++++++++++++++++++++++++++ FStream.hxx.in | 20 ++++++- testFStream.cxx | 142 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 2 deletions(-) create mode 100644 FStream.cxx diff --git a/FStream.cxx b/FStream.cxx new file mode 100644 index 0000000..018652c --- /dev/null +++ b/FStream.cxx @@ -0,0 +1,76 @@ +/*============================================================================ + KWSys - Kitware System Library + Copyright 2000-2009 Kitware, Inc., Insight Software Consortium + + Distributed under the OSI-approved BSD License (the "License"); + see accompanying file Copyright.txt for details. + + This software is distributed WITHOUT ANY WARRANTY; without even the + implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the License for more information. +============================================================================*/ +#include "kwsysPrivate.h" +#include KWSYS_HEADER(FStream.hxx) + +// Work-around CMake dependency scanning limitation. This must +// duplicate the above list of headers. +#if 0 +# include "FStream.hxx.in" +#endif + +namespace KWSYS_NAMESPACE +{ +namespace FStream +{ + +BOM ReadBOM(std::istream& in) +{ + if(!in.good()) + { + return BOM_None; + } + unsigned long orig = in.tellg(); + unsigned char bom[4]; + in.read(reinterpret_cast<char*>(bom), 2); + if(!in.good()) + { + in.seekg(orig); + return BOM_None; + } + if(bom[0] == 0xEF && bom[1] == 0xBB) + { + in.read(reinterpret_cast<char*>(bom+2), 1); + if(in.good() && bom[2] == 0xBF) + { + return BOM_UTF8; + } + } + else if(bom[0] == 0xFE && bom[1] == 0xFF) + { + return BOM_UTF16BE; + } + else if(bom[0] == 0x00 && bom[1] == 0x00) + { + in.read(reinterpret_cast<char*>(bom+2), 2); + if(in.good() && bom[2] == 0xFE && bom[3] == 0xFF) + { + return BOM_UTF32BE; + } + } + else if(bom[0] == 0xFF && bom[1] == 0xFE) + { + unsigned long p = in.tellg(); + in.read(reinterpret_cast<char*>(bom+2), 2); + if(in.good() && bom[2] == 0x00 && bom[3] == 0x00) + { + return BOM_UTF32LE; + } + in.seekg(p); + return BOM_UTF16LE; + } + in.seekg(orig); + return BOM_None; +} + +} // FStream namespace +} //KWSYS_NAMESPACE diff --git a/FStream.hxx.in b/FStream.hxx.in index 00c84ee..45425ff 100644 --- a/FStream.hxx.in +++ b/FStream.hxx.in @@ -165,8 +165,24 @@ class basic_ofstream : public std::basic_ostream<CharType,Traits> using @KWSYS_NAMESPACE@_ios_namespace::ifstream; #endif + namespace FStream + { + enum BOM + { + BOM_None, + BOM_UTF8, + BOM_UTF16BE, + BOM_UTF16LE, + BOM_UTF32BE, + BOM_UTF32LE + }; + + // Read a BOM, if one exists. + // If a BOM exists, the stream is advanced to after the BOM. + // This function requires a seekable stream (but not a relative + // seekable stream). + BOM ReadBOM(std::istream& in); + } } - - #endif diff --git a/testFStream.cxx b/testFStream.cxx index 8942549..9abfd4c 100644 --- a/testFStream.cxx +++ b/testFStream.cxx @@ -16,11 +16,17 @@ #endif #include KWSYS_HEADER(FStream.hxx) +#include KWSYS_HEADER(ios/iostream) +#include <string.h> +#ifdef __BORLANDC__ +# include <mem.h> /* memcmp */ +#endif // Work-around CMake dependency scanning limitation. This must // duplicate the above list of headers. #if 0 # include "FStream.hxx.in" +# include "kwsys_ios_iostream.h.in" #endif @@ -36,6 +42,141 @@ static int testNoFile() return 0; } +static kwsys::FStream::BOM expected_bom[5] = +{ + kwsys::FStream::BOM_UTF8, + kwsys::FStream::BOM_UTF16LE, + kwsys::FStream::BOM_UTF16BE, + kwsys::FStream::BOM_UTF32LE, + kwsys::FStream::BOM_UTF32BE +}; + +static unsigned char expected_bom_data[5][5] = +{ + {3, 0xEF, 0xBB, 0xBF}, + {2, 0xFF, 0xFE}, + {2, 0xFE, 0xFF}, + {4, 0xFF, 0xFE, 0x00, 0x00}, + {4, 0x00, 0x00, 0xFE, 0xFF}, +}; + +static unsigned char file_data[5][45] = +{ + {11, 'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd'}, + {22, 0x48, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00, 0x20, 0x00, + 0x57, 0x00, 0x6F, 0x00, 0x72, 0x00, 0x6C, 0x00, 0x64, 0x00}, + {22, 0x00, 0x48, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00, 0x20, + 0x00, 0x57, 0x00, 0x6F, 0x00, 0x72, 0x00, 0x6C, 0x00, 0x64}, + {44, 0x48, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, + 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, + 0x6C, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00}, + {44, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, + 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x20, + 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x72, + 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x64}, +}; + +//---------------------------------------------------------------------------- +static int testBOM() +{ + // test various encodings in binary mode + for(int i=0; i<5; i++) + { + { + kwsys::ofstream out("bom.txt", kwsys::ofstream::binary); + out.write(reinterpret_cast<const char*>(expected_bom_data[i]+1), + *expected_bom_data[i]); + out.write(reinterpret_cast<const char*>(file_data[i]+1), + file_data[i][0]); + } + + kwsys::ifstream in("bom.txt", kwsys::ofstream::binary); + kwsys::FStream::BOM bom = kwsys::FStream::ReadBOM(in); + if(bom != expected_bom[i]) + { + kwsys_ios::cout << "Unexpected BOM " << i << std::endl; + return 1; + } + char data[45]; + in.read(data, file_data[i][0]); + if(!in.good()) + { + kwsys_ios::cout << "Unable to read data " << i << std::endl; + return 1; + } + + if(memcmp(data, file_data[i]+1, file_data[i][0]) != 0) + { + kwsys_ios::cout << "Incorrect read data " << i << std::endl; + return 1; + } + + } + + // test text file without bom + { + { + kwsys::ofstream out("bom.txt"); + out << "Hello World"; + } + + kwsys::ifstream in("bom.txt"); + kwsys::FStream::BOM bom = kwsys::FStream::ReadBOM(in); + if(bom != kwsys::FStream::BOM_None) + { + kwsys_ios::cout << "Unexpected BOM for none case" << std::endl; + return 1; + } + char data[45]; + in.read(data, file_data[0][0]); + if(!in.good()) + { + kwsys_ios::cout << "Unable to read data for none case" << std::endl; + return 1; + } + + if(memcmp(data, file_data[0]+1, file_data[0][0]) != 0) + { + kwsys_ios::cout << "Incorrect read data for none case" << std::endl; + return 1; + } + } + + // test text file with utf-8 bom + { + { + kwsys::ofstream out("bom.txt"); + out.write(reinterpret_cast<const char*>(expected_bom_data[0]+1), + *expected_bom_data[0]); + out << "Hello World"; + } + + kwsys::ifstream in("bom.txt"); + kwsys::FStream::BOM bom = kwsys::FStream::ReadBOM(in); + if(bom != kwsys::FStream::BOM_UTF8) + { + kwsys_ios::cout << "Unexpected BOM for utf-8 case" << std::endl; + return 1; + } + char data[45]; + in.read(data, file_data[0][0]); + if(!in.good()) + { + kwsys_ios::cout << "Unable to read data for utf-8 case" << std::endl; + return 1; + } + + if(memcmp(data, file_data[0]+1, file_data[0][0]) != 0) + { + kwsys_ios::cout << "Incorrect read data for utf-8 case" << std::endl; + return 1; + } + } + + return 0; +} + //---------------------------------------------------------------------------- int testFStream(int, char*[]) @@ -43,6 +184,7 @@ int testFStream(int, char*[]) int ret = 0; ret |= testNoFile(); + ret |= testBOM(); return ret; } -- GitLab