From b3b20cc03ff10d452a5a86c908c7e0801402c5f1 Mon Sep 17 00:00:00 2001
From: Clinton Stimpson <clinton@elemtech.com>
Date: Thu, 30 Jan 2014 21:34:30 -0700
Subject: [PATCH] FStream: Add ability to detect BOM.

Change-Id: I4d06782730fca0fd68fb62c418b4e2d95f550625
---
 FStream.cxx     |  76 ++++++++++++++++++++++++++
 FStream.hxx.in  |  20 ++++++-
 testFStream.cxx | 142 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 236 insertions(+), 2 deletions(-)
 create mode 100644 FStream.cxx

diff --git a/FStream.cxx b/FStream.cxx
new file mode 100644
index 0000000..018652c
--- /dev/null
+++ b/FStream.cxx
@@ -0,0 +1,76 @@
+/*============================================================================
+  KWSys - Kitware System Library
+  Copyright 2000-2009 Kitware, Inc., Insight Software Consortium
+
+  Distributed under the OSI-approved BSD License (the "License");
+  see accompanying file Copyright.txt for details.
+
+  This software is distributed WITHOUT ANY WARRANTY; without even the
+  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+  See the License for more information.
+============================================================================*/
+#include "kwsysPrivate.h"
+#include KWSYS_HEADER(FStream.hxx)
+
+// Work-around CMake dependency scanning limitation.  This must
+// duplicate the above list of headers.
+#if 0
+# include "FStream.hxx.in"
+#endif
+
+namespace KWSYS_NAMESPACE
+{
+namespace FStream
+{
+
+// Consume a BOM at the current position of 'in' and report which one.  If
+// none matches, clear any short-read error, rewind, and return BOM_None.
+BOM ReadBOM(std::istream& in)
+{
+  if(!in.good())
+    {
+    return BOM_None; // leave an already-failed stream untouched
+    }
+  const std::istream::pos_type orig = in.tellg();
+  unsigned char bom[4];
+  char* const tail = reinterpret_cast<char*>(bom+2); // bytes 3 and 4
+  in.read(reinterpret_cast<char*>(bom), 2);
+  if(!in.good())
+    {
+    // Fewer than two bytes were available; the stream is restored below.
+    }
+  else if(bom[0] == 0xEF && bom[1] == 0xBB)
+    {
+    if(in.read(tail, 1).good() && bom[2] == 0xBF)
+      {
+      return BOM_UTF8;
+      }
+    }
+  else if(bom[0] == 0xFE && bom[1] == 0xFF)
+    {
+    return BOM_UTF16BE;
+    }
+  else if(bom[0] == 0x00 && bom[1] == 0x00)
+    {
+    if(in.read(tail, 2).good() && bom[2] == 0xFE && bom[3] == 0xFF)
+      {
+      return BOM_UTF32BE;
+      }
+    }
+  else if(bom[0] == 0xFF && bom[1] == 0xFE)
+    {
+    const std::istream::pos_type p = in.tellg(); // just past FF FE
+    if(in.read(tail, 2).good() && bom[2] == 0x00 && bom[3] == 0x00)
+      {
+      return BOM_UTF32LE;
+      }
+    in.clear(); // a short read sets failbit, which makes seekg() a no-op
+    in.seekg(p);
+    return BOM_UTF16LE;
+    }
+  in.clear(); // drop eof/fail left by a short read so seekg() can rewind
+  in.seekg(orig);
+  return BOM_None;
+}
+} // FStream namespace
+} //KWSYS_NAMESPACE
diff --git a/FStream.hxx.in b/FStream.hxx.in
index 00c84ee..45425ff 100644
--- a/FStream.hxx.in
+++ b/FStream.hxx.in
@@ -165,8 +165,24 @@ class basic_ofstream : public std::basic_ostream<CharType,Traits>
   using @KWSYS_NAMESPACE@_ios_namespace::ifstream;
 #endif
 
+  namespace FStream
+  {
+    enum BOM // kind of byte-order mark reported by ReadBOM
+    {
+      BOM_None,    // no BOM recognized
+      BOM_UTF8,    // EF BB BF
+      BOM_UTF16BE, // FE FF
+      BOM_UTF16LE, // FF FE
+      BOM_UTF32BE, // 00 00 FE FF
+      BOM_UTF32LE  // FF FE 00 00
+    };
+
+    // Read a byte-order mark from the current position of 'in', if present.
+    // When one is found the stream is left just past it and its kind is
+    // returned; otherwise BOM_None is returned.  The stream must support
+    // tellg() and absolute seekg(); relative seeks are not used.
+    BOM ReadBOM(std::istream& in);
+  }
 }
 
-
-
 #endif
diff --git a/testFStream.cxx b/testFStream.cxx
index 8942549..9abfd4c 100644
--- a/testFStream.cxx
+++ b/testFStream.cxx
@@ -16,11 +16,17 @@
 #endif
 
 #include KWSYS_HEADER(FStream.hxx)
+#include KWSYS_HEADER(ios/iostream)
+#include <string.h>
+#ifdef __BORLANDC__
+# include <mem.h> /* memcmp */
+#endif
 
 // Work-around CMake dependency scanning limitation.  This must
 // duplicate the above list of headers.
 #if 0
 # include "FStream.hxx.in"
+# include "kwsys_ios_iostream.h.in"
 #endif
 
 
@@ -36,6 +42,141 @@ static int testNoFile()
   return 0;
 }
 
+static const kwsys::FStream::BOM expected_bom[5] = // BOM expected per case
+{
+  kwsys::FStream::BOM_UTF8,
+  kwsys::FStream::BOM_UTF16LE,
+  kwsys::FStream::BOM_UTF16BE,
+  kwsys::FStream::BOM_UTF32LE,
+  kwsys::FStream::BOM_UTF32BE
+};
+// BOM bytes written for each case; element 0 is the number of bytes.
+static const unsigned char expected_bom_data[5][5] =
+{
+    {3, 0xEF, 0xBB, 0xBF},
+    {2, 0xFF, 0xFE},
+    {2, 0xFE, 0xFF},
+    {4, 0xFF, 0xFE, 0x00, 0x00},
+    {4, 0x00, 0x00, 0xFE, 0xFF},
+};
+// "Hello World" encoded to match each case; element 0 is the byte count.
+static const unsigned char file_data[5][45] =
+{
+    {11, 'H', 'e', 'l', 'l', 'o', ' ', 'W', 'o', 'r', 'l', 'd'},
+    {22, 0x48, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00, 0x20, 0x00,
+    0x57, 0x00, 0x6F, 0x00, 0x72, 0x00, 0x6C, 0x00, 0x64, 0x00},
+    {22, 0x00, 0x48, 0x00, 0x65, 0x00, 0x6C, 0x00, 0x6C, 0x00, 0x6F, 0x00, 0x20,
+    0x00, 0x57, 0x00, 0x6F, 0x00, 0x72, 0x00, 0x6C, 0x00, 0x64},
+    {44, 0x48, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00,
+    0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,
+    0x57, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00,
+    0x6C, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00},
+    {44, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x65, 0x00, 0x00, 0x00, 0x6C,
+    0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x20,
+    0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x6F, 0x00, 0x00, 0x00, 0x72,
+    0x00, 0x00, 0x00, 0x6C, 0x00, 0x00, 0x00, 0x64},
+};
+
+//----------------------------------------------------------------------------
+// Write bom.txt with and without BOMs and check that ReadBOM reports the
+// right kind and leaves the stream at the first byte of the payload.
+static int testBOM()
+{
+  // test various encodings in binary mode
+  for(int i=0; i<5; i++)
+    {
+      {
+      kwsys::ofstream out("bom.txt", kwsys::ofstream::binary);
+      out.write(reinterpret_cast<const char*>(expected_bom_data[i]+1),
+                *expected_bom_data[i]);
+      out.write(reinterpret_cast<const char*>(file_data[i]+1),
+                file_data[i][0]);
+      } // 'out' goes out of scope here, before the file is read back
+
+    kwsys::ifstream in("bom.txt", kwsys::ifstream::binary);
+    kwsys::FStream::BOM bom = kwsys::FStream::ReadBOM(in);
+    if(bom != expected_bom[i])
+      {
+      kwsys_ios::cout << "Unexpected BOM " << i << std::endl;
+      return 1;
+      }
+    // the payload must be readable immediately after the BOM
+    char data[45];
+    in.read(data, file_data[i][0]);
+    if(!in.good())
+      {
+      kwsys_ios::cout << "Unable to read data " << i << std::endl;
+      return 1;
+      }
+    if(memcmp(data, file_data[i]+1, file_data[i][0]) != 0)
+      {
+      kwsys_ios::cout << "Incorrect read data " << i << std::endl;
+      return 1;
+      }
+    }
+
+  // test text file without bom
+  {
+    {
+    kwsys::ofstream out("bom.txt");
+    out << "Hello World";
+    }
+
+    kwsys::ifstream in("bom.txt");
+    kwsys::FStream::BOM bom = kwsys::FStream::ReadBOM(in);
+    if(bom != kwsys::FStream::BOM_None)
+      {
+      kwsys_ios::cout << "Unexpected BOM for none case" << std::endl;
+      return 1;
+      }
+    // with no BOM the text must still be read from its first byte
+    char data[45];
+    in.read(data, file_data[0][0]);
+    if(!in.good())
+      {
+      kwsys_ios::cout << "Unable to read data for none case" << std::endl;
+      return 1;
+      }
+    if(memcmp(data, file_data[0]+1, file_data[0][0]) != 0)
+      {
+      kwsys_ios::cout << "Incorrect read data for none case" << std::endl;
+      return 1;
+      }
+  }
+
+  // test text file with utf-8 bom
+  {
+    {
+    kwsys::ofstream out("bom.txt");
+    out.write(reinterpret_cast<const char*>(expected_bom_data[0]+1),
+              *expected_bom_data[0]);
+    out << "Hello World";
+    }
+
+    kwsys::ifstream in("bom.txt");
+    kwsys::FStream::BOM bom = kwsys::FStream::ReadBOM(in);
+    if(bom != kwsys::FStream::BOM_UTF8)
+      {
+      kwsys_ios::cout << "Unexpected BOM for utf-8 case" << std::endl;
+      return 1;
+      }
+    char data[45];
+    in.read(data, file_data[0][0]);
+    if(!in.good())
+      {
+      kwsys_ios::cout << "Unable to read data for utf-8 case" << std::endl;
+      return 1;
+      }
+    if(memcmp(data, file_data[0]+1, file_data[0][0]) != 0)
+      {
+      kwsys_ios::cout << "Incorrect read data for utf-8 case" << std::endl;
+      return 1;
+      }
+  }
+
+  return 0;
+}
+
 
 //----------------------------------------------------------------------------
 int testFStream(int, char*[])
@@ -43,6 +184,7 @@ int testFStream(int, char*[])
   int ret = 0;
 
   ret |= testNoFile();
+  ret |= testBOM();
 
   return ret;
 }
-- 
GitLab