Commit bc407ba6 authored by Brad King, committed by Kitware Robot
Browse files

Merge topic 'codecvt-revise'

690acadc

 codecvt: Re-implement do_out and do_unshift

Acked-by: Kitware Robot <kwrobot@kitware.com>
Merge-request: !891
parents 6f74bbaf 690acadc
/* Distributed under the OSI-approved BSD 3-Clause License. See accompanying
file Copyright.txt or https://cmake.org/licensing for details. */
#include "cm_codecvt.hxx"
#include <limits>
#if defined(_WIN32)
#include <assert.h>
#include <string.h>
#include <windows.h>
#undef max
#include "cmsys/Encoding.hxx"
#endif
#if defined(_WIN32)
/* Number of leading ones before a zero in the byte (see cm_utf8.c). */
extern "C" unsigned char const cm_utf8_ones[256];
#endif
codecvt::codecvt(Encoding e)
: m_lastState(0)
#if defined(_WIN32)
, m_codepage(0)
: m_codepage(0)
#endif
{
switch (e) {
......@@ -45,76 +50,68 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
const char*& from_next, char* to,
char* to_end, char*& to_next) const
{
from_next = from;
to_next = to;
if (m_noconv) {
return noconv;
return std::codecvt_base::noconv;
}
std::codecvt_base::result res = error;
#if defined(_WIN32)
from_next = from;
to_next = to;
bool convert = true;
size_t count = from_end - from;
const char* data = from;
unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
if (count == 0) {
return codecvt::ok;
} else if (count == 1) {
if (stateId == 0) {
// decode first byte for UTF-8
if ((*from & 0xF8) == 0xF0 || // 1111 0xxx; 4 bytes for codepoint
(*from & 0xF0) == 0xE0 || // 1110 xxxx; 3 bytes for codepoint
(*from & 0xE0) == 0xC0) // 110x xxxx; 2 bytes for codepoint
{
stateId = findStateId();
codecvt::State& s = m_states.at(stateId - 1);
s.bytes[0] = *from;
convert = false;
if ((*from & 0xF8) == 0xF0) {
s.totalBytes = 4;
} else if ((*from & 0xF0) == 0xE0) {
s.totalBytes = 3;
} else if ((*from & 0xE0) == 0xC0) {
s.totalBytes = 2;
}
s.bytesLeft = s.totalBytes - 1;
};
// else 1 byte for codepoint
} else {
codecvt::State& s = m_states.at(stateId - 1);
s.bytes[s.totalBytes - s.bytesLeft] = *from;
s.bytesLeft--;
data = s.bytes;
count = s.totalBytes - s.bytesLeft;
if ((*from & 0xC0) == 0x80) { // 10xx xxxx
convert = s.bytesLeft == 0;
} else {
// invalid multi-byte
convert = true;
}
if (convert) {
s.used = false;
if (stateId == m_lastState) {
m_lastState--;
}
stateId = 0;
}
// Use a const view of the state because we should not modify it until we
// have fully processed and consumed a byte (with sufficient space in the
// output buffer). We call helpers to re-cast and modify the state.
State const& lstate = reinterpret_cast<State&>(state);
while (from_next != from_end) {
// Count leading ones in the bits of the next byte.
unsigned char const ones =
cm_utf8_ones[static_cast<unsigned char>(*from_next)];
if (ones != 1 && lstate.buffered != 0) {
// We have a buffered partial codepoint that we never completed.
return std::codecvt_base::error;
} else if (ones == 1 && lstate.buffered == 0) {
// This is a continuation of a codepoint that never started.
return std::codecvt_base::error;
}
// Compute the number of bytes in the current codepoint.
int need = 0;
switch (ones) {
case 0: // 0xxx xxxx: new codepoint of size 1
need = 1;
break;
case 1: // 10xx xxxx: continues a codepoint
assert(lstate.size != 0);
need = lstate.size;
break;
case 2: // 110x xxxx: new codepoint of size 2
need = 2;
break;
case 3: // 1110 xxxx: new codepoint of size 3
need = 3;
break;
case 4: // 1111 0xxx: new codepoint of size 4
need = 4;
break;
default: // invalid byte
return std::codecvt_base::error;
}
if (convert) {
std::wstring wide = cmsys::Encoding::ToWide(std::string(data, count));
int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
static_cast<int>(wide.size()), to,
to_end - to, NULL, NULL);
if (r > 0) {
from_next = from_end;
to_next = to + r;
res = ok;
assert(need > 0);
if (lstate.buffered + 1 == need) {
// This byte completes a codepoint.
std::codecvt_base::result decode_result =
this->Decode(state, need, from_next, to_next, to_end);
if (decode_result != std::codecvt_base::ok) {
return decode_result;
}
} else {
res = partial;
from_next = from_end;
to_next = to;
// This byte does not complete a codepoint.
this->BufferPartial(state, need, from_next);
}
}
return std::codecvt_base::ok;
#else
static_cast<void>(state);
static_cast<void>(from);
......@@ -123,46 +120,118 @@ std::codecvt_base::result codecvt::do_out(mbstate_t& state, const char* from,
static_cast<void>(to);
static_cast<void>(to_end);
static_cast<void>(to_next);
res = codecvt::noconv;
return std::codecvt_base::noconv;
#endif
return res;
};
std::codecvt_base::result codecvt::do_unshift(mbstate_t& state, char* to,
char* to_end,
char*& to_next) const
{
std::codecvt_base::result res = error;
to_next = to;
if (m_noconv) {
return std::codecvt_base::noconv;
}
#if defined(_WIN32)
unsigned int& stateId = reinterpret_cast<unsigned int&>(state);
if (stateId > 0) {
codecvt::State& s = m_states.at(stateId - 1);
s.used = false;
if (stateId == m_lastState) {
m_lastState--;
}
stateId = 0;
std::wstring wide = cmsys::Encoding::ToWide(
std::string(s.bytes, s.totalBytes - s.bytesLeft));
int r = WideCharToMultiByte(m_codepage, 0, wide.c_str(),
static_cast<int>(wide.size()), to, to_end - to,
NULL, NULL);
if (r > 0) {
to_next = to + r;
res = ok;
}
} else {
res = ok;
State& lstate = reinterpret_cast<State&>(state);
if (lstate.buffered != 0) {
return this->DecodePartial(state, to_next, to_end);
}
return std::codecvt_base::ok;
#else
static_cast<void>(state);
static_cast<void>(to_end);
res = ok;
return std::codecvt_base::ok;
#endif
return res;
};
#if defined(_WIN32)
// Convert one complete UTF-8 codepoint to the target codepage.
//
// The codepoint is made of the bytes already buffered in `state` followed by
// the byte at `*from_next`; `size` is the total number of bytes in it.
//
// Returns:
//   ok      - the codepoint was written at `to_next`; `from_next` is advanced
//             past the consumed byte, `to_next` past the output, and the
//             state is reset for the next codepoint.
//   partial - the output range [to_next, to_end) is too small; `from_next`,
//             `to_next` and the state are left unchanged.
//   error   - one of the two conversions failed for any other reason.
std::codecvt_base::result codecvt::Decode(mbstate_t& state, int size,
                                          const char*& from_next,
                                          char*& to_next, char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Collect all the bytes for this codepoint.
  char buf[4];
  memcpy(buf, lstate.partial, lstate.buffered);
  buf[lstate.buffered] = *from_next;

  // Convert the encoding: UTF-8 -> UTF-16 first.
  wchar_t wbuf[2];
  int wlen =
    MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, buf, size, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte interprets an output size of 0 as a query for the
  // required buffer size: it returns a positive count without writing
  // anything.  Treat an exhausted output range as "partial" ourselves so we
  // never advance to_next beyond to_end.
  if (to_next >= to_end) {
    return std::codecvt_base::partial;
  }

  // UTF-16 -> target codepage, directly into the caller's output range.
  int tlen =
    WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
                        static_cast<int>(to_end - to_next), NULL, NULL);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the now-consumed byte in the input buffer.
  ++from_next;

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}
// Try to convert the bytes of an incomplete codepoint buffered in `state`
// to the target codepage.
//
// Returns:
//   ok      - the buffered bytes were converted and written at `to_next`;
//             `to_next` is advanced past the output and the state is reset.
//   partial - the output range [to_next, to_end) is too small; `to_next`
//             and the state are left unchanged.
//   error   - one of the two conversions failed for any other reason.
std::codecvt_base::result codecvt::DecodePartial(mbstate_t& state,
                                                 char*& to_next,
                                                 char* to_end) const
{
  State& lstate = reinterpret_cast<State&>(state);

  // Try converting the partial codepoint: UTF-8 -> UTF-16 first.
  wchar_t wbuf[2];
  int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, lstate.partial,
                                 lstate.buffered, wbuf, 2);
  if (wlen <= 0) {
    return std::codecvt_base::error;
  }

  // WideCharToMultiByte interprets an output size of 0 as a query for the
  // required buffer size: it returns a positive count without writing
  // anything.  Treat an exhausted output range as "partial" ourselves so we
  // never advance to_next beyond to_end.
  if (to_next >= to_end) {
    return std::codecvt_base::partial;
  }

  // UTF-16 -> target codepage, directly into the caller's output range.
  int tlen =
    WideCharToMultiByte(m_codepage, 0, wbuf, wlen, to_next,
                        static_cast<int>(to_end - to_next), NULL, NULL);
  if (tlen <= 0) {
    if (GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
      return std::codecvt_base::partial;
    }
    return std::codecvt_base::error;
  }

  // Move past the converted codepoint in the output buffer.
  to_next += tlen;

  // Re-initialize the state for the next codepoint to start.
  lstate = State();

  return std::codecvt_base::ok;
}
void codecvt::BufferPartial(mbstate_t& state, int size,
const char*& from_next) const
{
State& lstate = reinterpret_cast<State&>(state);
// Save the byte in our buffer for later.
lstate.partial[lstate.buffered++] = *from_next;
lstate.size = size;
// Move past the now-consumed byte in the input buffer.
++from_next;
}
#endif
int codecvt::do_max_length() const throw()
{
return 4;
......@@ -172,44 +241,3 @@ int codecvt::do_encoding() const throw()
{
return 0;
};
unsigned int codecvt::findStateId() const
{
unsigned int stateId = 0;
bool add = false;
const unsigned int maxSize = std::numeric_limits<unsigned int>::max();
if (m_lastState >= maxSize) {
m_lastState = 0;
}
if (m_states.size() <= m_lastState) {
add = true;
} else {
unsigned int i = m_lastState;
while (i < maxSize) {
codecvt::State& s = m_states.at(i);
i++;
if (!s.used) {
m_lastState = i;
stateId = m_lastState;
s.used = true;
s.totalBytes = 0;
s.bytesLeft = 0;
break;
}
if (i >= m_states.size()) {
i = 0;
}
if (i == m_lastState) {
add = true;
break;
}
}
};
if (add) {
codecvt::State s = { true, 0, 0, { 0, 0, 0, 0 } };
m_states.push_back(s);
m_lastState = (unsigned int)m_states.size();
stateId = m_lastState;
}
return stateId;
};
......@@ -6,7 +6,6 @@
#include "cmConfigure.h"
#include <locale>
#include <vector>
#include <wchar.h>
class codecvt : public std::codecvt<char, char, mbstate_t>
......@@ -35,21 +34,30 @@ protected:
int do_encoding() const throw() CM_OVERRIDE;
private:
typedef struct
// The mbstate_t argument to do_out and do_unshift is responsible
// for storing state between calls. We cannot control the type
// since we want to imbue on standard streams. However, we do
// know that it is a trivial type. Define our own type to overlay
// on it safely with no alignment requirements.
struct State
{
bool used;
unsigned char totalBytes;
unsigned char bytesLeft;
char bytes[4];
} State;
// Buffer bytes we have consumed from a partial codepoint.
char partial[3];
unsigned int findStateId() const;
// Number of bytes we have buffered from a partial codepoint.
unsigned char buffered : 4;
// Size of the current codepoint in bytes.
unsigned char size : 4;
};
bool m_noconv;
mutable std::vector<State> m_states;
mutable unsigned int m_lastState;
#if defined(_WIN32)
unsigned int m_codepage;
result Decode(mbstate_t& state, int need, const char*& from_next,
char*& to_next, char* to_end) const;
result DecodePartial(mbstate_t& state, char*& to_next, char* to_end) const;
void BufferPartial(mbstate_t& state, int need, const char*& from_next) const;
#endif
#endif
......
......@@ -15,7 +15,7 @@
*/
/* Number of leading ones before a zero in the byte. */
static unsigned char const cm_utf8_ones[256] = {
unsigned char const cm_utf8_ones[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment