Rewrote UTFDecoder as a function.

This commit is contained in:
Ferdinand Majerech 2014-07-23 01:36:09 +02:00
parent b060699950
commit 0a625ec7a7

View file

@ -75,8 +75,9 @@ final class Reader
uint line_;
// Current column in file.
uint column_;
// Decoder reading data from file and decoding it to UTF-32.
UTFFastDecoder decoder_;
// Original Unicode encoding of the data.
Encoding encoding_;
@ -95,26 +96,26 @@ final class Reader
this(Stream stream) @trusted //!nothrow
auto streamBytes = streamToBytesGC(stream);
auto result = fixUTFByteOrder(streamBytes);
if(result.bytesStripped > 0)
auto endianResult = fixUTFByteOrder(streamBytes);
if(endianResult.bytesStripped > 0)
throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned "
"to 2 or 4 bytes, respectively");
version(unittest) { endian_ = result.endian; }
decoder_ = UTFFastDecoder(result.array, result.encoding);
const msg = decoder_.getAndClearErrorMessage();
version(unittest) { endian_ = endianResult.endian; }
encoding_ = endianResult.encoding;
auto decodeResult = decodeUTF(endianResult.array, endianResult.encoding);
const msg = decodeResult.errorMessage;
if(msg !is null)
throw new ReaderException("UTF decoding error: " ~ msg);
buffer_ = decoder_.decoded;
// The part of buffer excluding trailing zeroes.
buffer_ = decodeResult.decoded;
// The part of buffer_ excluding trailing zeroes.
auto noZeros = buffer_;
while(!noZeros.empty && noZeros.back == '\0') { noZeros.popBack(); }
@ -238,16 +239,146 @@ final class Reader
final size_t charIndex() @safe pure nothrow const @nogc { return charIndex_; }
/// Get encoding of the input buffer.
final Encoding encoding() @safe pure nothrow const @nogc { return decoder_.encoding; }
final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }
alias UTFDecoder UTFFastDecoder;
struct UTFDecoder
// Decode a UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
// Params:
// input = The UTF-8/16/32 buffer to decode.
// encoding = Encoding of input.
// Returns:
// A struct with the following members:
// $(D string errorMessage) In case of a decoding error, the error message is stored
// here. If there was no error, errorMessage is null. Always
// check this first before using the other members.
// $(D dchar[] decoded) A GC-allocated buffer with decoded UTF-32 characters.
// $(D size_t maxChars) XXX reserved for future
auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
// Documented in function ddoc.
// Callers test errorMessage against null before using the other members.
// NOTE(review): maxChars is assigned below but is not among the Result members
// visible in this view -- confirm the struct declares it.
struct Result
string errorMessage;
dchar[] decoded;
Result result;
// Upper bound on the number of characters input can hold, derived from its
// length in bytes and the width of one code unit.
// NOTE(review): UTF-32 code units are 4 bytes wide (see the length % 4 assert
// further down), so input.length / 2 is twice the real bound for UTF_32 --
// confirm whether / 4 was intended.
final switch(encoding)
case UTFEncoding.UTF_8: result.maxChars = input.length; break;
case UTFEncoding.UTF_16: result.maxChars = input.length / 2; break;
case UTFEncoding.UTF_32: result.maxChars = input.length / 2; break;
// Decode input if it's encoded as UTF-8 or UTF-16.
//
// Params:
// input = The code units (char or wchar) to decode.
// result = A Result struct to put decoded result and any error messages to.
//
// On error, result.errorMessage will be set.
static void decode(C)(C[] input, ref Result result) @safe pure nothrow
// End of part of input that contains complete characters that can be decoded.
const size_t end = endOfLastUTFSequence(input);
// If end is before the end of input, the input stops in the middle of a UTF
// sequence. This can happen at the end of file.
if(end < input.length)
result.errorMessage = "Invalid UTF character at the end of input";
const srclength = input.length;
// Decoding errors are caught below and turned into an error message, which
// lets this function stay nothrow.
try for(size_t srcpos = 0; srcpos < srclength;)
const c = input[srcpos];
// Code units below 0x80 are ASCII and are appended as they are.
if(c < 0x80)
result.decoded ~= c;
// std.utf.decode takes srcpos by reference and advances it past the sequence
// it decodes.
result.decoded ~= std.utf.decode(input, srcpos);
catch(UTFException e)
result.errorMessage = e.msg;
// Nothing else is expected to be thrown here; treat it as a program bug.
catch(Exception e)
assert(false, "Unexpected exception in decode(): " ~ e.msg);
// Reinterpret the raw bytes as code units of the width matching encoding.
final switch(encoding)
case UTFEncoding.UTF_8: decode(cast(char[])input, result); break;
case UTFEncoding.UTF_16:
assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
decode(cast(wchar[])input, result);
// NOTE(review): no break is visible between this case and UTF_32 in this view --
// confirm the UTF-16 path does not continue into the UTF-32 branch.
case UTFEncoding.UTF_32:
assert(input.length % 4 == 0,
"UTF-32 buffer size must be a multiple of 4");
// No need to decode anything; decoded is the input bytes reinterpreted as
// dchars, not a copy.
result.decoded = cast(dchar[])input;
// On error, return before any post-processing of decoded.
if(result.errorMessage !is null) { return result; }
// XXX This is risky. We rely on the assumption that the scanner only uses
// peek() to detect the end of the buffer. Should this cause any bugs,
// revert.
// The buffer must be zero terminated for scanner to detect its end.
// if(result.decoded.empty || result.decoded.back() != '\0')
// {
// result.decoded ~= cast(dchar)'\0';
// }
return result;
/// Determine if all characters in an array are printable.
///
/// A character is accepted if it is a tab (0x09), line feed (0x0A), carriage
/// return (0x0D) or next line (0x85), or if it lies in one of the ranges
/// 0x20-0x7E, 0xA0-0xD7FF or 0xE000-0xFFFD. Everything else, including every
/// code point above 0xFFFD, is rejected.
///
/// Params: chars = Characters to check.
///
/// Returns: True if all the characters are printable, false otherwise.
///          An empty array yields true.
bool printable(const dchar[] chars) @safe pure nothrow @nogc
foreach(c; chars)
// Stop at the first character outside the accepted set.
if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
(c >= 0x20 && c <= 0x7E) ||
(c >= 0xA0 && c <= '\uD7FF') ||
(c >= '\uE000' && c <= '\uFFFD')))
return false;
return true;
// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
size_t endOfLastUTFSequence(C)(const C[] buffer)
@safe pure nothrow @nogc
// UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
static immutable ubyte[256] utf8Stride =
@ -269,128 +400,6 @@ struct UTFDecoder
// Encoding of the input buffer.
UTFEncoding encoding_;
// Maximum number of characters that might be in the buffer.
size_t maxChars_;
// Part of the input buffer that has not yet been decoded.
ubyte[] input_;
// Decoded (UTF-32) version of the entire input_. If input is UTF-32, this is
// just a reference to input_.
dchar[] decoded_;
// Current error message.
// To be fully nothrow, we use return values and the user (Reader) can check
// for a detailed error message if they get an error return.
string errorMessage_;
/// Construct a UTFDecoder decoding data from a buffer.
///
/// Params:
/// buffer   = Raw bytes to decode. Stored as a slice, not copied.
/// encoding = Encoding of buffer.
this(ubyte[] buffer, UTFEncoding encoding) @safe pure nothrow @nogc
input_ = buffer;
encoding_ = encoding;
// Upper bound on the number of characters the buffer can hold, derived from
// its length in bytes and the width of one code unit.
// NOTE(review): UTF-32 code units are 4 bytes wide, so input_.length / 2 is
// twice the real bound for UTF_32 -- confirm whether / 4 was intended.
final switch(encoding_)
case UTFEncoding.UTF_8: maxChars_ = input_.length; break;
case UTFEncoding.UTF_16: maxChars_ = input_.length / 2; break;
case UTFEncoding.UTF_32: maxChars_ = input_.length / 2; break;
/// Decode all data passed to the constructor.
///
/// Must be called at most once; this is checked with an assert.
/// On error, getAndClearErrorMessage() will return a non-null string.
void decodeAll() @safe pure nothrow
// Guard against repeated calls.
assert(decoded_ is null, "Calling decodeAll more than once");
// Reinterpret the raw bytes as code units of the width matching encoding_.
final switch(encoding_)
case UTFEncoding.UTF_8: decode(cast(char[])input_); break;
case UTFEncoding.UTF_16:
assert(input_.length % 2 == 0, "UTF-16 buffer size must be even");
// NOTE(review): neither a decode call nor a break is visible for the UTF-16
// case in this view -- confirm against the full source.
case UTFEncoding.UTF_32:
assert(input_.length % 4 == 0,
"UTF-32 buffer size must be a multiple of 4");
// No need to decode anything; decoded_ is the input bytes reinterpreted as
// dchars, not a copy.
decoded_ = cast(dchar[])input_;
// XXX This is risky. We rely on the assumption that the scanner only uses
// peek() to detect the end of the buffer. Should this cause any bugs,
// revert.
// The buffer must be zero terminated for scanner to detect its end.
// if(decoded_.empty || decoded_.back() != '\0')
// {
// decoded_ ~= cast(dchar)'\0';
// }
/// Get encoding we're decoding from.
///
/// Returns: The value stored in encoding_ by the constructor.
UTFEncoding encoding() const pure @safe nothrow @nogc { return encoding_; }
/// Get all decoded characters.
///
/// Returns: A const view of decoded_; the characters are not copied.
const(dchar[]) decoded() @safe pure nothrow @nogc { return decoded_; }
/// Get the error message and clear it.
///
/// Returns: The message currently stored in errorMessage_. The stored message
///          is reset to null, so calling this again right away returns null.
string getAndClearErrorMessage() @safe pure nothrow @nogc
// Save the message before resetting it.
const result = errorMessage_;
errorMessage_ = null;
return result;
// Decode input_ if it's encoded as UTF-8 or UTF-16.
//
// Params:
// buffer = The code units to decode; decodeAll() passes input_ cast to the
//          matching character type.
//
// Decoded characters are appended to decoded_.
// On error, errorMessage_ will be set.
void decode(C)(C[] buffer) @safe pure nothrow
// End of part of buffer that contains complete characters that can be decoded.
const size_t end = endOfLastUTFSequence(buffer);
// If end is before the end of buffer, the buffer stops in the middle of a UTF
// sequence. This can happen at the end of file.
if(end < buffer.length)
errorMessage_ = "Invalid UTF character at the end of buffer";
const srclength = buffer.length;
// Decoding errors are caught below and turned into an error message, which
// lets this function stay nothrow.
try for(size_t srcpos = 0; srcpos < srclength;)
const c = buffer[srcpos];
// Code units below 0x80 are ASCII and are appended as they are.
if(c < 0x80)
decoded_ ~= c;
// std.utf.decode takes srcpos by reference and advances it past the sequence
// it decodes.
decoded_ ~= std.utf.decode(buffer, srcpos);
catch(UTFException e)
errorMessage_ = e.msg;
// Nothing else is expected to be thrown here; treat it as a program bug.
catch(Exception e)
assert(false, "Unexpected exception in decode(): " ~ e.msg);
// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
size_t endOfLastUTFSequence(C)(const C[] buffer)
@safe pure nothrow const @nogc
static if(is(C == char))
for(long end = buffer.length - 1; end >= 0; --end)
@ -421,27 +430,6 @@ struct UTFDecoder
return end;
/// Determine if all characters in an array are printable.
///
/// A character is accepted if it is a tab (0x09), line feed (0x0A), carriage
/// return (0x0D) or next line (0x85), or if it lies in one of the ranges
/// 0x20-0x7E, 0xA0-0xD7FF or 0xE000-0xFFFD. Everything else, including every
/// code point above 0xFFFD, is rejected.
///
/// Params: chars = Characters to check.
///
/// Returns: True if all the characters are printable, false otherwise.
///          An empty array yields true.
bool printable(const dchar[] chars) @safe pure nothrow @nogc
foreach(c; chars)
// Stop at the first character outside the accepted set.
if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
(c >= 0x20 && c <= 0x7E) ||
(c >= 0xA0 && c <= '\uD7FF') ||
(c >= '\uE000' && c <= '\uFFFD')))
return false;
return true;
// Unittests.