dyaml/source/dyaml/reader.d

709 lines
25 KiB
D
Raw Normal View History

2011-08-16 12:53:13 +00:00
2014-07-21 22:21:42 +00:00
// Copyright Ferdinand Majerech 2011-2014.
2011-08-16 12:53:13 +00:00
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module dyaml.reader;
import core.stdc.stdlib;
2011-08-16 12:53:13 +00:00
import core.stdc.string;
import core.thread;
2011-08-16 12:53:13 +00:00
import std.algorithm;
import std.conv;
import std.exception;
import std.stdio;
import std.string;
import std.system;
import std.utf;
import tinyendian;
import dyaml.fastcharsearch;
import dyaml.encoding;
2011-08-16 12:53:13 +00:00
import dyaml.exception;
2014-07-21 23:04:44 +00:00
import dyaml.streamcompat;
2011-08-16 12:53:13 +00:00
package:
2014-07-21 22:33:17 +00:00
//XXX VIM STUFF:
//XXX THE f/t COLORING PLUGIN, AND TRY TO REMOVE THE f/t AUTOREPEAT PLUGIN
// (AND MAYBE DO THE REPEAT WITH ALT-T/ALT-F
//XXX DDOC snippets such as $D, $BIGOH, anything else
// OR MAYBE JUST $ - EXPANDING TO $(${1} ${2})
// WHERE DEFAULT ${1} IS 'D' AND SPECIAL SNIPPETS FOR SPECIFIC DDOC MACROS
// (E.G. XREF HAS 2 ARGS)
// XXX DON'T FORGET TO COMMIT DSNIPS CHANGES
// XXX SNIPPETS: WHY CAN'T WE USE NEW IN NEW? FIX!
// XXX ALSO WRITELN VISUAL! (print whatever we have selected)
// XXX AND ``fun`` VISUAL TOO!
// XXX snippet to print variable along its name AND
// OR MULTIPLE VARS - USE std.format!
2011-08-16 12:53:13 +00:00
///Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    /// Construct a ReaderException.
    ///
    /// Params: msg  = Description of the error; it is prefixed with "Reader error: ".
    ///         file = Source file where the exception is created.
    ///         line = Source line where the exception is created.
    this(string msg, string file = __FILE__, int line = __LINE__) @safe pure nothrow
    {
        // Tag the message so Reader failures are recognizable in error output.
        const fullMessage = "Reader error: " ~ msg;
        super(fullMessage, file, line);
    }
}
/// Lazily reads and decodes data from a buffer, only storing as much as needed at any
/// moment.
2014-07-21 23:04:44 +00:00
///
/// Adds a '\0' to the end of the data.
2011-08-16 12:53:13 +00:00
final class Reader
{
private:
// Allocated space for buffer_.
dchar[] bufferAllocated_ = null;
2014-07-21 22:41:13 +00:00
// Buffer of currently loaded characters.
dchar[] buffer_ = null;
2014-07-21 22:41:13 +00:00
// Current position within buffer. Only data after this position can be read.
2011-08-16 12:53:13 +00:00
uint bufferOffset_ = 0;
// Index of the current character in the buffer.
2011-08-16 12:53:13 +00:00
size_t charIndex_ = 0;
2014-07-21 22:41:13 +00:00
// Current line in file.
2011-08-16 12:53:13 +00:00
uint line_;
2014-07-21 22:41:13 +00:00
// Current column in file.
2011-08-16 12:53:13 +00:00
uint column_;
2014-07-21 22:41:13 +00:00
// Decoder reading data from file and decoding it to UTF-32.
UTFFastDecoder decoder_;
2011-08-16 12:53:13 +00:00
version(unittest)
{
// Endianness of the input before it was converted (for testing)
Endian endian_;
}
2011-08-16 12:53:13 +00:00
public:
import std.stream;
2014-07-21 22:41:13 +00:00
/// Construct a Reader.
///
/// Params: stream = Input stream. Must be readable and seekable.
///
/// Throws: ReaderException if the stream is invalid.
this(Stream stream) @trusted //!nothrow
{
    // Pull the stream contents into memory, then run them through the
    // byte order fix-up (both helpers are defined outside this file).
    auto rawBytes   = streamToBytesGC(stream);
    auto normalized = fixUTFByteOrder(rawBytes);

    // A non-zero stripped byte count is reported as a misaligned
    // UTF-16/UTF-32 input.
    const bool misaligned = normalized.bytesStripped > 0;
    if(misaligned)
    {
        throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                  "to 2 or 4 bytes, respectively");
    }

    // The original endianness is only recorded for the unittests.
    version(unittest) { endian_ = normalized.endian; }

    decoder_ = UTFFastDecoder(normalized.array, normalized.encoding);
}
2014-07-21 07:56:41 +00:00
@trusted nothrow @nogc ~this()
{
    // The buffer is allocated with malloc/realloc (see bufferReserve), so it
    // must be released manually; there is nothing to do if it was never created.
    if(bufferAllocated_ !is null)
    {
        free(bufferAllocated_.ptr);
        bufferAllocated_ = null;
        buffer_          = null;
    }
}
2014-07-21 22:41:13 +00:00
/// Get character at specified index relative to current position.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Params: index = Index of the character to get relative to current position
/// in the buffer.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Returns: Character at specified position.
2014-07-21 23:10:23 +00:00
///
/// Throws: ReaderException if trying to read past the end of the buffer
2014-07-21 22:41:13 +00:00
/// or if invalid data is read.
dchar peek(size_t index = 0) @trusted
{
    // Characters that must be available counting from the current position.
    const needed = index + 1;
    if(buffer_.length < bufferOffset_ + needed)
    {
        updateBuffer(needed);
    }

    // updateBuffer resets bufferOffset_ and loads only as much as is available,
    // so the absolute position is computed (and validated) afterwards.
    const position = bufferOffset_ + index;
    if(position >= buffer_.length)
    {
        throw new ReaderException("Trying to read past the end of the buffer");
    }

    return buffer_[position];
}
2014-07-21 22:41:13 +00:00
/// Get specified number of characters starting at current position.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Note: This gets only a "view" into the internal buffer,
/// which WILL get invalidated after other Reader calls.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Params: length = Number of characters to get.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Returns: Characters starting at current position or an empty slice if out of bounds.
const(dstring) prefix(size_t length) @safe
{
    // A prefix is simply a slice that starts at the current position.
    enum size_t fromCurrent = 0;
    return slice(fromCurrent, length);
}
2014-07-21 22:41:13 +00:00
/// Get a slice view of the internal buffer.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Note: This gets only a "view" into the internal buffer,
/// which WILL get invalidated after other Reader calls.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Params: start = Start of the slice relative to current position.
/// end = End of the slice relative to current position.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Returns: Slice into the internal buffer or an empty slice if out of bounds.
const(dstring) slice(size_t start, size_t end) @trusted
{
    // Load more data if the requested end is not yet inside the buffer.
    const bool needMore = bufferOffset_ + end >= buffer_.length;
    if(needMore)
    {
        updateBuffer(end);
    }

    // Translate to absolute buffer indices only now: updateBuffer may have
    // changed bufferOff_ by compacting the buffer. Clamp the end to what
    // is actually loaded.
    const first = bufferOffset_ + start;
    const last  = min(buffer_.length, bufferOffset_ + end);

    return last > first ? cast(dstring)buffer_[first .. last] : "";
}
/// Get the next character, moving buffer position beyond it.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Returns: Next character.
2014-07-21 23:10:23 +00:00
///
/// Throws: ReaderException if trying to read past the end of the buffer
2014-07-21 22:41:13 +00:00
/// or if invalid data is read.
dchar get() @safe
{
    // Read the current character first, then step over it.
    const current = peek(0);
    forward(1);
    return current;
}
/// Get specified number of characters, moving buffer position beyond them.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Params: length = Number of characters to get.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Returns: Characters starting at current position.
2014-07-21 23:10:23 +00:00
///
/// Throws: ReaderException if trying to read past the end of the buffer
2014-07-21 22:41:13 +00:00
/// or if invalid data is read.
dstring get(size_t length) @safe
{
    // prefix() only returns a view into the internal buffer, so take an
    // immutable copy before forward() gets a chance to invalidate it.
    dstring copy = prefix(length).idup;
    forward(length);
    return copy;
}
2014-07-21 22:41:13 +00:00
/// Move current position forward.
2014-07-21 23:10:23 +00:00
///
2014-07-21 22:41:13 +00:00
/// Params: length = Number of characters to move position forward.
2014-07-21 23:10:23 +00:00
///
/// Throws: ReaderException if trying to read past the end of the buffer
2014-07-21 22:41:13 +00:00
/// or if invalid data is read.
void forward(size_t length = 1) @trusted
{
    // Try to have `length` characters plus one lookahead character loaded
    // (the lookahead is needed to tell "\r\n" from a lone '\r').
    if(buffer_.length <= bufferOffset_ + length + 1)
    {
        updateBuffer(length + 1);
    }

    // updateBuffer loads only as much as is available. Refuse to step past
    // the loaded data instead of indexing outside the buffer, as documented
    // in the Throws section (and consistent with peek()).
    if(buffer_.length < bufferOffset_ + length)
    {
        throw new ReaderException("Trying to read past the end of the buffer");
    }

    mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

    while(length > 0)
    {
        const c = buffer_[bufferOffset_];
        ++bufferOffset_;
        ++charIndex_;

        // New line: any of the characters in the search set, or a '\r' that is
        // not immediately followed by '\n' ("\r\n" is counted once, at the '\n').
        // A '\r' with nothing loaded after it is treated as a line break too.
        const bool loneCR = c == '\r' &&
                            (bufferOffset_ >= buffer_.length ||
                             buffer_[bufferOffset_] != '\n');
        if(search.canFind(c) || loneCR)
        {
            ++line_;
            column_ = 0;
        }
        // U+FEFF does not advance the column.
        else if(c != '\uFEFF') { ++column_; }

        --length;
    }
}
/// Get a string describing current buffer position, used for error messages.
2014-07-21 23:23:15 +00:00
final Mark mark() @safe pure nothrow const @nogc
{
    // A Mark is built from the current line and column.
    return Mark(line_, column_);
}

/// Get current line number.
final uint line() @safe pure nothrow const @nogc
{
    return line_;
}

/// Get current column number.
final uint column() @safe pure nothrow const @nogc
{
    return column_;
}

/// Get index of the current character in the buffer.
final size_t charIndex() @safe pure nothrow const @nogc
{
    return charIndex_;
}

/// Get encoding of the input buffer.
final Encoding encoding() @safe pure nothrow const @nogc
{
    // The decoder remembers the encoding it was constructed with.
    return decoder_.encoding;
}
2011-08-16 12:53:13 +00:00
private:
2014-07-21 22:41:13 +00:00
// Update buffer to be able to read length characters after buffer offset.
2014-07-21 23:10:23 +00:00
//
// If there are not enough characters in the buffer, it will get
2014-07-21 22:41:13 +00:00
// as many as possible.
2014-07-21 23:10:23 +00:00
//
2014-07-21 22:41:13 +00:00
// Params: length = Number of characters we need to read.
2014-07-21 23:10:23 +00:00
//
// Throws: ReaderException if trying to read past the end of the buffer
2014-07-21 22:41:13 +00:00
// or if invalid data is read.
2014-07-21 07:57:59 +00:00
void updateBuffer(const size_t length) @system
{
    // Drop the already consumed characters by moving the unread tail
    // to the start of the buffer.
    if(bufferOffset_ != 0)
    {
        const size_t remaining = buffer_.length - bufferOffset_;
        memmove(buffer_.ptr, buffer_.ptr + bufferOffset_,
                remaining * dchar.sizeof);
        buffer_       = buffer_[0 .. remaining];
        bufferOffset_ = 0;
    }

    // Characters requested from the decoder per iteration.
    enum size_t batchChars = 512;

    // Keep loading until more than `length` characters are available
    // or the decoder runs out of input.
    while(buffer_.length <= bufferOffset_ + length)
    {
        loadChars(batchChars);
        if(!decoder_.done) { continue; }

        // End of input: make sure the data ends with a single '\0'.
        const bool terminated = buffer_.length != 0 && buffer_[$ - 1] == '\0';
        if(!terminated)
        {
            const newLength = buffer_.length + 1;
            bufferReserve(newLength);
            buffer_ = bufferAllocated_[0 .. newLength];
            buffer_[newLength - 1] = '\0';
        }
        break;
    }
}
2014-07-21 22:41:13 +00:00
// Load more characters to the buffer.
2014-07-21 23:10:23 +00:00
//
2014-07-21 22:41:13 +00:00
// Params: chars = Recommended number of characters to load.
// More characters might be loaded.
// Less will be loaded if not enough available.
2014-07-21 23:10:23 +00:00
//
2014-07-21 22:41:13 +00:00
// Throws: ReaderException on Unicode decoding error,
// if nonprintable characters are detected, or
// if there is an error reading from the buffer.
2014-07-21 23:10:23 +00:00
//
void loadChars(size_t chars) @system
{
    const startLength   = buffer_.length;
    const startPosition = decoder_.position;

    // Optimistically grow the buffer by the full requested amount.
    const grownLength = startLength + chars;
    bufferReserve(grownLength);
    buffer_ = bufferAllocated_[0 .. grownLength];

    // On normal exit `chars` holds the number of characters that could NOT
    // be loaded; trim that unused tail, then validate what was loaded.
    scope(success)
    {
        buffer_ = buffer_[0 .. $ - chars];
        if(!printable(buffer_[startLength .. $]))
        {
            throw new ReaderException("Special unicode characters are not allowed");
        }
    }

    // Number of characters copied into the buffer so far.
    size_t loaded = 0;
    while(chars != 0 && !decoder_.done)
    {
        const decoded = decoder_.getDChars(chars);
        // A null slice signals a decoding error; the decoder holds the details.
        if(decoded is null)
        {
            const msg = decoder_.getAndClearErrorMessage();
            throw new ReaderException(
                format("Unicode decoding error between bytes %s and %s : %s",
                       startPosition, decoder_.position, msg));
        }

        const destination = startLength + loaded;
        buffer_[destination .. destination + decoded.length] = decoded[];
        loaded += decoded.length;
        chars  -= decoded.length;
    }
}
2014-07-21 22:41:13 +00:00
// Code shared by loadEntireFile methods.
void loadEntireFile_() @system
{
    // Reserve room for every character the decoder may produce plus the
    // terminating '\0', then request all of them at once.
    const charCount = decoder_.maxChars;
    bufferReserve(charCount + 1);
    loadChars(charCount);

    // Append the '\0' terminator unless the data already ends with one.
    const bool terminated = buffer_.length != 0 && buffer_[$ - 1] == '\0';
    if(!terminated)
    {
        const newLength = buffer_.length + 1;
        buffer_ = bufferAllocated_[0 .. newLength];
        buffer_[newLength - 1] = '\0';
    }
}
2014-07-21 22:41:13 +00:00
// Ensure there is space for at least capacity characters in bufferAllocated_.
2014-07-21 07:57:59 +00:00
void bufferReserve(const size_t capacity) @system nothrow
{
    import core.exception : onOutOfMemoryError;

    // Already large enough: nothing to do.
    if(bufferAllocated_ !is null && bufferAllocated_.length >= capacity) { return; }

    // Handle first allocation as well as reallocation.
    const size_t bytes = capacity * dchar.sizeof;
    auto ptr = bufferAllocated_ !is null
               ? realloc(bufferAllocated_.ptr, bytes)
               : malloc(bytes);

    // malloc/realloc report failure by returning null. Slicing a null pointer
    // would give a "buffer" that crashes (or corrupts memory) on first write,
    // so report out-of-memory the way the D runtime does. A null result for a
    // zero-byte request is permitted by C and is not an error.
    if(ptr is null && bytes != 0) { onOutOfMemoryError(); }

    bufferAllocated_ = (cast(dchar*)ptr)[0 .. capacity];
    // buffer_ is a view into bufferAllocated_; re-point it at the new memory,
    // keeping its length.
    buffer_ = bufferAllocated_[0 .. buffer_.length];
}
}
private:
alias UTFBlockDecoder!512 UTFFastDecoder;
/// Decodes a buffer to UTF-32 in blocks.
struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
{
private:
2014-07-21 22:33:17 +00:00
// UTF-8 sequence lengths indexed by first byte (0xFF marks bytes that can't start a sequence).
static immutable ubyte[256] utf8Stride =
[
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
// Encoding of the input buffer.
UTFEncoding encoding_;
// Maximum number of characters that might be in the buffer.
size_t maxChars_;
// The entire input buffer.
ubyte[] inputAll_;
// Part of the input buffer that has not yet been decoded.
ubyte[] input_;
2014-07-21 22:33:17 +00:00
// Buffer used to store raw UTF-8 or UTF-16 code points.
union
{
char[bufferSize_] rawBuffer8_;
wchar[bufferSize_ / 2] rawBuffer16_;
}
2014-07-21 22:33:17 +00:00
// Used space (in items) in rawBuffer8_/rawBuffer16_.
size_t rawUsed_;
// Space used by decoded_.
dchar[bufferSize_] decodedSpace_;
// Buffer of decoded, UTF-32 characters. This is a slice into decodedSpace_.
dchar[] decoded_;
2014-07-22 01:31:56 +00:00
// Current error message.
//
// To be fully nothrow, we use return values and the user (Reader) can check
// for a detailed error message if they get an error return.
string errorMessage_;
public:
/// Construct a UTFBlockDecoder decoding data from a buffer.
this(ubyte[] buffer, UTFEncoding encoding) @safe pure nothrow @nogc
{
    inputAll_ = buffer;
    input_    = inputAll_[];
    encoding_ = encoding;

    // Upper bound on the number of characters: input size divided by the
    // size of the smallest possible encoded character.
    final switch(encoding_)
    {
        case UTFEncoding.UTF_8:  maxChars_ = input_.length;     break;
        case UTFEncoding.UTF_16: maxChars_ = input_.length / 2; break;
        // UTF-32 characters are always 4 bytes.
        case UTFEncoding.UTF_32: maxChars_ = input_.length / 4; break;
    }
}
/// Get maximum number of characters that might be in the buffer.
2014-07-22 00:11:31 +00:00
size_t maxChars() const pure @safe nothrow @nogc
{
    return maxChars_;
}

/// Get encoding we're decoding from.
UTFEncoding encoding() const pure @safe nothrow @nogc
{
    return encoding_;
}

/// Get the current position in buffer.
size_t position() @safe pure nothrow const @nogc
{
    // Bytes of the whole input that are no longer in the undecoded part.
    const consumed = inputAll_.length - input_.length;
    return consumed;
}
2014-07-22 01:31:56 +00:00
/// Get the error message and clear it.
///
/// Can only be used in case of an error return from e.g. getDChars().
string getAndClearErrorMessage() @safe pure nothrow @nogc
{
    assert(errorMessage_ !is null,
           "Trying to get an error message when there's no error");
    // Save the message before clearing the field, and return the saved copy
    // (returning the field itself would always yield null).
    const result = errorMessage_;
    errorMessage_ = null;
    return result;
}
2014-07-21 22:33:17 +00:00
/// Are we done decoding?
2014-07-22 00:11:31 +00:00
bool done() const pure @safe nothrow @nogc
{
    // Work remains while there are raw code units waiting to be decoded,
    // decoded characters not yet handed out, or unread input bytes.
    if(rawUsed_ != 0)        { return false; }
    if(decoded_.length != 0) { return false; }
    return input_.length == 0;
}
/// Get as many characters as possible, but at most maxChars.
///
/// Returns: A slice with decoded characters or NULL on failure (in that case,
/// check getAndClearErrorMessage()). The slice $(B will) be invalidated
/// in further calls.
const(dchar[]) getDChars(size_t maxChars = size_t.max) @safe pure nothrow
{
    // Refill the decoded buffer until it has something to hand out.
    while(decoded_.length == 0)
    {
        assert(input_.length > 0 || rawUsed_ > 0);
        updateBuffer();
        // updateBuffer may fail; it reports that through errorMessage_.
        if(errorMessage_ !is null) { return null; }
    }

    // Hand out at most maxChars characters and keep the rest for later calls.
    const count = min(decoded_.length, maxChars);
    const chunk = decoded_[0 .. count];
    decoded_ = decoded_[count .. $];
    assert(chunk !is null,
           "NULL error on a getDChars call without an error");
    return chunk;
}
private:
// Read and decode characters from file and store them in the buffer.
//
// On error, errorMessage_ will be set.
void updateBuffer() @trusted pure nothrow
{
    assert(decoded_.length == 0,
           "updateBuffer can only be called when the buffer is empty");

    final switch(encoding_)
    {
        case UTFEncoding.UTF_8:
            // Top up the raw buffer behind the code units left over from the
            // previous call, then decode whatever forms complete sequences.
            const count  = min(bufferSize_ - rawUsed_, input_.length);
            // Current length of valid data in rawBuffer8_.
            const filled = rawUsed_ + count;
            rawBuffer8_[rawUsed_ .. filled] = cast(char[])input_[0 .. count];
            input_ = input_[count .. $];
            decodeRawBuffer(rawBuffer8_, filled);
            break;

        case UTFEncoding.UTF_16:
            // Same as UTF-8, but in 2-byte code units.
            const count  = min((bufferSize_ / 2) - rawUsed_, input_.length / 2);
            // Current length of valid data in rawBuffer16_.
            const filled = rawUsed_ + count;
            auto units = cast(const(wchar)*)input_.ptr;
            foreach(i; 0 .. count) { rawBuffer16_[rawUsed_ + i] = units[i]; }
            input_ = input_[count * 2 .. $];
            decodeRawBuffer(rawBuffer16_, filled);
            break;

        case UTFEncoding.UTF_32:
            // UTF-32 code units are copied straight into the decoded buffer.
            const count = min(bufferSize_ / 4, input_.length / 4);
            auto units = cast(const(dchar)*)input_.ptr;
            foreach(i; 0 .. count) { decodedSpace_[i] = units[i]; }
            input_   = input_[count * 4 .. $];
            decoded_ = decodedSpace_[0 .. count];
            break;
    }
}
// Decode contents of a UTF-8 or UTF-16 raw buffer.
2014-07-22 01:31:56 +00:00
//
// On error, errorMessage_ will be set.
void decodeRawBuffer(C)(C[] buffer, const size_t length)
    @safe pure nothrow
{
    // Length of the leading part of the raw buffer that consists of
    // complete sequences and can therefore be decoded.
    const complete = endOfLastUTFSequence(buffer, length);

    // No complete sequence at all; this can happen at the end of input if
    // the data stops in the middle of a sequence.
    if(complete == 0)
    {
        errorMessage_ = "Invalid UTF-8 character at the end of buffer";
        return;
    }

    decodeUTF(buffer[0 .. complete]);
    if(errorMessage_ !is null) { return; }

    // Code units that were not decoded are moved to the start of the raw
    // buffer so the next refill can append to them.
    const leftover = length - complete;
    rawUsed_ = leftover;
    for(size_t i = 0; i < leftover; ++i)
    {
        buffer[i] = buffer[complete + i];
    }
}
// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
2014-07-21 07:56:41 +00:00
size_t endOfLastUTFSequence(C)(const C[] buffer, const size_t max)
    @safe pure nothrow const @nogc
{
    static if(is(C == char))
    {
        // Walk backwards from the last valid byte to the nearest byte that can
        // start a sequence. The index stays unsigned throughout, so max == 0
        // simply skips the loop on every target (no size_t -> long wrap-around).
        for(size_t end = max; end > 0;)
        {
            --end;
            const s = utf8Stride[buffer[end]];
            // 0xFF marks a byte that cannot start a sequence: keep looking.
            if(s == 0xFF) { continue; }

            // If the stride goes beyond the valid data (max), the last sequence
            // is incomplete and the decodable part ends where it starts.
            // Otherwise the last sequence ends at max, so we can return that.
            // (Unless there is an invalid code point, which is caught at decoding.)
            return (s > max - end) ? end : max;
        }
        return 0;
    }
    else
    {
        // Step forward sequence by sequence, stopping before the first one
        // that would extend past max.
        size_t end = 0;
        while(end < max)
        {
            const s = stride(buffer, end);
            if(s + end > max) { break; }
            end += s;
        }
        return end;
    }
}
// Decode a UTF-8 or UTF-16 buffer (with no incomplete sequences at the end).
//
// On error, sets errorMessage_.
void decodeUTF(C)(const C[] source) @safe pure nothrow
{
    size_t outPos = 0;
    size_t inPos  = 0;
    while(inPos < source.length)
    {
        const unit = source[inPos];

        // Fast path: code units below 0x80 are copied without decoding.
        if(unit < 0x80)
        {
            decodedSpace_[outPos++] = unit;
            ++inPos;
            continue;
        }

        // Everything else goes through decode(), which advances inPos itself.
        try
        {
            decodedSpace_[outPos++] = decode(source, inPos);
        }
        catch(UTFException e)
        {
            // Report through errorMessage_ so this function can stay nothrow.
            errorMessage_ = e.msg;
            return;
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in Reader.decodeUTF " ~ e.msg);
        }
    }
    decoded_ = decodedSpace_[0 .. outPos];
}
}
/// Determine if all characters in an array are printable.
2014-07-21 07:56:41 +00:00
///
/// Params: chars = Characters to check.
2014-07-21 07:56:41 +00:00
///
/// Returns: True if all the characters are printable, false otherwise.
2014-07-21 07:56:41 +00:00
bool printable(const dchar[] chars) @safe pure nothrow @nogc
{
    // Accepted ranges follow the YAML c-printable production:
    //   #x9 | #xA | #xD | [#x20-#x7E] | #x85 | [#xA0-#xD7FF]
    //   | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    foreach(c; chars)
    {
        const bool controlOk  = c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85;
        const bool ascii      = c >= 0x20 && c <= 0x7E;
        const bool bmpLow     = c >= 0xA0 && c <= '\uD7FF';
        const bool bmpHigh    = c >= '\uE000' && c <= '\uFFFD';
        // Supplementary planes (e.g. emoji) are printable as well.
        const bool supplement = c >= 0x10000 && c <= 0x10FFFF;

        if(!(controlOk || ascii || bmpLow || bmpHigh || supplement))
        {
            return false;
        }
    }
    return true;
}
//Unittests.
import std.stream;
void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");

    // Build a reader over the given bytes and check what it detected.
    void checkDetection(ubyte[] input, Encoding wantEncoding, Endian wantEndian)
    {
        auto reader = new R(new MemoryStream(input));
        assert(reader.encoding == wantEncoding);
        assert(reader.endian_ == wantEndian);
    }

    // The same two-unit UTF-16 input (BOM followed by 0x007A) in both byte orders.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    checkDetection(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    checkDetection(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
void testPeekPrefixForward(R)()
{
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");

    ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new R(new MemoryStream(data));

    // Every character is reachable through peek(), followed by the '\0'
    // the Reader appends at the end of the data.
    immutable dchar[] expected = ['d', 'a', 't', 'a', '\0'];
    foreach(i, ch; expected)
    {
        assert(reader.peek(i) == ch);
    }

    // Prefixes are clamped to the available data (including the '\0').
    assert(reader.prefix(4) == "data");
    assert(reader.prefix(6) == "data\0");

    // After skipping "da", only "ta\0" is left; peeking further must fail.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    assert(collectException(reader.peek(3)));
}
void testUTF(R)()
{
    writeln(typeid(R).toString() ~ ": UTF formats unittest");

    dchar[] data = cast(dchar[])"data";

    // Prepend the BOM to the raw bytes of `data`, then check that the reader
    // yields the four characters regardless of the source encoding.
    void utf_test(T)(T[] data, BOM bom)
    {
        const rawSize = data.length * T.sizeof;
        ubyte[] bytes = ByteOrderMarks[bom] ~ (cast(ubyte*)data.ptr)[0 .. rawSize];
        auto reader = new R(new MemoryStream(bytes));

        immutable dchar[] expected = ['d', 'a', 't', 'a'];
        foreach(i, ch; expected)
        {
            assert(reader.peek(i) == ch);
        }
    }

    // The test data is in native byte order, so pick the matching BOM.
    const bool bigEndian = endian == Endian.bigEndian;
    utf_test!char(to!(char[])(data), BOM.UTF8);
    utf_test!wchar(to!(wchar[])(data), bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    utf_test(data, bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}
2014-07-21 07:53:58 +00:00
void test1Byte(R)()
{
    writeln(typeid(R).toString() ~ ": 1 byte file unittest");

    // A single byte, 97 ('a'), with no BOM.
    ubyte[] singleByte = [97];
    auto reader = new R(new MemoryStream(singleByte));

    // The character itself, then the appended '\0', then nothing.
    assert(reader.peek(0) == 'a');
    assert(reader.peek(1) == '\0');
    assert(collectException(reader.peek(2)));
}
unittest
{
    // All test templates are instantiated with the one Reader implementation.
    alias R = Reader;

    testEndian!R();
    testPeekPrefixForward!R();
    testUTF!R();
    test1Byte!R();
}