dyaml/source/dyaml/reader.d

805 lines
27 KiB
D
Raw Normal View History

2011-08-16 12:53:13 +00:00
2014-07-21 22:21:42 +00:00
// Copyright Ferdinand Majerech 2011-2014.
2011-08-16 12:53:13 +00:00
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module dyaml.reader;
import core.stdc.stdlib;
2011-08-16 12:53:13 +00:00
import core.stdc.string;
import core.thread;
2011-08-16 12:53:13 +00:00
import std.algorithm;
import std.array;
2011-08-16 12:53:13 +00:00
import std.conv;
import std.exception;
import std.stdio;
import std.string;
import std.system;
import std.typecons;
2011-08-16 12:53:13 +00:00
import std.utf;
import tinyendian;
import dyaml.fastcharsearch;
import dyaml.encoding;
2011-08-16 12:53:13 +00:00
import dyaml.exception;
2014-07-29 00:59:58 +00:00
import dyaml.nogcutil;
2014-07-21 23:04:44 +00:00
2011-08-16 12:53:13 +00:00
package:
2014-07-21 22:33:17 +00:00
2011-08-16 12:53:13 +00:00
/// Exception thrown when the Reader encounters an error.
///
/// Every message is prefixed with "Reader error: " before being passed to the
/// YAMLException base class.
class ReaderException : YAMLException
{
    /// Construct a ReaderException.
    ///
    /// Params: msg  = Description of the error (without the "Reader error: " prefix).
    ///         file = Source file the exception is created in.
    ///         line = Source line the exception is created at.
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe pure nothrow
    {
        const prefixed = "Reader error: " ~ msg;
        super(prefixed, file, line);
    }
}
2014-07-30 21:30:37 +00:00
/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        char[] buffer_ = null;

        // Current position within buffer. Only data after this position can be read.
        size_t bufferOffset_ = 0;

        // Index of the current character in the buffer.
        size_t charIndex_ = 0;
        // Number of characters (code points) in buffer_.
        size_t characterCount_ = 0;

        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // Index to buffer_ just past the last character decoded by peek()/slice(),
        // i.e. where the next lookahead decode starts. Reset to bufferOffset_
        // whenever the position moves or a slow-path decode restarts.
        size_t lastDecodedBufferOffset_ = 0;
        // Number of code points (not chars) decoded ahead of charIndex_ so far;
        // the lookahead counterpart of lastDecodedBufferOffset_.
        size_t lastDecodedCharOffset_ = 0;

        // Number of character decodings done during the life of the Reader.
        //
        // Used for performance testing.
        size_t decodeCount_ = 0;

    public:
        /// Construct a Reader.
        ///
        /// Params: buffer = Buffer with YAML data. This may be e.g. the entire
        ///                  contents of a file or a string. $(B will) be modified by
        ///                  the Reader and other parts of D:YAML (D:YAML tries to
        ///                  reuse the buffer to minimize memory allocations)
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer) @trusted pure //!nothrow
        {
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                // Explicit '~': implicit concatenation of adjacent string literals
                // is deprecated in D.
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(size_t index = 0) @safe pure nothrow @nogc
        {
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence.
            if(index == lastDecodedCharOffset_)
            {
                ++decodeCount_;
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which gets invalidated after other Reader calls.
        ///
        /// Params: length = Number of characters (code points, not bytes) to get. May
        ///                  reach past the end of the buffer; in that case the
        ///                  returned slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure nothrow @nogc
        {
            return slice(length);
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which gets invalidated after other Reader calls.
        ///
        /// Params: end = End of the slice relative to current position. May reach past
        ///               the end of the buffer; in that case the returned slice will
        ///               be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure nothrow @nogc
        {
            // Fast path in case the caller has already peek()ed all the way to end.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            lastDecodedCharOffset_   = 0;
            lastDecodedBufferOffset_ = bufferOffset_;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Must not be called at the end of the buffer: this function is nothrow and
        /// the underlying forward() asserts that there is a character to move past.
        ///
        /// Returns: Next character.
        dchar get() @safe pure nothrow @nogc
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get.
        ///                   The buffer must contain at least this many characters
        ///                   after the current position (forward() asserts this).
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure nothrow @nogc
        {
            auto result = prefix(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates the line/column counters and resets the lookahead state used by
        /// peek() and slice().
        ///
        /// Params:  length = Number of characters to move position forward. Must not
        ///                   exceed the number of characters left in the buffer.
        void forward(size_t length = 1) @safe pure nothrow @nogc
        {
            mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

            for(; length > 0; --length)
            {
                const c = decodeAndAdvanceCurrent();
                // A '\r' is a line break on its own unless directly followed by '\n'
                // (then the '\n' is counted instead). '\r' may be the last byte of
                // the buffer, so check bounds before looking at the next byte.
                // (can compare with '\n' without decoding since it's ASCII)
                const loneCarriageReturn =
                    c == '\r' &&
                    (bufferOffset_ >= buffer_.length || buffer_[bufferOffset_] != '\n');
                // New line.
                if(search.canFind(c) || loneCarriageReturn)
                {
                    ++line_;
                    column_ = 0;
                }
                // The byte order mark does not take up a column.
                else if(c != '\uFEFF') { ++column_; }
            }

            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Used to build slices of read data in Reader; to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a Mark describing current buffer position, used for error messages.
        final Mark mark() @safe pure nothrow const @nogc { return Mark(line_, column_); }

        /// Get current line number.
        final uint line() @safe pure nothrow const @nogc { return line_; }

        /// Get current column number.
        final uint column() @safe pure nothrow const @nogc { return column_; }

        /// Get index of the current character in the buffer.
        final size_t charIndex() @safe pure nothrow const @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }

    private:
        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure nothrow @nogc
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of a string");
            ++decodeCount_;
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
        }

        // Decode the character starting at bufferOffset_ and move to the next
        // character.
        //
        // Used in forward().
        dchar decodeAndAdvanceCurrent() @safe pure nothrow @nogc
        {
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of a string");
            const b = buffer_[bufferOffset_];
            ++charIndex_;
            ++decodeCount_;
            // ASCII
            if(b < 0x80)
            {
                ++bufferOffset_;
                return b;
            }

            return decodeValidUTF8NoGC(buffer_, bufferOffset_);
        }
}
2014-07-29 01:01:16 +00:00
/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
    private:
        // No copying by the user.
        @disable this(this);
        @disable void opAssign(ref SliceBuilder);

        // Reader this builder works in.
        Reader reader_;

        // Start of the slice in reader_.buffer_ (size_t.max while no slice is being built)
        size_t start_ = size_t.max;
        // End of the slice in reader_.buffer_ (size_t.max while no slice is being built)
        size_t end_   = size_t.max;

        // Stack of slice ends to revert to (see Transaction)
        //
        // Very few levels as we don't want arbitrarily nested transactions.
        size_t[4] endStack_;
        // The number of elements currently in endStack_.
        size_t endStackUsed_ = 0;

        // While a slice is being built it must lie within the already read part of
        // the Reader buffer and must not have negative length.
        @safe pure nothrow const @nogc invariant()
        {
            if(!inProgress) { return; }
            assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
            assert(start_ <= end_, "Slice start after slice end");
        }

        // Is a slice currently being built?
        bool inProgress() @safe pure nothrow const @nogc
        {
            assert(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max,
                   "start_/end_ are not consistent");
            return start_ != size_t.max;
        }

    public:
        /// Begin building a slice.
        ///
        /// Only one slice can be built at any given time; before beginning a new slice,
        /// finish the previous one (if any).
        ///
        /// The slice starts at the current position in the Reader buffer. It can only be
        /// extended up to the current position in the buffer; Reader methods get() and
        /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
        /// a string just returned by get() - but not one returned by prefix() unless the
        /// position has changed since the prefix() call.
        void begin() @system pure nothrow @nogc
        {
            assert(!inProgress, "Beginning a slice while another slice is being built");
            assert(endStackUsed_ == 0, "Slice stack not empty at slice begin");

            start_ = reader_.bufferOffset_;
            end_   = reader_.bufferOffset_;
        }

        /// Finish building a slice and return it.
        ///
        /// Any Transactions on the slice must be committed or destroyed before the slice
        /// is finished.
        ///
        /// Returns: The built slice of the Reader buffer. Once a slice is finished the
        ///          builder will no longer change its contents.
        char[] finish() @system pure nothrow @nogc
        {
            assert(inProgress, "finish called without begin");
            assert(endStackUsed_ == 0, "Finishing a slice with running transactions.");

            auto result = reader_.buffer_[start_ .. end_];
            start_ = end_ = size_t.max;
            return result;
        }

        /// Write a string to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// If str is a string returned by a Reader method, and str starts right after the
        /// end of the slice being built, the slice is extended (trivial operation).
        /// Otherwise the contents of str are copied to the end of the slice.
        ///
        /// See_Also: begin
        void write(char[] str) @system pure nothrow @nogc
        {
            assert(inProgress, "write called without begin");
            assert(end_ <= reader_.bufferOffset_,
                   "AT START: Slice ends after buffer position");
            // Check before touching the buffer; otherwise an oversized write would
            // only be noticed (by the invariant) after unread data was overwritten.
            assert(str.length <= reader_.bufferOffset_ - end_,
                   "Trying to write past the buffer position");

            // If str starts at the end of the slice (is a string returned by a Reader
            // method), the data is already in place and we just extend the slice.
            // Otherwise str may still point into the buffer (and overlap the target),
            // so we need memmove rather than a plain copy. Nothing to move for an
            // empty str (whose pointer may be null).
            if(str.length > 0 && str.ptr != reader_.buffer_.ptr + end_)
            {
                core.stdc.string.memmove(reader_.buffer_.ptr + end_, cast(char*)str.ptr,
                                         str.length * char.sizeof);
            }
            end_ += str.length;
        }

        /// Write a character to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// See_Also: begin
        void write(dchar c) @system pure nothrow @nogc
        {
            assert(inProgress, "write called without begin");
            if(c < 0x80)
            {
                assert(end_ < reader_.bufferOffset_,
                       "Trying to write past the buffer position");
                reader_.buffer_[end_++] = cast(char)c;
                return;
            }

            // We need to encode a non-ASCII dchar into UTF-8
            char[4] encodeBuf;
            const bytes = encodeValidCharNoGC(encodeBuf, c);
            assert(end_ + bytes <= reader_.bufferOffset_,
                   "Trying to write past the buffer position");
            reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Insert a character to a specified position in the slice.
        ///
        /// Enlarges the slice by the UTF-8 encoded size of the character (1 char for
        /// ASCII). Note that the slice can only extend up to the current position in
        /// the Reader buffer.
        ///
        /// Params:
        ///
        /// c        = The character to insert.
        /// position = Position to insert the character at in code units, not code points.
        ///            Must not be greater than slice length(); a previously returned
        ///            length() can be used.
        void insert(const dchar c, const size_t position) @system pure nothrow @nogc
        {
            assert(inProgress, "insert called without begin");
            assert(start_ + position <= end_, "Trying to insert after the end of the slice");

            const point       = start_ + position;
            const movedLength = end_ - point;

            // Encode c into UTF-8
            char[4] encodeBuf;
            if(c < 0x80) { encodeBuf[0] = cast(char)c; }
            const size_t bytes = c < 0x80 ? 1 : encodeValidCharNoGC(encodeBuf, c);

            // Check before shifting data so unread buffer contents are never clobbered.
            assert(end_ + bytes <= reader_.bufferOffset_,
                   "Trying to insert past the buffer position");

            // Shift the tail of the slice to make room; the ranges overlap, hence memmove.
            if(movedLength > 0)
            {
                core.stdc.string.memmove(reader_.buffer_.ptr + point + bytes,
                                         reader_.buffer_.ptr + point,
                                         movedLength * char.sizeof);
            }
            reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Get the current length of the slice (in code units).
        size_t length() @safe pure nothrow const @nogc
        {
            return end_ - start_;
        }

        /// A slice building transaction.
        ///
        /// Can be used to save and revert back to slice state.
        struct Transaction
        {
            private:
                // The slice builder affected by the transaction.
                SliceBuilder* builder_ = null;
                // Index of the return point of the transaction in SliceBuilder.endStack_.
                size_t stackLevel_;
                // True after commit() has been called.
                bool committed_;

            public:
                /// Begins a transaction on a SliceBuilder object.
                ///
                /// The transaction must end $(B after) any transactions created within the
                /// transaction but $(B before) the slice is finish()-ed. A transaction can be
                /// ended either by commit()-ing or reverting through the destructor.
                ///
                /// Saves the current state of a slice.
                this(ref SliceBuilder builder) @system pure nothrow @nogc
                {
                    builder_ = &builder;
                    stackLevel_ = builder_.endStackUsed_;
                    builder_.push();
                }

                /// Commit changes to the slice.
                ///
                /// Ends the transaction - can only be called once, and removes the possibility
                /// to revert slice state.
                ///
                /// Does nothing for a default-initialized transaction (the transaction has not
                /// been started yet).
                void commit() @system pure nothrow @nogc
                {
                    assert(!committed_, "Can't commit a transaction more than once");

                    if(builder_ is null) { return; }
                    assert(builder_.endStackUsed_ == stackLevel_ + 1,
                           "Parent transactions don't fully contain child transactions");
                    builder_.apply();
                    committed_ = true;
                }

                /// Destroy the transaction and revert it if it hasn't been committed yet.
                ///
                /// Does nothing for a default-initialized transaction.
                ~this() @system pure nothrow @nogc
                {
                    if(builder_ is null || committed_) { return; }
                    assert(builder_.endStackUsed_ == stackLevel_ + 1,
                           "Parent transactions don't fully contain child transactions");
                    builder_.pop();
                    builder_ = null;
                }
        }

    private:
        // Push the current end of the slice so we can revert to it if needed.
        //
        // Used by Transaction.
        void push() @system pure nothrow @nogc
        {
            assert(inProgress, "push called without begin");
            assert(endStackUsed_ < endStack_.length, "Slice stack overflow");
            endStack_[endStackUsed_++] = end_;
        }

        // Pop the current end of endStack_ and set the end of the slice to the popped
        // value, reverting changes since the old end was pushed.
        //
        // Used by Transaction.
        void pop() @system pure nothrow @nogc
        {
            assert(inProgress, "pop called without begin");
            assert(endStackUsed_ > 0, "Trying to pop an empty slice stack");
            end_ = endStack_[--endStackUsed_];
        }

        // Pop the current end of endStack_, but keep the current end of the slice, applying
        // changes made since pushing the old end.
        //
        // Used by Transaction.
        void apply() @system pure nothrow @nogc
        {
            assert(inProgress, "apply called without begin");
            assert(endStackUsed_ > 0, "Trying to apply an empty slice stack");
            --endStackUsed_;
        }
}
private:
2014-07-26 21:37:56 +00:00
// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // input  = The input buffer to encode.
    // result = A Result struct to put encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            // Write position in utf8. size_t (not int) as it indexes the buffer.
            size_t length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
                if(encodeResult.errorMessage !is null)
                {
                    result.errorMessage = encodeResult.errorMessage;
                    return;
                }
                const bytes = encodeResult.bytes;
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    try
    {
        final switch(encoding)
        {
            case UTFEncoding.UTF_8:
                // Already UTF-8; only validate and count the characters.
                result.utf8 = cast(char[])input;
                const validateResult = result.utf8.validateUTF8NoGC();
                if(!validateResult.valid)
                {
                    result.errorMessage = "UTF-8 validation error after character #" ~
                                          validateResult.characterCount.to!string ~ ": " ~
                                          validateResult.msg;
                }
                result.characterCount = validateResult.characterCount;
                break;
            case UTFEncoding.UTF_16:
                assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
                encode(cast(wchar[])input, result);
                break;
            case UTFEncoding.UTF_32:
                assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
                encode(cast(dchar[])input, result);
                break;
        }
    }
    // Conversion errors are reported through errorMessage to keep toUTF8 nothrow.
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e)  { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}
/// Determine if all characters (code points, not bytes) in a string are printable
/// as defined by YAML.
///
/// Printable characters are tab, line feed, carriage return, U+0085 and the ranges
/// U+0020..U+007E, U+00A0..U+D7FF, U+E000..U+FFFD and U+10000..U+10FFFF.
/// Any other character, including '\0', makes the string non-printable.
///
/// Params:
///
/// chars = String to check. Must be valid UTF-8; it is decoded without validation.
///
/// Returns: true if every character in chars is printable (also for an empty
///          string), false otherwise.
bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
{
    for(size_t b = 0; b < chars.length;)
    {
        // ASCII bytes are taken directly; anything else is decoded, which also
        // moves b past the whole sequence.
        const dchar c = chars[b] < 0x80 ? chars[b++] : decodeValidUTF8NoGC(chars, b);
        const printable = (c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
                          (c >= 0x20 && c <= 0x7E) ||
                          (c >= 0xA0 && c <= '\uD7FF') ||
                          (c >= '\uE000' && c <= '\uFFFD') ||
                          // Supplementary planes are printable in YAML as well.
                          (c >= 0x10000 && c <= 0x10FFFF);
        if(!printable)
        {
            return false;
        }
    }
    return true;
}
2014-07-22 01:33:50 +00:00
// Unittests.
// Check that a reader of type R detects the encoding and byte order of
// BOM-prefixed UTF-16 input ('z' encoded in both byte orders).
void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");

    // Construct a reader over bytes and compare what it detected to expectations.
    void verify(ubyte[] bytes, Encoding wantedEncoding, Endian wantedEndian)
    {
        auto r = new R(bytes);
        assert(r.encoding == wantedEncoding);
        assert(r.endian_ == wantedEndian);
    }

    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    verify(utf16LE, Encoding.UTF_16, Endian.littleEndian);

    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];
    verify(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
// Check peek(), prefix() and forward() of a reader of type R on BOM-prefixed
// UTF-8 input.
void testPeekPrefixForward(R)()
{
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");
    // UTF-8 byte order mark, spelled out so the test does not depend on the
    // std.stream module (removed from Phobos).
    ubyte[] bomUTF8 = [0xEF, 0xBB, 0xBF];
    ubyte[] data = bomUTF8 ~ cast(ubyte[])"data";
    auto reader = new R(data);
    assert(reader.peek() == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    // Peeking past the end yields '\0'.
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}
// Check that a reader of type R reads the same text from BOM-prefixed UTF-8,
// UTF-16 and UTF-32 input (UTF-16/32 in the byte order of the host).
void testUTF(R)()
{
    writeln(typeid(R).toString() ~ ": UTF formats unittest");
    dchar[] data = "data"d.dup;

    // Byte order marks, spelled out so the test does not depend on the
    // std.stream module (removed from Phobos).
    ubyte[] bomUTF8    = [0xEF, 0xBB, 0xBF];
    ubyte[] bomUTF16LE = [0xFF, 0xFE];
    ubyte[] bomUTF16BE = [0xFE, 0xFF];
    ubyte[] bomUTF32LE = [0xFF, 0xFE, 0x00, 0x00];
    ubyte[] bomUTF32BE = [0x00, 0x00, 0xFE, 0xFF];

    // Prefix data with bom, read it through R and check the decoded characters.
    void utf_test(T)(T[] data, ubyte[] bom)
    {
        ubyte[] bytes = bom ~
                        (cast(ubyte[])data)[0 .. data.length * T.sizeof];
        auto reader = new R(bytes);
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    const bigEndian = endian == Endian.bigEndian;
    utf_test!char(to!(char[])(data), bomUTF8);
    utf_test!wchar(to!(wchar[])(data), bigEndian ? bomUTF16BE : bomUTF16LE);
    utf_test(data, bigEndian ? bomUTF32BE : bomUTF32LE);
}
2014-07-21 07:53:58 +00:00
// Check that a reader of type R handles input consisting of a single byte.
void test1Byte(R)()
{
    writeln(typeid(R).toString() ~ ": 1 byte file unittest");

    // A lone ASCII 'a'.
    ubyte[] singleByte = [0x61];
    auto r = new R(singleByte);

    assert(r.peek() == 'a');
    // Anything past the only character reads as '\0'.
    assert(r.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}
// Run all reader tests against the Reader class.
unittest
{
    alias TestedReader = Reader;

    testEndian!TestedReader();
    testPeekPrefixForward!TestedReader();
    testUTF!TestedReader();
    test1Byte!TestedReader();
}