992 lines
36 KiB
D
992 lines
36 KiB
D
|
|
// Copyright Ferdinand Majerech 2011-2014.
|
|
// Distributed under the Boost Software License, Version 1.0.
|
|
// (See accompanying file LICENSE_1_0.txt or copy at
|
|
// http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
module dyaml.reader;
|
|
|
|
|
|
import core.stdc.stdlib;
|
|
import core.stdc.string;
|
|
import core.thread;
|
|
|
|
import std.algorithm;
|
|
import std.array;
|
|
import std.conv;
|
|
import std.exception;
|
|
import std.range;
|
|
import std.string;
|
|
import std.system;
|
|
import std.typecons;
|
|
import std.utf;
|
|
|
|
import tinyendian;
|
|
|
|
import dyaml.encoding;
|
|
import dyaml.exception;
|
|
|
|
/// Matches any YAML line-break code point: LF, NEL (U+0085), LS (U+2028)
/// or PS (U+2029). `among` returns a non-zero (1-based) index on a match
/// and 0 otherwise, so this is usable directly as a boolean predicate.
/// Note: carriage return ('\r') is not included; CR and CRLF handling is
/// done explicitly by Reader's line/column bookkeeping.
alias isBreak = among!('\n', '\u0085', '\u2028', '\u2029');

package:
|
|
|
|
|
|
/// Exception thrown at Reader errors.
///
/// Carries the throw site (file/line) and prefixes every message with
/// "Reader error: " so callers can tell Reader failures apart.
class ReaderException : YAMLException
{
    /// Construct with an error message; file and line default to the throw site.
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe pure nothrow
    {
        enum prefix = "Reader error: ";
        super(prefix ~ msg, file, line);
    }
}
|
|
|
|
/// Provides an API to read characters from a UTF-8 buffer and build slices into that
/// buffer to avoid allocations (see SliceBuilder).
///
/// Input of any supported encoding (UTF-8/16/32) is converted to UTF-8 once, up
/// front, in the constructor; all subsequent access works on the UTF-8 buffer.
final class Reader
{
    private:
        // Buffer of currently loaded characters (always UTF-8 after construction).
        char[] buffer_ = null;

        // Current position within buffer (in bytes). Only data after this position
        // can be read.
        size_t bufferOffset_ = 0;

        // Index of the current character (code point, not byte) in the buffer.
        size_t charIndex_ = 0;
        // Number of characters (code points) in buffer_.
        size_t characterCount_ = 0;

        // Current line in file (0-based).
        uint line_;
        // Current column in file (0-based).
        uint column_;

        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

        // The number of consecutive ASCII characters starting at bufferOffset_.
        //
        // Used to minimize UTF-8 decoding: within this run, one char == one byte,
        // so peek()/forward() can index the buffer directly.
        size_t upcomingASCII_ = 0;

        // Index to buffer_ where the last decoded character starts.
        //
        // Together with lastDecodedCharOffset_ this caches the decode position so
        // that sequential peek(0), peek(1), peek(2)... calls decode each character
        // only once.
        size_t lastDecodedBufferOffset_ = 0;
        // Offset, relative to charIndex_, of the last decoded character,
        // in code points, not chars.
        size_t lastDecodedCharOffset_ = 0;

    public:
        /// Construct a Reader.
        ///
        /// Params:  buffer = Buffer with YAML data. This may be e.g. the entire
        ///                   contents of a file or a string. $(B will) be modified by
        ///                   the Reader and other parts of D:YAML (D:YAML tries to
        ///                   reuse the buffer to minimize memory allocations)
        ///
        /// Throws:  ReaderException on a UTF decoding error or if there are
        ///          nonprintable Unicode characters illegal in YAML.
        this(ubyte[] buffer) @safe pure
        {
            // Detect encoding from the BOM and normalize byte order in place.
            auto endianResult = fixUTFByteOrder(buffer);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            // Convert to UTF-8 (in place where possible) and validate.
            auto utf8Result = toUTF8(endianResult.array, endianResult.encoding);
            const msg = utf8Result.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("Error when converting to UTF-8: " ~ msg);
            }

            buffer_ = utf8Result.utf8;

            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer_),
                    new ReaderException("Special unicode characters are not allowed"));

            this.sliceBuilder = SliceBuilder(this);
            checkASCII();
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Character at specified position or '\0' if outside of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(const size_t index) @safe pure
        {
            // Fast path: still inside the known run of ASCII bytes - no decoding.
            if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
            if(characterCount_ <= charIndex_ + index)
            {
                // XXX This is risky; revert this if bugs are introduced. We rely on
                // the assumption that Reader only uses peek() to detect end of buffer.
                // The test suite passes.
                // Revert this case here and in other peek() versions if this causes
                // errors.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            // Optimized path for Scanner code that peeks chars in linear order to
            // determine the length of some sequence (reuses the cached decode
            // position from the previous peek()).
            if(index == lastDecodedCharOffset_)
            {
                ++lastDecodedCharOffset_;
                const char b = buffer_[lastDecodedBufferOffset_];
                // ASCII
                if(b < 0x80)
                {
                    ++lastDecodedBufferOffset_;
                    return b;
                }
                // decode() advances lastDecodedBufferOffset_ past the sequence.
                return decode(buffer_, lastDecodedBufferOffset_);
            }

            // 'Slow' path where we decode everything up to the requested character.
            // Skip over the leading ASCII run first; only the rest needs decoding.
            const asciiToTake = min(upcomingASCII_, index);
            lastDecodedCharOffset_ = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;
            dchar d;
            while(lastDecodedCharOffset_ <= index)
            {
                d = decodeNext();
            }

            return d;
        }

        /// Optimized version of peek() for the case where peek index is 0.
        dchar peek() @safe pure
        {
            if(upcomingASCII_ > 0) { return buffer_[bufferOffset_]; }
            if(characterCount_ <= charIndex_) { return '\0'; }

            lastDecodedCharOffset_ = 0;
            lastDecodedBufferOffset_ = bufferOffset_;
            return decodeNext();
        }

        /// Get byte at specified index relative to current position.
        ///
        /// Params:  index = Index of the byte to get relative to current position
        ///                  in the buffer. Can point outside of the buffer; In that
        ///                  case, '\0' will be returned.
        ///
        /// Returns: Byte at specified position or '\0' if outside of the buffer.
        char peekByte(const size_t index) @safe pure nothrow @nogc
        {
            return characterCount_ > (charIndex_ + index) ? buffer_[bufferOffset_ + index] : '\0';
        }

        /// Optimized version of peekByte() for the case where peek byte index is 0.
        char peekByte() @safe pure nothrow @nogc
        {
            return characterCount_ > charIndex_ ? buffer_[bufferOffset_] : '\0';
        }


        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params:  length = Number of characters (code points, not bytes) to get. May
        ///                   reach past the end of the buffer; in that case the returned
        ///                   slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        char[] prefix(const size_t length) @safe pure
        {
            return slice(length);
        }

        /// Get specified number of bytes, not code points, starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer, which will be
        ///       invalidated after other Reader calls. Use SliceBuilder to build slices
        ///       for permanent use.
        ///
        /// Params:  length = Number bytes (not code points) to get. May NOT reach past
        ///                   the end of the buffer; should be used with peek() to avoid
        ///                   this.
        ///
        /// Returns: Bytes starting at current position.
        char[] prefixBytes(const size_t length) @safe pure nothrow @nogc
        {
            assert(length == 0 || bufferOffset_ + length < buffer_.length,
                   "prefixBytes out of bounds");
            return buffer_[bufferOffset_ .. bufferOffset_ + length];
        }

        /// Get a slice view of the internal buffer, starting at the current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which get invalidated after other Reader calls.
        ///
        /// Params:  end = End of the slice relative to current position. May reach past
        ///                the end of the buffer; in that case the returned slice will
        ///                be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        char[] slice(const size_t end) @safe pure
        {
            // Fast path in case the caller has already peek()ed all the way to end:
            // the cached decode position then marks the end byte directly.
            if(end == lastDecodedCharOffset_)
            {
                return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
            }

            const asciiToTake = min(upcomingASCII_, end, buffer_.length);
            lastDecodedCharOffset_ = asciiToTake;
            lastDecodedBufferOffset_ = bufferOffset_ + asciiToTake;

            // 'Slow' path - decode everything up to end.
            while(lastDecodedCharOffset_ < end &&
                  lastDecodedBufferOffset_ < buffer_.length)
            {
                decodeNext();
            }

            return buffer_[bufferOffset_ .. lastDecodedBufferOffset_];
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Returns: Next character.
        ///
        /// Throws:  ReaderException if trying to read past the end of the buffer
        ///          or if invalid data is read.
        dchar get() @safe pure
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters (code points, not bytes) to get.
        ///
        /// Returns: Characters starting at current position.
        char[] get(const size_t length) @safe pure
        {
            auto result = slice(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Updates line_/column_ as it goes; a BOM ('\uFEFF') does not advance the
        /// column.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length) @safe pure
        {
            while(length > 0)
            {
                // Consume as much of the pending ASCII run as possible first.
                auto asciiToTake = min(upcomingASCII_, length);
                charIndex_ += asciiToTake;
                length -= asciiToTake;
                upcomingASCII_ -= asciiToTake;

                for(; asciiToTake > 0; --asciiToTake)
                {
                    const c = buffer_[bufferOffset_++];
                    // c is ASCII, so we only need to check for ASCII line breaks.
                    // NOTE(review): buffer_[bufferOffset_] here peeks one byte past
                    // the '\r' just consumed; if '\r' were the very last byte of
                    // buffer_ this would index past the data (D bounds checking
                    // would throw) - confirm input always ends with a break/content.
                    if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                    {
                        ++line_;
                        column_ = 0;
                        continue;
                    }
                    ++column_;
                }

                // If we have used up all upcoming ASCII chars, the next char is
                // non-ASCII even after this returns, so upcomingASCII_ doesn't need to
                // be updated - it's zero.
                if(length == 0) { break; }

                assert(upcomingASCII_ == 0,
                       "Running unicode handling code but we haven't run out of ASCII chars");
                assert(bufferOffset_ < buffer_.length,
                       "Attempted to decode past the end of YAML buffer");
                assert(buffer_[bufferOffset_] >= 0x80,
                       "ASCII must be handled by preceding code");

                ++charIndex_;
                const c = decode(buffer_, bufferOffset_);

                // New line. (can compare with '\n' without decoding since it's ASCII)
                if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
                --length;
                checkASCII();
            }

            // Invalidate the peek()/slice() decode cache - position has changed.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;
        }

        /// Move current position forward by one character.
        void forward() @safe pure
        {
            ++charIndex_;
            // Invalidate the peek()/slice() decode cache - position has changed.
            lastDecodedBufferOffset_ = bufferOffset_;
            lastDecodedCharOffset_ = 0;

            // ASCII
            if(upcomingASCII_ > 0)
            {
                --upcomingASCII_;
                const c = buffer_[bufferOffset_++];

                if(c == '\n' || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                    return;
                }
                ++column_;
                return;
            }

            // UTF-8
            assert(bufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            assert(buffer_[bufferOffset_] >= 0x80,
                   "ASCII must be handled by preceding code");

            const c = decode(buffer_, bufferOffset_);

            // New line. (can compare with '\n' without decoding since it's ASCII)
            if(c.isBreak || (c == '\r' && buffer_[bufferOffset_] != '\n'))
            {
                ++line_;
                column_ = 0;
            }
            else if(c != '\uFEFF') { ++column_; }

            checkASCII();
        }

        /// Used to build slices of read data in Reader; to avoid allocations.
        SliceBuilder sliceBuilder;

        /// Get a string describing current buffer position, used for error messages.
        Mark mark() const pure nothrow @nogc @safe { return Mark(line_, column_); }

        /// Get current line number.
        uint line() const @safe pure nothrow @nogc { return line_; }

        /// Get current column number.
        uint column() const @safe pure nothrow @nogc { return column_; }

        /// Get index of the current character in the buffer.
        size_t charIndex() const @safe pure nothrow @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        Encoding encoding() const @safe pure nothrow @nogc { return encoding_; }

    private:
        // Update upcomingASCII_ (should be called after forward()ing over a UTF-8
        // sequence, i.e. whenever bufferOffset_ moves past non-ASCII data).
        void checkASCII() @safe pure nothrow @nogc
        {
            upcomingASCII_ = countASCII(buffer_[bufferOffset_ .. $]);
        }

        // Decode the next character relative to
        // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
        //
        // Does not advance the buffer position. Used in peek() and slice().
        dchar decodeNext() @safe pure
        {
            assert(lastDecodedBufferOffset_ < buffer_.length,
                   "Attempted to decode past the end of YAML buffer");
            const char b = buffer_[lastDecodedBufferOffset_];
            ++lastDecodedCharOffset_;
            // ASCII
            if(b < 0x80)
            {
                ++lastDecodedBufferOffset_;
                return b;
            }

            // decode() advances lastDecodedBufferOffset_ past the whole sequence.
            return decode(buffer_, lastDecodedBufferOffset_);
        }
}
|
|
|
|
/// Used to build slices of already read data in Reader buffer, avoiding allocations.
///
/// Usually these slices point to unchanged Reader data, but sometimes the data is
/// changed due to how YAML interprets certain characters/strings.
///
/// See begin() documentation.
struct SliceBuilder
{
    private:
        // No copying by the user.
        @disable this(this);
        @disable void opAssign(ref SliceBuilder);

        // Reader this builder works in.
        Reader reader_;

        // Start of the slice in reader_.buffer_ (size_t.max while no slice being built)
        size_t start_ = size_t.max;
        // End of the slice in reader_.buffer_ (size_t.max while no slice being built)
        size_t end_ = size_t.max;

        // Stack of slice ends to revert to (see Transaction)
        //
        // Very few levels as we don't want arbitrarily nested transactions.
        size_t[4] endStack_;
        // The number of elements currently in endStack_.
        size_t endStackUsed_ = 0;

        // A slice may only cover data the Reader has already moved past.
        @safe const pure nothrow @nogc invariant()
        {
            if(!inProgress) { return; }
            assert(end_ <= reader_.bufferOffset_, "Slice ends after buffer position");
            assert(start_ <= end_, "Slice start after slice end");
        }

        // Is a slice currently being built?
        bool inProgress() @safe const pure nothrow @nogc
        {
            // start_ and end_ are set/reset together; both are size_t.max exactly
            // when no slice is in progress.
            assert(start_ == size_t.max ? end_ == size_t.max : end_ != size_t.max,
                   "start_/end_ are not consistent");
            return start_ != size_t.max;
        }

    public:
        /// Begin building a slice.
        ///
        /// Only one slice can be built at any given time; before beginning a new slice,
        /// finish the previous one (if any).
        ///
        /// The slice starts at the current position in the Reader buffer. It can only be
        /// extended up to the current position in the buffer; Reader methods get() and
        /// forward() move the position. E.g. it is valid to extend a slice by write()-ing
        /// a string just returned by get() - but not one returned by prefix() unless the
        /// position has changed since the prefix() call.
        void begin() @safe pure nothrow @nogc
        {
            assert(!inProgress, "Beginning a slice while another slice is being built");
            assert(endStackUsed_ == 0, "Slice stack not empty at slice begin");

            start_ = reader_.bufferOffset_;
            end_ = reader_.bufferOffset_;
        }

        /// Finish building a slice and return it.
        ///
        /// Any Transactions on the slice must be committed or destroyed before the slice
        /// is finished.
        ///
        /// Returns a string; once a slice is finished it is definitive that its contents
        /// will not be changed.
        char[] finish() @safe pure nothrow @nogc
        {
            assert(inProgress, "finish called without begin");
            assert(endStackUsed_ == 0, "Finishing a slice with running transactions.");

            auto result = reader_.buffer_[start_ .. end_];
            // Reset to the "no slice in progress" state (see inProgress()).
            start_ = end_ = size_t.max;
            return result;
        }

        /// Write a string to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// If str is a string returned by a Reader method, and str starts right after the
        /// end of the slice being built, the slice is extended (trivial operation).
        ///
        /// See_Also: begin
        void write(char[] str) @safe pure nothrow @nogc
        {
            assert(inProgress, "write called without begin");
            assert(end_ <= reader_.bufferOffset_,
                   "AT START: Slice ends after buffer position");

            // Nothing? Already done.
            if (str.length == 0) { return; }
            // If str starts at the end of the slice (is a string returned by a Reader
            // method), just extend the slice to contain str.
            if(&str[0] == &reader_.buffer_[end_])
            {
                end_ += str.length;
            }
            // Even if str does not start at the end of the slice, it still may be returned
            // by a Reader method and point to buffer. So we need to memmove.
            else
            {
                copy(str, reader_.buffer_[end_..end_ + str.length * char.sizeof]);
                end_ += str.length;
            }
        }

        /// Write a character to the slice being built.
        ///
        /// Data can only be written up to the current position in the Reader buffer.
        ///
        /// See_Also: begin
        void write(dchar c) @safe pure
        {
            assert(inProgress, "write called without begin");
            // ASCII needs no encoding - write the byte directly.
            if(c < 0x80)
            {
                reader_.buffer_[end_++] = cast(char)c;
                return;
            }

            // We need to encode a non-ASCII dchar into UTF-8
            char[4] encodeBuf;
            const bytes = encode(encodeBuf, c);
            reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Insert a character to a specified position in the slice.
        ///
        /// Enlarges the slice by 1 char. Note that the slice can only extend up to the
        /// current position in the Reader buffer.
        ///
        /// Params:
        ///
        /// c        = The character to insert.
        /// position = Position to insert the character at in code units, not code points.
        ///            Must be less than slice length(); a previously returned length()
        ///            can be used.
        void insert(const dchar c, const size_t position) @safe pure
        {
            assert(inProgress, "insert called without begin");
            assert(start_ + position <= end_, "Trying to insert after the end of the slice");

            const point = start_ + position;
            const movedLength = end_ - point;

            // Encode c into UTF-8
            char[4] encodeBuf;
            if(c < 0x80) { encodeBuf[0] = cast(char)c; }
            const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);

            // Shift the tail of the slice right to make room for the new character.
            if(movedLength > 0)
            {
                copy(reader_.buffer_[point..point + movedLength * char.sizeof],
                     reader_.buffer_[point + bytes..point + bytes + movedLength * char.sizeof]);
            }
            reader_.buffer_[point .. point + bytes] = encodeBuf[0 .. bytes];
            end_ += bytes;
        }

        /// Get the current length of the slice.
        size_t length() @safe const pure nothrow @nogc
        {
            return end_ - start_;
        }

        /// A slice building transaction.
        ///
        /// Can be used to save and revert back to slice state.
        struct Transaction
        {
            private:
                // The slice builder affected by the transaction.
                SliceBuilder* builder_ = null;
                // Index of the return point of the transaction in SliceBuilder.endStack_.
                size_t stackLevel_;
                // True after commit() has been called.
                bool committed_;

            public:
                /// Begins a transaction on a SliceBuilder object.
                ///
                /// The transaction must end $(B after) any transactions created within the
                /// transaction but $(B before) the slice is finish()-ed. A transaction can be
                /// ended either by commit()-ing or reverting through the destructor.
                ///
                /// Saves the current state of a slice.
                this(SliceBuilder* builder) @safe pure nothrow @nogc
                {
                    builder_ = builder;
                    stackLevel_ = builder_.endStackUsed_;
                    builder_.push();
                }

                /// Commit changes to the slice.
                ///
                /// Ends the transaction - can only be called once, and removes the possibility
                /// to revert slice state.
                ///
                /// Does nothing for a default-initialized transaction (the transaction has not
                /// been started yet).
                void commit() @safe pure nothrow @nogc
                {
                    assert(!committed_, "Can't commit a transaction more than once");

                    if(builder_ is null) { return; }
                    assert(builder_.endStackUsed_ == stackLevel_ + 1,
                           "Parent transactions don't fully contain child transactions");
                    builder_.apply();
                    committed_ = true;
                }

                /// Destroy the transaction and revert it if it hasn't been committed yet.
                ///
                /// NOTE(review): after commit() the stack entry has already been popped
                /// by apply(), so the assert below would fail if end() were also called
                /// on a committed transaction - confirm callers only call end() on
                /// uncommitted transactions.
                void end() @safe pure nothrow @nogc
                {
                    assert(builder_ && builder_.endStackUsed_ == stackLevel_ + 1,
                           "Parent transactions don't fully contain child transactions");
                    builder_.pop();
                    builder_ = null;
                }

        }

    private:
        // Push the current end of the slice so we can revert to it if needed.
        //
        // Used by Transaction.
        void push() @safe pure nothrow @nogc
        {
            assert(inProgress, "push called without begin");
            assert(endStackUsed_ < endStack_.length, "Slice stack overflow");
            endStack_[endStackUsed_++] = end_;
        }

        // Pop the current end of endStack_ and set the end of the slice to the popped
        // value, reverting changes since the old end was pushed.
        //
        // Used by Transaction.
        void pop() @safe pure nothrow @nogc
        {
            assert(inProgress, "pop called without begin");
            assert(endStackUsed_ > 0, "Trying to pop an empty slice stack");
            end_ = endStack_[--endStackUsed_];
        }

        // Pop the current end of endStack_, but keep the current end of the slice, applying
        // changes made since pushing the old end.
        //
        // Used by Transaction.
        void apply() @safe pure nothrow @nogc
        {
            assert(inProgress, "apply called without begin");
            assert(endStackUsed_ > 0, "Trying to apply an empty slice stack");
            --endStackUsed_;
        }
}
|
|
|
|
|
|
private:
|
|
|
|
// Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
//
// Params:
//
// input    = Buffer with UTF-8/16/32 data to decode. May be overwritten by the
//            conversion, in which case the result will be a slice of this buffer.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage)   In case of an error, the error message is stored here. If
//                            there was no error, errorMessage is NULL. Always check
//                            this first.
// $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
// $(D size_t characterCount) Number of characters (code points) in input.
auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        char[] utf8;
        size_t characterCount;
    }

    Result result;

    // Encode input_ into UTF-8 if it's encoded as UTF-16 or UTF-32.
    //
    // Params:
    //
    // buffer = The input buffer to encode.
    // result = A Result struct to put encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes (the UTF-8 output never outruns the 4-bytes-per-char input).
        static if(is(C == dchar))
        {
            char[4] encodeBuf;
            auto utf8 = cast(char[])input;
            auto length = 0;
            foreach(dchar c; input)
            {
                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
                    utf8[length++] = cast(char)c;
                    continue;
                }

                std.utf.encode(encodeBuf, c);
                const bytes = codeLength!char(c);
                utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
                length += bytes;
            }
            result.utf8 = utf8[0 .. length];
        }
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        // (a 2-byte wchar can expand to a 3-byte UTF-8 sequence).
        else
        {
            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }

    // Decoding/conversion errors are reported through result.errorMessage
    // rather than thrown, keeping this function nothrow.
    try final switch(encoding)
    {
        case UTFEncoding.UTF_8:
            result.utf8 = cast(char[])input;
            result.utf8.validate();
            result.characterCount = std.utf.count(result.utf8);
            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0, "UTF-32 buffer size must be a multiple of 4");
            encode(cast(dchar[])input, result);
            break;
    }
    catch(ConvException e) { result.errorMessage = e.msg; }
    catch(UTFException e) { result.errorMessage = e.msg; }
    catch(Exception e)
    {
        assert(false, "Unexpected exception in encode(): " ~ e.msg);
    }

    return result;
}
|
|
|
|
/// Determine if all characters (code points, not bytes) in a string are printable.
///
/// Params:  chars = UTF-8 data to check. Callers (Reader constructor via toUTF8)
///                  validate the UTF-8 before calling this; invalid sequences
///                  would make decode() throw a UTFException.
///
/// Returns: true if every code point is allowed in a YAML document,
///          false otherwise.
bool isPrintableValidUTF8(const char[] chars) @safe pure
{
    // This is oversized (only 128 entries are necessary) simply because having 256
    // entries improves performance... for some reason (alignment?)
    bool[256] printable = [false, false, false, false, false, false, false, false,
                           false, true,  true,  false, false, true,  false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,

                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,
                           true, true, true, true, true, true, true, true,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,

                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false,
                           false, false, false, false, false, false, false, false];

    for(size_t index = 0; index < chars.length;)
    {
        // Fast path for ASCII.
        // Both this while() block and the if() block below it are optimized, unrolled
        // versions of the for() block below them; the while()/if() block could be
        // removed without affecting logic, but both help increase performance.
        size_t asciiCount = countASCII(chars[index .. $]);
        // 8 ASCII iterations unrolled, looping while there are at most 8 ASCII chars.
        while(asciiCount > 8)
        {
            const dchar b0 = chars[index];
            const dchar b1 = chars[index + 1];
            const dchar b2 = chars[index + 2];
            const dchar b3 = chars[index + 3];
            const dchar b4 = chars[index + 4];
            const dchar b5 = chars[index + 5];
            const dchar b6 = chars[index + 6];
            const dchar b7 = chars[index + 7];

            index += 8;
            asciiCount -= 8;

            // BUGFIX: the last operand used to be printable[b1] (checked twice),
            // leaving b7 - the 8th character of each unrolled group - unchecked.
            const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
                        printable[b4] & printable[b5] & printable[b6] & printable[b7];
            if(!all)
            {
                return false;
            }
        }
        // 4 ASCII iterations unrolled
        if(asciiCount > 4)
        {
            const char b0 = chars[index];
            const char b1 = chars[index + 1];
            const char b2 = chars[index + 2];
            const char b3 = chars[index + 3];

            index += 4;
            asciiCount -= 4;

            if(!printable[b0]) { return false; }
            if(!printable[b1]) { return false; }
            if(!printable[b2]) { return false; }
            if(!printable[b3]) { return false; }
        }
        // Any remaining ASCII chars. This is really the only code needed to handle
        // ASCII, the above if() and while() blocks are just an optimization.
        for(; asciiCount > 0; --asciiCount)
        {
            const char b = chars[index];
            ++index;
            if(b >= 0x20)    { continue; }
            if(printable[b]) { continue; }
            return false;
        }

        if(index == chars.length) { break; }

        // Not ASCII, need to decode.
        const dchar c = decode(chars, index);
        // We know c is not ASCII, so only check for printable non-ASCII chars.
        if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
             (c >= '\uE000' && c <= '\uFFFD') ||
             (c >= '\U00010000' && c <= '\U0010FFFF')))
        {
            return false;
        }
    }
    return true;
}
|
|
|
|
/// Counts the number of ASCII characters in buffer until the first UTF-8 sequence.
///
/// Used to determine how many characters we can process without decoding.
size_t countASCII(const(char)[] buffer) @safe pure nothrow @nogc
{
    size_t ascii;
    foreach(immutable char c; buffer)
    {
        // The first byte of any multi-byte UTF-8 sequence is > 0x7F.
        if(c > 0x7F) { break; }
        ++ascii;
    }
    return ascii;
}
|
|
// Unittests.

// Verify that BOM detection reports the right encoding and endianness.
void testEndian(R)()
{
    static void check(ubyte[] input, Encoding expectedEncoding, Endian expectedEndian)
    {
        auto reader = new R(input);
        assert(reader.encoding == expectedEncoding);
        assert(reader.endian_ == expectedEndian);
    }
    // 'z' encoded as UTF-16, preceded by the corresponding BOM.
    ubyte[] littleEndianUTF16 = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] bigEndianUTF16    = [0xFE, 0xFF, 0x00, 0x7A];
    check(littleEndianUTF16, Encoding.UTF_16, Endian.littleEndian);
    check(bigEndianUTF16, Encoding.UTF_16, Endian.bigEndian);
}
|
|
|
|
// Exercise peek()/prefix()/forward() on a BOM-prefixed UTF-8 buffer.
void testPeekPrefixForward(R)()
{
    import std.encoding;
    ubyte[] bytes = bomTable[BOM.utf8].sequence ~ cast(ubyte[])"data";
    auto reader = new R(bytes);
    // Every character of "data" must be visible via peek without consuming.
    foreach(i, dchar expected; "data"d)
    {
        assert(reader.peek(i) == expected);
    }
    // Past the end, peek returns '\0' instead of throwing.
    assert(reader.peek(4) == '\0');
    assert(reader.prefix(4) == "data");
    reader.forward(2);
    assert(reader.peek(1) == 'a');
}
|
|
|
|
// Verify that UTF-8/16/32 inputs (with matching BOMs) all decode to the same text.
void testUTF(R)()
{
    import std.encoding;
    dchar[] text = cast(dchar[])"data";
    static void checkDecoded(T)(T[] encoded, BOM bom)
    {
        ubyte[] bytes = bomTable[bom].sequence ~
                        (cast(ubyte[])encoded)[0 .. encoded.length * T.sizeof];
        auto reader = new R(bytes);
        foreach(i, dchar expected; "data"d)
        {
            assert(reader.peek(i) == expected);
        }
    }
    checkDecoded!char(to!(char[])(text), BOM.utf8);
    checkDecoded!wchar(to!(wchar[])(text), endian == Endian.bigEndian ? BOM.utf16be : BOM.utf16le);
    checkDecoded(text, endian == Endian.bigEndian ? BOM.utf32be : BOM.utf32le);
}
|
|
|
|
// A single-byte, BOM-less buffer: one readable char, then '\0' past the end.
void test1Byte(R)()
{
    ubyte[] single = [97];  // 'a'

    auto reader = new R(single);
    assert(reader.peek() == 'a');
    assert(reader.peek(1) == '\0');
}
|
|
|
|
// Run all Reader test helpers against the real Reader implementation.
@system unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}
|