//          Copyright Ferdinand Majerech 2011-2014.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.stdlib;
import core.stdc.string;
import core.thread;

import std.algorithm;
import std.array;
import std.conv;
import std.exception;
import std.stdio;
import std.string;
import std.system;
import std.utf;

import tinyendian;

import dyaml.fastcharsearch;
import dyaml.encoding;
import dyaml.exception;
import dyaml.streamcompat;


package:

//XXX VIM STUFF:
//XXX THE f/t COLORING PLUGIN, AND TRY TO REMOVE THE f/t AUTOREPEAT PLUGIN
//    (AND MAYBE DO THE REPEAT WITH ALT-T/ALT-F
//XXX DDOC snippets such as $D, $BIGOH, anything else
//    OR MAYBE JUST $ - EXPANDING TO $(${1} ${2})
//    WHERE DEFAULT ${1} IS 'D' AND SPECIAL SNIPPETS FOR SPECIFIC DDOC MACROS
//    (E.G. XREF HAS 2 ARGS)
// XXX DON'T FORGET TO COMMIT DSNIPS CHANGES
// XXX SNIPPETS: WHY CAN'T WE USE NEW IN NEW? FIX!
// XXX ALSO WRITELN VISUAL! (print whatever we have selected)
// XXX AND ``fun`` VISUAL TOO!
// XXX snippet to print variable along its name AND
//     OR MULTIPLE VARS - USE std.format!


/// Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    /// Construct a ReaderException; the message is prefixed with "Reader error: ".
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe pure nothrow
    {
        super("Reader error: " ~ msg, file, line);
    }
}

/// Reads all data from a stream, decodes it to UTF-32 and provides character-level
/// access (peek/prefix/slice/get/forward) while tracking line and column.
///
/// The whole input is decoded when the Reader is constructed. The decoded buffer
/// is NOT zero-terminated; instead, peek() returns '\0' for any position at or past
/// the end of the buffer, and forward() never moves past the end.
final class Reader
{
    private:
        // Buffer of currently loaded characters.
        //
        // Only ever read through this slice, so the elements are const; this also
        // matches the type returned by UTFDecoder.decoded().
        const(dchar)[] buffer_ = null;
        // Current position within buffer. Only data after this position can be read.
        uint bufferOffset_ = 0;
        // Index of the current character in the buffer.
        size_t charIndex_ = 0;
        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;
        // Decoder reading data from file and decoding it to UTF-32.
        UTFFastDecoder decoder_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

    public:
        import std.stream;

        /// Construct a Reader.
        ///
        /// Params:  stream = Input stream. Must be readable and seekable.
        ///
        /// Throws:  ReaderException if the stream is invalid, on a UTF decoding error
        ///          or if there are nonprintable unicode characters illegal in YAML.
        this(Stream stream) @trusted //!nothrow
        {
            auto streamBytes = streamToBytesGC(stream);
            auto result      = fixUTFByteOrder(streamBytes);
            if(result.bytesStripped > 0)
            {
                // Explicit '~': implicit concatenation of adjacent string literals
                // is deprecated in D.
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = result.endian; }
            decoder_ = UTFFastDecoder(result.array, result.encoding);
            decoder_.decodeAll();
            const msg = decoder_.getAndClearErrorMessage();

            if(msg !is null)
            {
                throw new ReaderException("UTF decoding error: " ~ msg);
            }

            buffer_ = decoder_.decoded;

            // The part of buffer excluding trailing zeroes.
            auto noZeros = buffer_;
            while(!noZeros.empty && noZeros.back == '\0') { noZeros.popBack(); }
            enforce(printable(noZeros[]),
                    new ReaderException("Special unicode characters are not allowed"));
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer.
        ///
        /// Returns: Character at specified position, or '\0' if that position is at
        ///          or past the end of the buffer.
        ///
        // XXX removed; search for 'risky' to find why.
        // Throws:  ReaderException if trying to read past the end of the buffer.
        dchar peek(size_t index = 0) @safe pure nothrow const @nogc
        {
            if(buffer_.length <= bufferOffset_ + index)
            {
                // XXX This is risky; revert this and the 'risky' change in UTF decoder
                // if any bugs are introduced. We rely on the assumption that Reader
                // only uses peek() to detect the end of buffer. The test suite passes.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            return buffer_[bufferOffset_ + index];
        }

        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which WILL get invalidated after other Reader calls.
        ///
        /// Params:  length = Number of characters to get. May reach past the end of the
        ///                   buffer; in that case the returned slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        const(dstring) prefix(size_t length) @safe pure nothrow const @nogc
        {
            return slice(0, length);
        }

        /// Get a slice view of the internal buffer.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which WILL get invalidated after other Reader calls.
        ///
        /// Params:  start = Start of the slice relative to current position.
        ///          end   = End of the slice relative to current position. May reach
        ///                  past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        const(dstring) slice(size_t start, size_t end) @trusted pure nothrow const @nogc
        {
            // Translate from positions relative to bufferOffset_ to absolute indices.
            end   += bufferOffset_;
            start += bufferOffset_;
            // Clamp the end to the buffer; an empty range yields an empty slice.
            end    = min(buffer_.length, end);

            return end > start ? cast(dstring)buffer_[start .. end] : "";
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Returns: Next character, or '\0' if already at the end of the buffer
        ///          (in which case the position does not move).
        dchar get() @safe pure nothrow @nogc
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number or characters to get.
        ///
        /// Returns: Characters starting at current position. Shorter than length if
        ///          the end of the buffer is reached.
        dstring get(size_t length) @safe pure nothrow @nogc
        {
            auto result = prefix(length);
            forward(length);
            return result;
        }

        /// Move current position forward, updating line and column.
        ///
        /// A line break is any of "\n", "\u0085", "\u2028", "\u2029", or a '\r' that
        /// is not followed by '\n' (so "\r\n" counts as a single break, at the '\n').
        /// A byte order mark ('\uFEFF') does not advance the column.
        ///
        /// Params:  length = Number of characters to move position forward. Moving
        ///                   stops at the end of the buffer.
        void forward(size_t length = 1) @safe pure nothrow @nogc
        {
            mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

            // The buffer is not zero-terminated, so never index at or past its end.
            for(; length > 0 && bufferOffset_ < buffer_.length; --length)
            {
                const c = buffer_[bufferOffset_];
                ++bufferOffset_;
                ++charIndex_;

                // New line. peek() is bounds-checked, so a '\r' that is the very
                // last character is treated as a line break instead of reading
                // past the end of the buffer.
                if(search.canFind(c) || (c == '\r' && peek() != '\n'))
                {
                    ++line_;
                    column_ = 0;
                }
                else if(c != '\uFEFF') { ++column_; }
            }
        }

        /// Get a string describing current buffer position, used for error messages.
        final Mark mark() @safe pure nothrow const @nogc { return Mark(line_, column_); }

        /// Get current line number.
        final uint line() @safe pure nothrow const @nogc { return line_; }

        /// Get current column number.
        final uint column() @safe pure nothrow const @nogc { return column_; }

        /// Get index of the current character in the buffer.
        final size_t charIndex() @safe pure nothrow const @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        final Encoding encoding() @safe pure nothrow const @nogc { return decoder_.encoding; }
}

private:

alias UTFDecoder UTFFastDecoder;

/// Decodes a whole UTF-8/UTF-16/UTF-32 buffer to UTF-32 in one go.
///
/// Errors are reported through getAndClearErrorMessage() instead of exceptions.
struct UTFDecoder
{
    private:
        // UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
        static immutable ubyte[256] utf8Stride =
        [
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
            2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
            3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
            4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
        ];

        // Encoding of the input buffer.
        UTFEncoding encoding_;
        // Maximum number of characters that might be in the buffer.
        size_t maxChars_;
        // Part of the input buffer that has not yet been decoded.
        ubyte[] input_;
        // Decoded (UTF-32) version of the entire input_. If input is UTF-32, this is
        // just a reference to input_.
        dchar[] decoded_;

        // Current error message.
        //
        // To be fully nothrow, we use return values and the user (Reader) can check
        // for a detailed error message if they get an error return.
        string errorMessage_;

    public:
        /// Construct a UTFDecoder decoding data from a buffer.
        ///
        /// Params:  buffer   = Raw input bytes.
        ///          encoding = Encoding of the bytes in buffer.
        this(ubyte[] buffer, UTFEncoding encoding) @safe pure nothrow @nogc
        {
            input_    = buffer;
            encoding_ = encoding;
            // Upper bound on the character count: every character takes at least
            // one code unit, and a code unit is 1, 2 or 4 bytes respectively.
            final switch(encoding_)
            {
                case UTFEncoding.UTF_8:  maxChars_ = input_.length;     break;
                case UTFEncoding.UTF_16: maxChars_ = input_.length / 2; break;
                case UTFEncoding.UTF_32: maxChars_ = input_.length / 4; break;
            }
        }

        /// Decode all data passed to the constructor.
        ///
        /// On error, getAndClearErrorMessage() will return a non-null string.
        void decodeAll() @safe pure nothrow
        {
            assert(decoded_ is null, "Calling decodeAll more than once");

            final switch(encoding_)
            {
                case UTFEncoding.UTF_8:
                    decode(cast(char[])input_);
                    break;
                case UTFEncoding.UTF_16:
                    assert(input_.length % 2 == 0, "UTF-16 buffer size must be even");
                    decode(cast(wchar[])input_);
                    break;
                case UTFEncoding.UTF_32:
                    assert(input_.length % 4 == 0,
                           "UTF-32 buffer size must be a multiple of 4");
                    // No need to decode anything
                    decoded_ = cast(dchar[])input_;
                    break;
            }
            // XXX This is risky. We rely on the assumption that the scanner only uses
            // peek() to detect the end of the buffer. Should this cause any bugs,
            // revert.
            //
            // The buffer must be zero terminated for scanner to detect its end.
            // if(decoded_.empty || decoded_.back() != '\0')
            // {
            //     decoded_ ~= cast(dchar)'\0';
            // }
        }

        /// Get encoding we're decoding from.
        UTFEncoding encoding() const pure @safe nothrow @nogc { return encoding_; }

        /// Get all decoded characters.
        const(dchar[]) decoded() @safe pure nothrow @nogc
        {
            return decoded_;
        }

        /// Get the error message and clear it.
        ///
        /// Returns: The current error message, or null if there was no error.
        string getAndClearErrorMessage() @safe pure nothrow @nogc
        {
            const result  = errorMessage_;
            errorMessage_ = null;
            return result;
        }

    private:
        // Decode input_ if it's encoded as UTF-8 or UTF-16.
        //
        // On error, errorMessage_ will be set.
        void decode(C)(C[] buffer) @safe pure nothrow
        {
            // End of part of buffer that contains complete characters that can be decoded.
            const size_t end = endOfLastUTFSequence(buffer);
            // If end is 0, there are no full chars.
            // This can happen at the end of file if there is an incomplete UTF sequence.
            // Any end short of the buffer length means a truncated trailing sequence.
            if(end < buffer.length)
            {
                errorMessage_ = "Invalid UTF character at the end of buffer";
                return;
            }

            const srclength = buffer.length;
            try for(size_t srcpos = 0; srcpos < srclength;)
            {
                const c = buffer[srcpos];
                // Fast path: ASCII code units map directly to code points.
                if(c < 0x80)
                {
                    decoded_ ~= c;
                    ++srcpos;
                }
                else
                {
                    // std.utf.decode advances srcpos past the decoded sequence.
                    decoded_ ~= std.utf.decode(buffer, srcpos);
                }
            }
            catch(UTFException e)
            {
                errorMessage_ = e.msg;
                return;
            }
            catch(Exception e)
            {
                assert(false, "Unexpected exception in decode(): " ~ e.msg);
            }
        }

        // Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
        //
        // Returns buffer.length if the buffer ends on a sequence boundary, otherwise
        // the index where the incomplete trailing sequence starts.
        size_t endOfLastUTFSequence(C)(const C[] buffer)
            @safe pure nothrow const @nogc
        {
            static if(is(C == char))
            {
                // Walk backwards to the last byte that can start a sequence.
                for(long end = buffer.length - 1; end >= 0; --end)
                {
                    const stride = utf8Stride[buffer[cast(size_t)end]];
                    if(stride != 0xFF)
                    {
                        // If stride goes beyond end of the buffer, return end.
                        // Otherwise the last sequence ends at buffer.length, so we can
                        // return that. (Unless there is an invalid code point, which is
                        // caught at decoding)
                        return (stride > buffer.length - end) ? cast(size_t)end
                                                              : buffer.length;
                    }
                }
                return 0;
            }
            else static if(is(C == wchar))
            {
                // TODO this is O(N), which is slow. Find out if we can somehow go
                // from the end backwards with UTF-16.
                size_t end = 0;
                while(end < buffer.length)
                {
                    const s = stride(buffer, end);
                    if(s + end > buffer.length) { break; }
                    end += s;
                }
                return end;
            }
        }
}

/// Determine if all characters in an array are printable.
///
/// The accepted set is tab, LF, CR, NEL (0x85), 0x20-0x7E, 0xA0-0xD7FF,
/// 0xE000-0xFFFD and the supplementary planes 0x10000-0x10FFFF.
///
/// Params:  chars = Characters to check.
///
/// Returns: True if all the characters are printable, false otherwise.
bool printable(const dchar[] chars) @safe pure nothrow @nogc
{
    foreach(c; chars)
    {
        if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
             (c >= 0x20 && c <= 0x7E) ||
             (c >= 0xA0 && c <= '\uD7FF') ||
             (c >= '\uE000' && c <= '\uFFFD') ||
             (c >= '\U00010000' && c <= '\U0010FFFF')))
        {
            return false;
        }
    }
    return true;
}

// Unittests.
import std.stream;

/// Check that the reader type R detects the encoding and byte order of
/// BOM-prefixed UTF-16 input.
void testEndian(R)()
{
    writeln(typeid(R).toString(), ": endian unittest");

    // Build a reader over raw bytes and compare what it detected.
    void check(ubyte[] bytes, Encoding wantEncoding, Endian wantEndian)
    {
        auto r = new R(new MemoryStream(bytes));
        assert(r.encoding == wantEncoding);
        assert(r.endian_  == wantEndian);
    }

    // 'z' (0x7A) preceded by the respective byte order mark.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    check(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    check(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}

/// Check peek(), prefix() and forward() of the reader type R on BOM-prefixed
/// UTF-8 input.
void testPeekPrefixForward(R)()
{
    writeln(typeid(R).toString(), ": peek/prefix/forward unittest");

    ubyte[] bytes = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto r = new R(new MemoryStream(bytes));

    // Positions 0-3 hold "data"; position 4 is past the end and reads as '\0'.
    foreach(i, expected; "data\0"d)
    {
        assert(r.peek(i) == expected);
    }
    assert(r.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    r.forward(2);
    assert(r.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}

/// Check that the reader type R decodes the same text from UTF-8, UTF-16 and
/// UTF-32 input in the host byte order.
void testUTF(R)()
{
    writeln(typeid(R).toString(), ": UTF formats unittest");

    dchar[] text = "data"d.dup;

    // Prefix the raw code units with the BOM and verify the decoded characters.
    void check(T)(T[] units, BOM bom)
    {
        ubyte[] bytes = ByteOrderMarks[bom] ~ cast(ubyte[])units;
        auto r = new R(new MemoryStream(bytes));
        foreach(i, expected; "data"d)
        {
            assert(r.peek(i) == expected);
        }
    }

    const hostIsBigEndian = endian == Endian.bigEndian;
    const bom16 = hostIsBigEndian ? BOM.UTF16BE : BOM.UTF16LE;
    const bom32 = hostIsBigEndian ? BOM.UTF32BE : BOM.UTF32LE;

    check!char(to!(char[])(text), BOM.UTF8);
    check!wchar(to!(wchar[])(text), bom16);
    check(text, bom32);
}

/// Check that the reader type R handles input consisting of a single byte.
void test1Byte(R)()
{
    writeln(typeid(R).toString(), ": 1 byte file unittest");

    ubyte[] bytes = [97];
    auto r = new R(new MemoryStream(bytes));

    assert(r.peek()  == 'a');
    assert(r.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}

unittest
{
    testEndian!Reader();
    testPeekPrefixForward!Reader();
    testUTF!Reader();
    test1Byte!Reader();
}