//          Copyright Ferdinand Majerech 2011.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)

module dyaml.reader;


import core.stdc.string;
//malloc/realloc/free are only ever called through their fully qualified names.
static import core.stdc.stdlib;

import std.algorithm;
import std.conv;
import std.exception;
import std.stdio;
import std.stream;
import std.string;
import std.system;
import std.utf;

import dyaml.fastcharsearch;
import dyaml.encoding;
import dyaml.exception;


package:

///Exception thrown at Reader errors.
class ReaderException : YAMLException
{
    /**
     * Construct a ReaderException.
     *
     * Params:  msg  = Error message; it is prefixed with "Error reading stream: ".
     *          file = Source file the exception was thrown from.
     *          line = Source line the exception was thrown from.
     */
    this(string msg, string file = __FILE__, int line = __LINE__)
    {
        super("Error reading stream: " ~ msg, file, line);
    }
}

///Reads data from a stream and converts it to UTF-32 (dchar) data.
final class Reader
{
    private:
        ///Input stream.
        EndianStream stream_;
        ///Allocated space for buffer_ (C heap memory, released in the destructor).
        dchar[] bufferAllocated_;
        ///Buffer of currently loaded characters. Always a slice of bufferAllocated_.
        dchar[] buffer_;
        ///Current position within buffer. Only data after this position can be read.
        uint bufferOffset_ = 0;
        ///Index of the current character in the stream.
        size_t charIndex_ = 0;
        ///Encoding of the input stream.
        Encoding encoding_;
        ///Current line in file.
        uint line_;
        ///Current column in file.
        uint column_;
        ///Number of bytes still available (not read) in the stream.
        size_t available_;

        ///Maximum number of UTF-8 code units kept in the raw buffer.
        static immutable bufferLength8_ = 8;
        ///Maximum number of UTF-16 code units kept in the raw buffer.
        static immutable bufferLength16_ = bufferLength8_ / 2;

        //Only the buffer matching encoding_ is used; UTF-32 input needs neither.
        union
        {
            ///Buffer to hold UTF-8 data before decoding.
            char[bufferLength8_ + 1] rawBuffer8_;
            ///Buffer to hold UTF-16 data before decoding.
            wchar[bufferLength16_ + 1] rawBuffer16_;
        }
        ///Number of elements held in the used raw buffer.
        uint rawUsed_ = 0;

    public:
        /**
         * Construct a Reader.
         *
         * Detects the encoding from the byte order mark (if any) and
         * allocates the initial character buffer.
         *
         * Params:  stream = Input stream. Must be readable and seekable.
         *
         * Throws:  ReaderException if the stream is invalid.
         */
        this(Stream stream)
        in
        {
            assert(stream.readable && stream.seekable,
                   "Can't read YAML from a stream that is not readable and seekable");
        }
        body
        {
            stream_    = new EndianStream(stream);
            available_ = stream_.available;

            //handle files short enough not to have a BOM
            if(available_ < 2)
            {
                encoding_ = Encoding.UTF_8;
                return;
            }

            //readBOM will determine and set stream endianness
            switch(stream_.readBOM(2))
            {
                case -1:
                    //readBOM() eats two more bytes in this case so get them back
                    const wchar bytes = stream_.getcw();
                    rawBuffer8_[0] = cast(char)(bytes % 256);
                    rawBuffer8_[1] = cast(char)(bytes / 256);
                    rawUsed_ = 2;
                    goto case 0;
                case 0:
                    encoding_ = Encoding.UTF_8;
                    break;
                case 1, 2:
                    //readBOM() eats two more bytes in this case so get them back
                    encoding_ = Encoding.UTF_16;
                    rawBuffer16_[0] = stream_.getcw();
                    rawUsed_ = 1;
                    enforce(available_ % 2 == 0,
                            new ReaderException("Odd byte count in an UTF-16 stream"));
                    break;
                case 3, 4:
                    enforce(available_ % 4 == 0,
                            new ReaderException("Byte count in an UTF-32 stream not divisible by 4"));
                    encoding_ = Encoding.UTF_32;
                    break;
                default:
                    assert(false, "Unknown UTF BOM");
            }
            //Re-read the byte count now that BOM detection has consumed some input.
            available_ = stream_.available;

            auto ptr = cast(dchar*)core.stdc.stdlib.malloc(dchar.sizeof * 256);
            //A null result would otherwise be sliced and written to later.
            enforce(ptr !is null,
                    new ReaderException("Unable to allocate memory for the character buffer"));
            bufferAllocated_ = ptr[0 .. 256];
        }

        ///Destroy the Reader.
        ~this()
        {
            //free(null) is a no-op, so this is safe even if nothing was allocated.
            core.stdc.stdlib.free(bufferAllocated_.ptr);
            buffer_ = bufferAllocated_ = null;
        }

        /**
         * Get character at specified index relative to current position.
         *
         * Params:  index = Index of the character to get relative to current position
         *                  in the stream.
         *
         * Returns: Character at specified position.
         *
         * Throws:  ReaderException if trying to read past the end of the stream
         *          or if invalid data is read.
         */
        dchar peek(in size_t index = 0)
        {
            if(buffer_.length <= bufferOffset_ + index + 1)
            {
                updateBuffer(index + 1);
            }

            if(buffer_.length <= bufferOffset_ + index)
            {
                throw new ReaderException("Trying to read past the end of the stream");
            }

            return buffer_[bufferOffset_ + index];
        }

        /**
         * Get specified number of characters starting at current position.
         *
         * Note: This gets only a "view" into the internal buffer,
         *       which WILL get invalidated after other Reader calls.
         *
         * Params:  length = Number of characters to get.
         *
         * Returns: Characters starting at current position or an empty slice if out of bounds.
         */
        const(dstring) prefix(in size_t length)
        {
            return slice(0, length);
        }

        /**
         * Get a slice view of the internal buffer.
         *
         * Note: This gets only a "view" into the internal buffer,
         *       which WILL get invalidated after other Reader calls.
         *
         * Params:  start = Start of the slice relative to current position.
         *          end   = End of the slice relative to current position.
         *
         * Returns: Slice into the internal buffer or an empty slice if out of bounds.
         */
        const(dstring) slice(size_t start, size_t end)
        {
            if(buffer_.length <= bufferOffset_ + end)
            {
                updateBuffer(end);
            }

            end   += bufferOffset_;
            start += bufferOffset_;
            //Clamp to the loaded data; a shorter (possibly empty) slice is returned.
            end = min(buffer_.length, end);

            if(end <= start){return "";}

            return cast(dstring)buffer_[start .. end];
        }

        /**
         * Get the next character, moving stream position beyond it.
         *
         * Returns: Next character.
         *
         * Throws:  ReaderException if trying to read past the end of the stream
         *          or if invalid data is read.
         */
        dchar get()
        {
            const result = peek();
            forward();
            return result;
        }

        /**
         * Get specified number of characters, moving stream position beyond them.
         *
         * Params:  length = Number of characters to get.
         *
         * Returns: Characters starting at current position.
         *
         * Throws:  ReaderException if trying to read past the end of the stream
         *          or if invalid data is read.
         */
        dstring get(in size_t length)
        {
            //Copy the view first; forward() may invalidate the internal buffer.
            auto result = prefix(length).dup;
            forward(length);
            return cast(dstring)result;
        }

        /**
         * Move current position forward.
         *
         * Updates the line and column counters for every character skipped.
         *
         * Params:  length = Number of characters to move position forward.
         *
         * Throws:  ReaderException if trying to read past the end of the stream
         *          or if invalid data is read.
         */
        void forward(size_t length = 1)
        {
            mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

            if(buffer_.length <= bufferOffset_ + length + 1)
            {
                updateBuffer(length + 1);
            }

            //The loop below indexes buffer_ directly, so reject requests
            //that would step beyond the loaded data.
            if(buffer_.length < bufferOffset_ + length)
            {
                throw new ReaderException("Trying to read past the end of the stream");
            }

            while(length > 0)
            {
                const c = buffer_[bufferOffset_];
                ++bufferOffset_;
                ++charIndex_;
                //New line. "\r\n" is counted once, when the '\n' is reached.
                if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
                {
                    ++line_;
                    column_ = 0;
                }
                //A byte order mark does not advance the column.
                else if(c != '\uFEFF'){++column_;}
                --length;
            }
        }

        ///Get a string describing current stream position, used for error messages.
        @property Mark mark() const {return Mark(line_, column_);}

        ///Get current line number.
        @property uint line() const {return line_;}

        ///Get current column number.
        @property uint column() const {return column_;}

        ///Get index of the current character in the stream.
        @property size_t charIndex() const {return charIndex_;}

        ///Get encoding of the input stream.
        @property Encoding encoding() const {return encoding_;}

    private:
        /**
         * Update buffer to be able to read length characters after buffer offset.
         *
         * If there are not enough characters in the stream, it will get
         * as many as possible and terminate the buffer with a '\0' character.
         *
         * Params:  length = Number of characters we need to read.
         *
         * Throws:  ReaderException if trying to read past the end of the stream
         *          or if invalid data is read.
         */
        void updateBuffer(in size_t length)
        {
            //get rid of unneeded data in the buffer
            if(bufferOffset_ > 0)
            {
                size_t bufferLength = buffer_.length - bufferOffset_;
                memmove(buffer_.ptr, buffer_.ptr + bufferOffset_, bufferLength * dchar.sizeof);
                buffer_ = buffer_[0 .. bufferLength];
                bufferOffset_ = 0;
            }

            //Load chars in batches of at most 1024 bytes (256 chars)
            while(buffer_.length <= bufferOffset_ + length)
            {
                loadChars(256);

                if(done)
                {
                    //Terminate the data with a single '\0' once the stream is exhausted.
                    if(buffer_.length == 0 || buffer_[$ - 1] != '\0')
                    {
                        bufferReserve(buffer_.length + 1);
                        buffer_ = bufferAllocated_[0 .. buffer_.length + 1];
                        buffer_[$ - 1] = '\0';
                    }
                    break;
                }
            }
        }

        /**
         * Load at most specified number of characters.
         *
         * Params:  chars = Maximum number of characters to load.
         *
         * Throws:  ReaderException on Unicode decoding error,
         *          if nonprintable characters are detected, or
         *          if there is an error reading from the stream.
         */
        void loadChars(size_t chars)
        {
            ///Get next character from the stream.
            dchar getDChar()
            {
                final switch(encoding_)
                {
                    case Encoding.UTF_8:
                        //Shortcut for ASCII.
                        if(rawUsed_ > 0 && rawBuffer8_[0] < 128)
                        {
                            //Get the first byte (one char in ASCII).
                            const dchar result = rawBuffer8_[0];
                            --rawUsed_;
                            //Shift the remaining bytes to the start of the buffer.
                            memmove(rawBuffer8_.ptr, rawBuffer8_.ptr + 1, rawUsed_ * char.sizeof);
                            return result;
                        }

                        //Bytes to read.
                        const readBytes = min(available_, bufferLength8_ - rawUsed_);
                        available_ -= readBytes;
                        //Length of data in rawBuffer8_ after reading.
                        const len = rawUsed_ + readBytes;
                        //Read the data.
                        stream_.readExact(rawBuffer8_.ptr + rawUsed_, readBytes);

                        //After decoding, this will point to the first byte not decoded.
                        size_t idx = 0;
                        //Decode only the bytes actually filled, so a truncated sequence
                        //is reported as an error instead of being completed by stale data.
                        const dchar result = decode(rawBuffer8_[0 .. len], idx);
                        rawUsed_ = cast(uint)(len - idx);

                        //Move the undecoded bytes to the start of the buffer.
                        memmove(rawBuffer8_.ptr, rawBuffer8_.ptr + idx, rawUsed_ * char.sizeof);
                        return result;
                    case Encoding.UTF_16:
                        //Words to read.
                        size_t readWords = min(available_ / 2, bufferLength16_ - rawUsed_);
                        available_ -= readWords * 2;
                        //Length of data in rawBuffer16_ after reading.
                        size_t len = rawUsed_;
                        //Read the data.
                        while(readWords > 0)
                        {
                            //Due to a bug in std.stream, we have to use getcw here.
                            rawBuffer16_[len] = stream_.getcw();
                            --readWords;
                            ++len;
                        }

                        //After decoding, this will point to the first word not decoded.
                        size_t idx = 0;
                        //Decode only the words actually filled (see the UTF-8 case).
                        const dchar result = decode(rawBuffer16_[0 .. len], idx);
                        rawUsed_ = cast(uint)(len - idx);

                        //Move the undecoded words to the start of the buffer.
                        memmove(rawBuffer16_.ptr, rawBuffer16_.ptr + idx, rawUsed_ * wchar.sizeof);
                        return result;
                    case Encoding.UTF_32:
                        dchar result;
                        available_ -= 4;
                        stream_.read(result);
                        return result;
                }
            }

            const oldLength   = buffer_.length;
            const oldPosition = stream_.position;

            //Preallocating memory to limit reallocations.
            bufferReserve(buffer_.length + chars);
            buffer_ = bufferAllocated_[0 .. buffer_.length + chars];

            //Number of characters decoded so far.
            size_t loaded = 0;
            try
            {
                for(; loaded < chars; ++loaded)
                {
                    if(done){break;}
                    buffer_[oldLength + loaded] = getDChar();
                }
            }
            catch(UtfException e)
            {
                const position = stream_.position;
                throw new ReaderException(text("Unicode decoding error between bytes ",
                                               oldPosition, " and ", position, " : ", e.msg));
            }
            catch(ReadException e)
            {
                throw new ReaderException(e.msg);
            }
            finally
            {
                //Drop the preallocated space that was not filled.
                buffer_ = buffer_[0 .. oldLength + loaded];
            }

            //Validate outside of any scope guard so that this error can never
            //be thrown on top of an exception that is already propagating.
            enforce(printable(buffer_[oldLength .. $]),
                    new ReaderException("Special unicode characters are not allowed"));
        }

        /**
         * Determine if all characters in an array are printable.
         *
         * Params:  chars = Characters to check.
         *
         * Returns: True if all the characters are printable, false otherwise.
         */
        static bool printable(const ref dchar[] chars) pure
        {
            foreach(c; chars)
            {
                if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
                     (c >= 0x20 && c <= 0x7E) ||
                     (c >= 0xA0 && c <= '\uD7FF') ||
                     (c >= '\uE000' && c <= '\uFFFD')))
                {
                    return false;
                }
            }
            return true;
        }

        ///Are we done reading? True once the stream and the used raw buffer are both empty.
        @property bool done() const
        {
            return (available_ == 0 &&
                    ((encoding_ == Encoding.UTF_8  && rawUsed_ == 0) ||
                     (encoding_ == Encoding.UTF_16 && rawUsed_ == 0) ||
                     encoding_ == Encoding.UTF_32));
        }

        /**
         * Ensure there is space for at least capacity characters in bufferAllocated_.
         *
         * buffer_ is re-pointed at the (possibly moved) allocation, keeping its length.
         *
         * Params:  capacity = Required capacity in characters.
         *
         * Throws:  ReaderException if the memory could not be allocated.
         */
        void bufferReserve(in size_t capacity)
        {
            if(bufferAllocated_.length >= capacity){return;}

            auto newPtr = core.stdc.stdlib.realloc(bufferAllocated_.ptr, capacity * dchar.sizeof);
            //On failure realloc leaves the old block intact, so keep using it.
            enforce(newPtr !is null,
                    new ReaderException("Unable to allocate memory for the character buffer"));
            bufferAllocated_ = (cast(dchar*)newPtr)[0 .. capacity];
            buffer_ = bufferAllocated_[0 .. buffer_.length];
        }

        unittest
        {
            writeln("D:YAML reader endian unittest");
            void endian_test(ubyte[] data, Encoding encoding_expected, Endian endian_expected)
            {
                auto reader = new Reader(new MemoryStream(data));
                assert(reader.encoding_ == encoding_expected);
                assert(reader.stream_.endian == endian_expected);
            }
            ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
            ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A];
            endian_test(little_endian_utf_16, Encoding.UTF_16, Endian.littleEndian);
            endian_test(big_endian_utf_16, Encoding.UTF_16, Endian.bigEndian);
        }

        unittest
        {
            writeln("D:YAML reader peek/prefix/forward unittest");
            ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
            auto reader = new Reader(new MemoryStream(data));
            assert(reader.peek() == 'd');
            assert(reader.peek(1) == 'a');
            assert(reader.peek(2) == 't');
            assert(reader.peek(3) == 'a');
            assert(reader.peek(4) == '\0');
            assert(reader.prefix(4) == "data");
            assert(reader.prefix(6) == "data\0");
            reader.forward(2);
            assert(reader.peek(1) == 'a');
            assert(collectException(reader.peek(3)));
        }

        unittest
        {
            writeln("D:YAML reader UTF formats unittest");
            dchar[] data = cast(dchar[])"data";
            void utf_test(T)(T[] data, BOM bom)
            {
                ubyte[] bytes = ByteOrderMarks[bom] ~
                                (cast(ubyte*)data.ptr)[0 .. data.length * T.sizeof];
                auto reader = new Reader(new MemoryStream(bytes));
                assert(reader.peek() == 'd');
                assert(reader.peek(1) == 'a');
                assert(reader.peek(2) == 't');
                assert(reader.peek(3) == 'a');
            }
            utf_test!char(to!(char[])(data), BOM.UTF8);
            utf_test!wchar(to!(wchar[])(data), endian == Endian.bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
            utf_test(data, endian == Endian.bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
        }
}