dyaml/source/dyaml/reader.d
Jonathan M Davis 7a1e1ecce3 Another attempt at making d-yaml work with dub.
Creating a symlink in source to the dyaml directory does not actually
result in a symlink when another package grabs d-yaml as a dependency
via dub, and even if it did, it wouldn't work on Windows. So, this moves
the source into source so that it'll actually work, and cdc.d has been
adjusted accordingly so that building with it should still work.
2013-03-28 21:33:13 -07:00

703 lines
24 KiB
D

// Copyright Ferdinand Majerech 2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module dyaml.reader;
import core.stdc.stdlib;
import core.stdc.string;
import core.thread;
import std.algorithm;
import std.conv;
import std.exception;
import std.stdio;
import std.stream;
import std.string;
import std.system;
import std.utf;
import dyaml.fastcharsearch;
import dyaml.encoding;
import dyaml.exception;
package:
///Exception raised when the Reader fails to read or decode its input.
class ReaderException : YAMLException
{
    /**
     * Construct a ReaderException.
     *
     * Params:  msg  = Description of the error; "Error reading stream: " is prepended to it.
     *          file = Source file the exception is reported from.
     *          line = Line in that source file.
     */
    this(string msg, string file = __FILE__, int line = __LINE__)
        @safe nothrow
    {
        string fullMessage = "Error reading stream: " ~ msg;
        super(fullMessage, file, line);
    }
}
///Lazily reads and decodes data from stream, only storing as much as needed at any moment.
final class Reader
{
private:
//Input stream.
EndianStream stream_;
//Allocated space for buffer_.
dchar[] bufferAllocated_ = null;
//Buffer of currently loaded characters.
dchar[] buffer_ = null;
//Current position within buffer. Only data after this position can be read.
uint bufferOffset_ = 0;
//Index of the current character in the stream.
size_t charIndex_ = 0;
//Current line in file.
uint line_;
//Current column in file.
uint column_;
//Decoder reading data from file and decoding it to UTF-32.
UTFFastDecoder decoder_;
public:
/**
* Construct a Reader.
*
* Wraps the stream in an EndianStream and creates the UTF decoder that reads from it.
*
* Params: stream = Input stream. Must be readable and seekable
* (asserted by the in contract).
*
* Throws: ReaderException if the decoder rejects the stream, e.g. when the byte
* count does not fit the detected UTF-16/UTF-32 encoding.
*/
this(Stream stream) @trusted
in
{
assert(stream.readable && stream.seekable,
"Can't read YAML from a stream that is not readable and seekable");
}
body
{
//The decoder must be built from the wrapped stream, so stream_ is set first.
stream_ = new EndianStream(stream);
decoder_ = UTFFastDecoder(stream_);
}
///Destroy the Reader, releasing the C-allocated character buffer if there is one.
@trusted nothrow ~this()
{
    if(bufferAllocated_ !is null)
    {
        free(bufferAllocated_.ptr);
        //Both slices pointed into the freed memory; clear them together.
        bufferAllocated_ = null;
        buffer_ = null;
    }
}
/**
 * Look at a character ahead of the current position without consuming it.
 *
 * Params:  index = Offset of the wanted character from the current position
 *                  (0 is the character that would be read next).
 *
 * Returns: The character at that offset.
 *
 * Throws:  ReaderException if the offset lies past the end of the stream
 *          or if invalid data is read while loading more characters.
 */
dchar peek(size_t index = 0) @trusted
{
    const needed = index + 1;
    if(buffer_.length < bufferOffset_ + needed)
    {
        updateBuffer(needed);
    }
    //updateBuffer() may reset bufferOffset_, so the absolute position is computed after it.
    const position = bufferOffset_ + index;
    if(position >= buffer_.length)
    {
        throw new ReaderException("Trying to read past the end of the stream");
    }
    return buffer_[position];
}
/**
* Get specified number of characters starting at current position.
*
* Shorthand for slice(0, length).
*
* Note: This gets only a "view" into the internal buffer,
* which WILL get invalidated after other Reader calls.
*
* Params: length = Number of characters to get.
*
* Returns: Characters starting at current position. Fewer than length characters
* (possibly none) are returned if the stream does not contain that many.
*/
const(dstring) prefix(size_t length) @safe
{
return slice(0, length);
}
/**
 * Get a view of part of the internal buffer.
 *
 * Note: The result is only a "view" into the internal buffer,
 *       which WILL get invalidated after other Reader calls.
 *
 * Params:  start = Start of the slice relative to current position.
 *          end   = End of the slice relative to current position.
 *
 * Returns: Slice into the internal buffer, cut short at the end of the loaded
 *          data, or an empty slice if nothing lies between start and end.
 */
const(dstring) slice(size_t start, size_t end) @trusted
{
    if(bufferOffset_ + end >= buffer_.length)
    {
        updateBuffer(end);
    }
    //Translate to absolute buffer indices only after the buffer may have moved.
    const absoluteStart = bufferOffset_ + start;
    const absoluteEnd   = min(buffer_.length, bufferOffset_ + end);
    if(absoluteEnd <= absoluteStart)
    {
        return ""d;
    }
    return cast(dstring)buffer_[absoluteStart .. absoluteEnd];
}
/**
 * Read the next character and advance the stream position past it.
 *
 * Returns: The character that was at the current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar get() @safe
{
    const next = peek(0);
    forward(1);
    return next;
}
/**
 * Read a run of characters and advance the stream position past them.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: A copy of the characters starting at the current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dstring get(size_t length) @safe
{
    //prefix() only returns a view, so copy it before forward() can invalidate it.
    const view = prefix(length);
    dstring copy = view.idup;
    forward(length);
    return copy;
}
/**
 * Move current position forward, updating the character index, line and column.
 *
 * Line breaks are '\n', U+0085, U+2028, U+2029 and a '\r' that is not
 * followed by '\n' ("\r\n" is counted once, at the '\n'). A byte order mark
 * (U+FEFF) does not advance the column.
 *
 * Params:  length = Number of characters to move position forward.
 *
 * Throws:  ReaderException if trying to move past the end of the stream
 *          or if invalid data is read. The position is left unchanged
 *          when the requested length is not available.
 */
void forward(size_t length = 1) @trusted
{
    //One extra character is requested so the '\r' look-ahead below has data.
    if(buffer_.length <= bufferOffset_ + length + 1)
    {
        updateBuffer(length + 1);
    }
    //updateBuffer() loads only what the stream still has; without this check the
    //loop below would index past the end of buffer_.
    if(length > buffer_.length - bufferOffset_)
    {
        throw new ReaderException("Trying to read past the end of the stream");
    }

    mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

    while(length > 0)
    {
        const c = buffer_[bufferOffset_];
        ++bufferOffset_;
        ++charIndex_;

        //A '\r' that is the last buffered character, or is not followed by '\n',
        //is a line break on its own.
        const loneCR = c == '\r' &&
                       (bufferOffset_ >= buffer_.length || buffer_[bufferOffset_] != '\n');
        if(search.canFind(c) || loneCR)
        {
            ++line_;
            column_ = 0;
        }
        else if(c != '\uFEFF'){++column_;}
        --length;
    }
}
///Get a Mark built from the current line and column, used for error messages.
@property final Mark mark() const pure @safe nothrow {return Mark(line_, column_);}
///Get current line number (incremented by forward() at every line break).
@property final uint line() const pure @safe nothrow {return line_;}
///Get current column number (reset to 0 by forward() at every line break).
@property final uint column() const pure @safe nothrow {return column_;}
///Get index of the current character in the stream.
@property final size_t charIndex() const pure @safe nothrow {return charIndex_;}
///Get encoding of the input stream, as detected by the decoder.
@property final Encoding encoding() const pure @safe nothrow {return decoder_.encoding;}
private:
/**
 * Make sure more than length characters are buffered after the current position.
 *
 * Already consumed characters are dropped from the front of the buffer
 * (resetting the buffer offset to 0) and more characters are loaded.
 * If the stream ends first, everything available is loaded and the
 * buffer is terminated with '\0'.
 *
 * Params:  length = Number of characters we need to read.
 *
 * Throws:  ReaderException if invalid data is read or reading from the stream fails.
 */
void updateBuffer(in size_t length) @system
{
    //Compact: move the unread characters to the start of the buffer.
    if(bufferOffset_ > 0)
    {
        const remaining = buffer_.length - bufferOffset_;
        memmove(buffer_.ptr, buffer_.ptr + bufferOffset_, remaining * dchar.sizeof);
        buffer_ = buffer_[0 .. remaining];
        bufferOffset_ = 0;
    }

    //Load characters in batches of at most 512 until enough are buffered.
    while(buffer_.length <= bufferOffset_ + length)
    {
        loadChars(512);
        if(!decoder_.done){continue;}

        //End of stream: make sure the data ends with a '\0' terminator.
        const terminated = buffer_.length > 0 && buffer_[$ - 1] == '\0';
        if(!terminated)
        {
            const newLength = buffer_.length + 1;
            bufferReserve(newLength);
            buffer_ = bufferAllocated_[0 .. newLength];
            buffer_[$ - 1] = '\0';
        }
        break;
    }
}
/**
* Load more characters to the buffer.
*
* The buffer is first grown by chars so decoded characters can be copied
* straight into it; after a successful load it is shrunk again by the number
* of requested characters that could not be loaded.
*
* Params: chars = Maximum number of characters to load.
* Fewer are loaded if the decoder runs out of data.
*
* Throws: ReaderException on Unicode decoding error,
* if nonprintable characters are detected, or
* if there is an error reading from the stream.
*/
void loadChars(size_t chars) @system
{
const oldLength = buffer_.length;
//Remembered only for the error message of a decoding failure.
const oldPosition = stream_.position;
bufferReserve(buffer_.length + chars);
buffer_ = bufferAllocated_[0 .. buffer_.length + chars];
//Runs only if the loop below finishes without throwing. At that point chars
//holds the number of characters that were NOT loaded, so they are cut off
//before the newly loaded part is validated.
scope(success)
{
buffer_ = buffer_[0 .. $ - chars];
enforce(printable(buffer_[oldLength .. $]),
new ReaderException("Special unicode characters are not allowed"));
}
//c counts the characters copied so far; chars counts those still wanted.
try for(size_t c = 0; chars && !decoder_.done;)
{
const slice = decoder_.getDChars(chars);
buffer_[oldLength + c .. oldLength + c + slice.length] = slice[];
c += slice.length;
chars -= slice.length;
}
catch(Exception e)
{
//Always throws: either a ReaderException or the original exception.
handleLoadCharsException(e, oldPosition);
}
}
/*
 * Translate an exception thrown while loading characters. Never returns normally.
 *
 * Params:  e           = Exception caught in loadChars.
 *          oldPosition = Stream position before loading started.
 *
 * Throws:  ReaderException for UTFException and ReadException;
 *          any other exception is rethrown unchanged.
 */
void handleLoadCharsException(Exception e, ulong oldPosition) @system
{
    if(auto utfError = cast(UTFException)e)
    {
        const position = stream_.position;
        throw new ReaderException(format("Unicode decoding error between bytes %s and %s : %s",
                                         oldPosition, position, utfError.msg));
    }
    if(auto readError = cast(ReadException)e)
    {
        throw new ReaderException(readError.msg);
    }
    throw e;
}
//Load everything the decoder can still provide and terminate the buffer with '\0'.
void loadEntireFile_() @system
{
    const charCount = decoder_.maxChars;
    //One extra slot is reserved up front for the terminator.
    bufferReserve(charCount + 1);
    loadChars(charCount);

    const terminated = buffer_.length != 0 && buffer_[$ - 1] == '\0';
    if(terminated){return;}

    buffer_ = bufferAllocated_[0 .. buffer_.length + 1];
    buffer_[$ - 1] = '\0';
}
/*
 * Ensure there is space for at least capacity characters in bufferAllocated_.
 *
 * Handles the first allocation as well as growing an existing one. buffer_ is
 * re-sliced over the (possibly moved) allocation and keeps its length.
 *
 * Params:  capacity = Number of characters the allocation must be able to hold.
 *
 * Throws:  OutOfMemoryError (through onOutOfMemoryError) if the size in bytes
 *          overflows or the C allocator fails. The previous allocation is left
 *          untouched in that case.
 */
void bufferReserve(in size_t capacity) @system nothrow
{
    import core.exception : onOutOfMemoryError;

    if(bufferAllocated_ !is null && bufferAllocated_.length >= capacity){return;}

    //Refuse capacities whose size in bytes does not fit into size_t.
    if(capacity > size_t.max / dchar.sizeof){onOutOfMemoryError();}
    const bytes = capacity * dchar.sizeof;

    auto ptr = bufferAllocated_ !is null
               ? realloc(bufferAllocated_.ptr, bytes)
               : malloc(bytes);
    //A null result for a non-empty request means the allocation failed; slicing
    //it would hand out a null buffer of nonzero length.
    if(ptr is null && bytes > 0){onOutOfMemoryError();}

    bufferAllocated_ = (cast(dchar*)ptr)[0 .. capacity];
    buffer_ = bufferAllocated_[0 .. buffer_.length];
}
}
private:
alias UTFBlockDecoder!512 UTFFastDecoder;
///Decodes streams to UTF-32 in blocks.
struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
{
private:
//UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
static immutable ubyte[256] utf8Stride =
[
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
];
//Encoding of the input stream.
Encoding encoding_;
//Maximum number of characters that might be in the stream.
size_t maxChars_;
//Bytes available in the stream.
size_t available_;
//Input stream.
EndianStream stream_;
//Buffer used to store raw UTF-8 or UTF-16 code points.
union
{
char[bufferSize_] rawBuffer8_;
wchar[bufferSize_ / 2] rawBuffer16_;
}
//Used space (in items) in rawBuffer8_/rawBuffer16_.
size_t rawUsed_;
//Space used by buffer_.
dchar[bufferSize_] bufferSpace_;
//Buffer of decoded, UTF-32 characters. This is a slice into bufferSpace_.
dchar[] buffer_;
public:
/**
 * Construct a UTFBlockDecoder decoding a stream.
 *
 * Detects the encoding from the byte order mark (defaulting to UTF-8 when
 * there is none) and records how many characters the stream can hold at most.
 *
 * Params:  stream = Stream to decode. readBOM() also sets its endianness.
 *
 * Throws:  ReaderException if the byte count of a UTF-16 stream is odd or
 *          the byte count of a UTF-32 stream is not divisible by 4.
 */
this(EndianStream stream) @system
{
    stream_    = stream;
    available_ = stream_.available;

    //Handle files short enough not to have a BOM.
    if(available_ < 2)
    {
        encoding_ = Encoding.UTF_8;
        maxChars_ = 0;

        if(available_ == 1)
        {
            //The single byte is stored directly as the only decoded character.
            bufferSpace_[0] = stream_.getc();
            buffer_ = bufferSpace_[0 .. 1];
            maxChars_ = 1;
        }
        return;
    }

    //readBOM will determine and set stream endianness.
    switch(stream_.readBOM(2))
    {
        case -1:
            //No BOM. readBOM() eats two more bytes in this case so get them back.
            //NOTE(review): the low byte is stored first, which presumably assumes
            //getcw() composed the two bytes in little-endian order - TODO confirm
            //on big-endian hosts.
            const wchar bytes = stream_.getcw();
            rawBuffer8_[0 .. 2] = [cast(ubyte)(bytes % 256), cast(ubyte)(bytes / 256)];
            rawUsed_ = 2;
            goto case 0;
        case 0:
            maxChars_ = available_;
            encoding_ = Encoding.UTF_8;
            break;
        case 1, 2:
            maxChars_ = available_ / 2;
            //readBOM() eats two more bytes in this case so get them back.
            encoding_ = Encoding.UTF_16;
            rawBuffer16_[0] = stream_.getcw();
            rawUsed_ = 1;
            enforce(available_ % 2 == 0,
                    new ReaderException("Odd byte count in an UTF-16 stream"));
            break;
        case 3, 4:
            maxChars_ = available_ / 4;
            encoding_ = Encoding.UTF_32;
            enforce(available_ % 4 == 0,
                    new ReaderException("Byte count in an UTF-32 stream not divisible by 4"));
            break;
        default: assert(false, "Unknown UTF BOM");
    }
    //Bytes consumed while detecting the BOM are no longer available.
    available_ = stream_.available;
}
///Get maximum number of characters that might be in the stream (computed by the constructor from the byte count).
@property size_t maxChars() const pure @safe nothrow {return maxChars_;}
///Get encoding we're decoding from.
@property Encoding encoding() const pure @safe nothrow {return encoding_;}
///Are we done decoding? True once no raw code units are pending, no decoded
///characters are buffered and no bytes remain to be read from the stream.
@property bool done() const pure @safe nothrow
{
return rawUsed_ == 0 && buffer_.length == 0 && available_ == 0;
}
///Get next decoded character, decoding another block from the stream when the buffer is empty.
dchar getDChar() @system
{
    //Refill until at least one decoded character is buffered.
    while(buffer_.length == 0)
    {
        assert(available_ > 0 || rawUsed_ > 0);
        updateBuffer();
    }
    const first = buffer_[0];
    buffer_ = buffer_[1 .. $];
    return first;
}
///Get as many buffered characters as possible, but at most maxChars. The returned slice is invalidated by further calls.
const(dchar[]) getDChars(size_t maxChars = size_t.max) @system
{
    //Refill until at least one decoded character is buffered.
    while(buffer_.length == 0)
    {
        assert(available_ > 0 || rawUsed_ > 0);
        updateBuffer();
    }
    const count = min(buffer_.length, maxChars);
    const chunk = buffer_[0 .. count];
    buffer_ = buffer_[count .. $];
    return chunk;
}
private:
//Read the next block from the stream and decode it into the (currently empty) buffer.
void updateBuffer() @system
{
    assert(buffer_.length == 0);
    final switch(encoding_)
    {
        case Encoding.UTF_8:
            //New bytes go behind any undecoded bytes left from the previous block.
            const toRead     = min(bufferSize_ - rawUsed_, available_);
            const validBytes = rawUsed_ + toRead;
            stream_.readExact(rawBuffer8_.ptr + rawUsed_, toRead);
            available_ -= toRead;
            decodeRawBuffer(rawBuffer8_, validBytes);
            break;
        case Encoding.UTF_16:
            //Same as UTF-8, but counted in 16-bit code units.
            const toRead     = min((bufferSize_ / 2) - rawUsed_, available_ / 2);
            const validWords = rawUsed_ + toRead;
            for(size_t w = rawUsed_; w < validWords; ++w)
            {
                stream_.read(rawBuffer16_[w]);
                available_ -= 2;
            }
            decodeRawBuffer(rawBuffer16_, validWords);
            break;
        case Encoding.UTF_32:
            //UTF-32 needs no decoding; characters are read straight into bufferSpace_.
            const toRead = min(bufferSize_ / 4, available_ / 4);
            for(size_t i = 0; i < toRead; ++i)
            {
                stream_.read(bufferSpace_[i]);
                available_ -= 4;
            }
            buffer_ = bufferSpace_[0 .. toRead];
            break;
    }
}
/*
 * Decode contents of a UTF-8 or UTF-16 raw buffer.
 *
 * Complete sequences are decoded into the character buffer. Code units of an
 * incomplete trailing sequence are moved to the start of the raw buffer and
 * counted in rawUsed_, so the next block is read in behind them.
 *
 * Params:  buffer = Raw buffer (rawBuffer8_ or rawBuffer16_).
 *          length = Number of valid code units in buffer.
 *
 * Throws:  ReaderException if the buffer contains no complete sequence.
 */
void decodeRawBuffer(C)(C[] buffer, const size_t length) pure @system
{
    //Name the encoding actually being decoded in the error message.
    static if(is(C == char)){enum encodingName = "UTF-8";}
    else                    {enum encodingName = "UTF-16";}

    //End of the part of the raw buffer that contains
    //complete characters and can be decoded.
    const end = endOfLastUTFSequence(buffer, length);
    //If end is 0, there are no complete characters.
    //This can happen at the end of file if there is an incomplete sequence.
    enforce(end > 0,
            new ReaderException("Invalid " ~ encodingName ~ " character at the end of stream"));

    decodeUTF(buffer[0 .. end]);

    //After decoding, any code units not decoded go to the start of raw buffer.
    rawUsed_ = length - end;
    foreach(i; 0 .. rawUsed_){buffer[i] = buffer[i + end];}
}
/*
 * Determine where the last complete UTF-8 or UTF-16 sequence in a raw buffer ends.
 *
 * Params:  buffer = Raw buffer of code units.
 *          max    = Number of valid code units in buffer.
 *
 * Returns: Number of leading code units that form complete sequences;
 *          0 if no sequence start is found (UTF-8) or none is complete.
 */
size_t endOfLastUTFSequence(C)(const C[] buffer, const size_t max)
    pure @system nothrow
{
    static if(is(C == char))
    {
        //Walk backwards to the first byte that can start a sequence.
        size_t pos = max;
        while(pos > 0)
        {
            --pos;
            const seqLength = utf8Stride[buffer[pos]];
            if(seqLength == 0xFF){continue;}

            //A sequence reaching beyond max is incomplete and ends the usable
            //part at its start. Otherwise everything up to max is usable
            //(invalid code points are caught at decoding).
            const remaining = max - pos;
            return seqLength > remaining ? pos : max;
        }
        return 0;
    }
    else
    {
        //Walk forwards sequence by sequence until the next one would not fit.
        size_t complete = 0;
        while(complete < max)
        {
            const next = complete + stride(buffer, complete);
            if(next > max){break;}
            complete = next;
        }
        return complete;
    }
}
/*
 * Decode a UTF-8 or UTF-16 buffer (with no incomplete sequences at the end)
 * into bufferSpace_ and point buffer_ at the decoded characters.
 *
 * Params:  source = Code units to decode.
 */
void decodeUTF(C)(const C[] source) pure @system
{
    size_t decoded = 0;
    size_t pos = 0;
    while(pos < source.length)
    {
        const unit = source[pos];
        if(unit >= 0x80)
        {
            //Multi-unit sequence: decode() advances pos past it.
            bufferSpace_[decoded++] = decode(source, pos);
            continue;
        }
        //Code units below 0x80 are copied without calling decode().
        bufferSpace_[decoded++] = unit;
        ++pos;
    }
    buffer_ = bufferSpace_[0 .. decoded];
}
}
/**
 * Determine if all characters in an array are printable.
 *
 * Printable characters are tab, line feed, carriage return, U+0085 and
 * the ranges U+0020-U+007E, U+00A0-U+D7FF, U+E000-U+FFFD and
 * U+10000-U+10FFFF (YAML's c-printable production).
 *
 * Params:  chars = Characters to check.
 *
 * Returns: True if all the characters are printable (also for an empty array),
 *          false otherwise.
 */
bool printable(const dchar[] chars) pure @safe nothrow
{
    foreach(c; chars)
    {
        const ok = (c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
                   (c >= 0x20 && c <= 0x7E) ||
                   (c >= 0xA0 && c <= '\uD7FF') ||
                   (c >= '\uE000' && c <= '\uFFFD') ||
                   //Characters outside the Basic Multilingual Plane are printable too.
                   (c >= 0x10000 && c <= 0x10FFFF);
        if(!ok){return false;}
    }
    return true;
}
//Unittests.
//Check that a UTF-16 BOM selects both the encoding and the stream endianness.
void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");

    //Build a reader over bytes and verify what was detected.
    void check(ubyte[] bytes, Encoding expectedEncoding, Endian expectedEndian)
    {
        Reader reader = new R(new MemoryStream(bytes));
        assert(reader.encoding == expectedEncoding);
        assert(reader.stream_.endian == expectedEndian);
    }

    //BOM followed by 'z' (U+007A) in the matching byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];
    check(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    check(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
//Check peek(), prefix() and forward() on a short UTF-8 document.
void testPeekPrefixForward(R)()
{
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");
    ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    Reader reader = new R(new MemoryStream(data));

    //The reader terminates the decoded data with '\0'.
    foreach(i, expected; "data\0"d)
    {
        assert(reader.peek(i) == expected);
    }
    assert(reader.prefix(4) == "data");
    //Asking for more than is available yields only what exists.
    assert(reader.prefix(6) == "data\0");

    reader.forward(2);
    assert(reader.peek(1) == 'a');
    //Only "ta\0" is left, so index 3 is past the end.
    assert(collectException(reader.peek(3)));
}
//Check that the same text is read back from UTF-8, UTF-16 and UTF-32 input.
void testUTF(R)()
{
    writeln(typeid(R).toString() ~ ": UTF formats unittest");
    dchar[] data = cast(dchar[])"data";

    //Prepend the BOM to the raw bytes of text and check what the reader decodes.
    void check(T)(T[] text, BOM bom)
    {
        auto rawBytes = (cast(ubyte*)text.ptr)[0 .. text.length * T.sizeof];
        ubyte[] bytes = ByteOrderMarks[bom] ~ rawBytes;
        Reader reader = new R(new MemoryStream(bytes));
        foreach(i, expected; "data"d)
        {
            assert(reader.peek(i) == expected);
        }
    }

    //The text is laid out in native byte order, so the BOM must match the host.
    const bigEndianHost = endian == Endian.bigEndian;
    check!char(to!(char[])(data), BOM.UTF8);
    check!wchar(to!(wchar[])(data), bigEndianHost ? BOM.UTF16BE : BOM.UTF16LE);
    check(data, bigEndianHost ? BOM.UTF32BE : BOM.UTF32LE);
}
//Run all reader tests, instantiated for the Reader class.
unittest
{
testEndian!Reader();
testPeekPrefixForward!Reader();
testUTF!Reader();
}