dyaml/dyaml/reader.d
2011-10-30 20:24:43 +01:00

531 lines
19 KiB
D

// Copyright Ferdinand Majerech 2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module dyaml.reader;
import core.stdc.string;
import std.algorithm;
import std.conv;
import std.exception;
import std.stdio;
import std.stream;
import std.string;
import std.system;
import std.utf;
import dyaml.fastcharsearch;
import dyaml.encoding;
import dyaml.exception;
package:
/**
 * Exception thrown at Reader errors.
 *
 * Every message is prefixed with "Error reading stream: ".
 */
class ReaderException : YAMLException
{
    /**
     * Construct a ReaderException.
     *
     * Params:  msg  = Description of the error; appended to the common prefix.
     *          file = Source file where the error was raised.
     *          line = Source line where the error was raised.
     */
    this(string msg, string file = __FILE__, int line = __LINE__)
    {
        enum prefix = "Error reading stream: ";
        super(prefix ~ msg, file, line);
    }
}
///Reads data from a stream and converts it to UTF-32 (dchar) data.
final class Reader
{
private:
///Input stream, wrapped in an EndianStream by the constructor.
EndianStream stream_;
///Allocated space for buffer_. Obtained with C malloc/realloc and
///released with C free in the destructor.
dchar[] bufferAllocated_;
///Buffer of currently loaded characters. Once characters have been loaded
///this is a slice of bufferAllocated_.
dchar[] buffer_;
///Current position within buffer_. Only data from this position onwards can be read.
///updateBuffer() resets it to 0 when it discards the consumed characters.
uint bufferOffset_ = 0;
///Index of the current character in the stream. Incremented by forward().
size_t charIndex_ = 0;
///Encoding of the input stream, determined by the constructor.
Encoding encoding_;
///Current line in file. Starts at 0 and is incremented by forward() on line breaks.
uint line_;
///Current column in file. Reset to 0 by forward() on line breaks.
uint column_;
///Number of bytes still available (not read) in the stream.
size_t available_;
///Capacity of rawBuffer8_ in chars (UTF-8 code units), not counting its spare element.
static immutable bufferLength8_ = 8;
///Capacity of rawBuffer16_ in wchars (UTF-16 code units), not counting its spare element.
static immutable bufferLength16_ = bufferLength8_ / 2;
//The raw buffers share storage; loadChars() uses the one matching encoding_.
union
{
///Buffer to hold UTF-8 data before decoding.
///The extra element keeps the 8-byte shift in loadChars(), which reads
///starting at index 1, inside the array.
char[bufferLength8_ + 1] rawBuffer8_;
///Buffer to hold UTF-16 data before decoding.
wchar[bufferLength16_ + 1] rawBuffer16_;
}
///Number of elements (chars or wchars) held in the used raw buffer.
uint rawUsed_ = 0;
public:
/**
 * Construct a Reader.
 *
 * Determines the encoding of the stream from its byte order mark (UTF-8 is
 * assumed when there is none) and allocates the initial character buffer.
 * Streams shorter than 2 bytes are treated as UTF-8 and get no initial
 * buffer; bufferReserve() allocates one on demand.
 *
 * Params:  stream = Input stream. Must be readable and seekable.
 *
 * Throws:  ReaderException if the stream is invalid (its byte count does not
 *          match the UTF-16/UTF-32 code unit size) or if the character
 *          buffer can't be allocated.
 */
this(Stream stream)
in
{
    assert(stream.readable && stream.seekable,
           "Can't read YAML from a stream that is not readable and seekable");
}
body
{
    stream_ = new EndianStream(stream);
    available_ = stream_.available;

    //handle files short enough not to have a BOM
    if(available_ < 2)
    {
        encoding_ = Encoding.UTF_8;
        return;
    }

    //readBOM will determine and set stream endianness
    switch(stream_.readBOM(2))
    {
        case -1:
            //readBOM() eats two more bytes in this case so get them back
            //The two bytes are split out of the wchar and queued in the
            //UTF-8 raw buffer so loadChars() decodes them first.
            const wchar bytes = stream_.getcw();
            rawBuffer8_[0] = cast(char)(bytes % 256);
            rawBuffer8_[1] = cast(char)(bytes / 256);
            rawUsed_ = 2;
            goto case 0;
        case 0: encoding_ = Encoding.UTF_8; break;
        case 1, 2:
            //readBOM() eats two more bytes in this case so get them back
            encoding_ = Encoding.UTF_16;
            rawBuffer16_[0] = stream_.getcw();
            rawUsed_ = 1;
            enforce(available_ % 2 == 0,
                    new ReaderException("Odd byte count in an UTF-16 stream"));
            break;
        case 3, 4:
            enforce(available_ % 4 == 0,
                    new ReaderException("Byte count in an UTF-32 stream not divisible by 4"));
            encoding_ = Encoding.UTF_32;
            break;
        default: assert(false, "Unknown UTF BOM");
    }

    //The BOM (and any bytes queued above) have been consumed; recount what is left.
    available_ = stream_.available;

    auto ptr = cast(dchar*)core.stdc.stdlib.malloc(dchar.sizeof * 256);
    //malloc returns null on failure; slicing a null pointer would give a
    //buffer that looks valid to bufferReserve() but crashes on first write.
    enforce(ptr !is null,
            new ReaderException("Unable to allocate memory for the character buffer"));
    bufferAllocated_ = ptr[0 .. 256];
}
///Destroy the Reader, releasing the C-allocated character buffer.
~this()
{
    //Drop both views before handing the memory back to the C allocator.
    auto memory = bufferAllocated_.ptr;
    bufferAllocated_ = null;
    buffer_ = null;
    core.stdc.stdlib.free(memory);
}
/**
 * Look at a character without consuming it.
 *
 * Params:  index = Offset of the wanted character, counted from the
 *                  current position in the stream.
 *
 * Returns: Character at that offset.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar peek(in size_t index = 0)
{
    //Refill unless the buffer extends at least one character past the wanted one.
    const bool needsRefill = bufferOffset_ + index + 1 >= buffer_.length;
    if(needsRefill){updateBuffer(index + 1);}

    //updateBuffer() may reset bufferOffset_, so the position is computed afterwards.
    const size_t position = bufferOffset_ + index;
    if(position < buffer_.length){return buffer_[position];}

    throw new ReaderException("Trying to read past the end of the stream");
}
/**
 * Get the given number of characters beginning at the current position.
 *
 * Note:    The result is only a "view" into the internal buffer,
 *          which WILL get invalidated after other Reader calls.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters starting at current position, or an empty slice if out of bounds.
 */
const(dstring) prefix(in size_t length)
{
    //A prefix is the slice that begins right at the current position.
    const size_t start = 0;
    return slice(start, length);
}
/**
 * Get a view of a range of characters in the internal buffer.
 *
 * Note:    The result is only a "view" into the internal buffer,
 *          which WILL get invalidated after other Reader calls.
 *
 * Params:  start = Start of the slice relative to current position.
 *          end   = End of the slice relative to current position.
 *
 * Returns: Slice into the internal buffer, cut short at the end of the loaded
 *          data, or an empty slice if out of bounds.
 */
const(dstring) slice(size_t start, size_t end)
{
    const bool needsRefill = bufferOffset_ + end >= buffer_.length;
    if(needsRefill){updateBuffer(end);}

    //Translate to absolute buffer positions only after the possible refill,
    //because updateBuffer() may reset bufferOffset_.
    const size_t first = bufferOffset_ + start;
    const size_t last  = min(buffer_.length, bufferOffset_ + end);

    if(last <= first){return "";}
    return cast(dstring)buffer_[first .. last];
}
/**
 * Read the next character and move the stream position past it.
 *
 * Returns: Next character.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar get()
{
    //The position only moves once the character has been read successfully.
    scope(success){forward();}
    return peek();
}
/**
 * Read the given number of characters and move the stream position past them.
 *
 * Unlike prefix(), the result is a copy and stays valid after further
 * Reader calls.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters starting at current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dstring get(in size_t length)
{
    //Copy first: forward() may refill the buffer the prefix view points into.
    dstring copy = prefix(length).idup;
    forward(length);
    return copy;
}
/**
 * Move current position forward.
 *
 * Updates the character index and the line/column counters for every
 * character skipped. "\r\n" counts as a single line break, and a byte
 * order mark ('\uFEFF') does not advance the column.
 *
 * Params:  length = Number of characters to move position forward.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
void forward(size_t length = 1)
{
    mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

    //One extra character is requested so the '\r' lookahead below stays in bounds.
    if(buffer_.length <= bufferOffset_ + length + 1)
    {
        updateBuffer(length + 1);
    }

    //Without this check, moving past the loaded data would index buffer_ out
    //of bounds and leave bufferOffset_ beyond buffer_.length.
    if(buffer_.length < bufferOffset_ + length)
    {
        throw new ReaderException("Trying to read past the end of the stream");
    }

    while(length > 0)
    {
        const c = buffer_[bufferOffset_];
        ++bufferOffset_;
        ++charIndex_;
        //New line.
        //A '\r' directly followed by '\n' is not counted here; the '\n' will be.
        if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
        {
            ++line_;
            column_ = 0;
        }
        else if(c != '\uFEFF'){++column_;}
        --length;
    }
}
///Get a Mark built from the current line and column, used for error messages.
@property Mark mark() const {return Mark(line_, column_);}
///Get current line number.
@property uint line() const {return line_;}
///Get current column number.
@property uint column() const {return column_;}
///Get index of the current character in the stream.
@property size_t charIndex() const {return charIndex_;}
///Get encoding of the input stream.
@property Encoding encoding() const {return encoding_;}
private:
/**
 * Make sure length characters after the buffer offset can be read.
 *
 * Characters before the buffer offset are discarded first. If the stream
 * does not hold enough characters, as many as possible are loaded and a
 * single '\0' is appended to mark the end of the stream.
 *
 * Params:  length = Number of characters we need to read.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
void updateBuffer(in size_t length)
{
    //Drop the consumed characters by shifting the unread tail to the front.
    if(bufferOffset_ != 0)
    {
        const size_t unread = buffer_.length - bufferOffset_;
        memmove(buffer_.ptr, buffer_.ptr + bufferOffset_, unread * dchar.sizeof);
        buffer_ = buffer_[0 .. unread];
        bufferOffset_ = 0;
    }

    //Load chars in batches of at most 1024 bytes (256 chars)
    while(bufferOffset_ + length >= buffer_.length)
    {
        loadChars(256);
        if(!done){continue;}

        //The stream is exhausted: terminate the data with '\0' unless it already is.
        const size_t count = buffer_.length;
        const bool terminated = count != 0 && buffer_[count - 1] == '\0';
        if(!terminated)
        {
            bufferReserve(count + 1);
            buffer_ = bufferAllocated_[0 .. count + 1];
            buffer_[count] = '\0';
        }
        break;
    }
}
/**
 * Load at most specified number of characters.
 *
 * Decoded characters are appended to buffer_. Loading stops early when the
 * stream is exhausted. Whatever was appended is checked with printable()
 * when the function exits, even when it exits with an exception.
 *
 * Params:  chars = Maximum number of characters to load.
 *
 * Throws:  ReaderException on Unicode decoding error,
 *          if nonprintable characters are detected, or
 *          if there is an error reading from the stream.
 */
void loadChars(size_t chars)
{
    ///Get next character from the stream.
    dchar getDChar()
    {
        final switch(encoding_)
        {
            case Encoding.UTF_8:
                //Temp buffer for moving data in rawBuffer8_.
                char[bufferLength8_] temp;
                //Shortcut for ASCII.
                if(rawUsed_ > 0 && rawBuffer8_[0] < 128)
                {
                    //Get the first byte (one char in ASCII).
                    const dchar result = rawBuffer8_[0];
                    --rawUsed_;
                    //Move the data.
                    //Shifts 8 bytes left by one through temp; this reads up to
                    //index bufferLength8_, which is why rawBuffer8_ has a spare element.
                    *(cast(ulong*)temp.ptr) = *(cast(ulong*)(rawBuffer8_.ptr + 1));
                    *(cast(ulong*)rawBuffer8_.ptr) = *(cast(ulong*)temp.ptr);
                    return result;
                }

                //Bytes to read.
                const readBytes = min(available_, bufferLength8_ - rawUsed_);
                available_ -= readBytes;
                //Length of data in rawBuffer8_ after reading.
                const len = rawUsed_ + readBytes;
                //Read the data.
                stream_.readExact(rawBuffer8_.ptr + rawUsed_, readBytes);

                //After decoding, this will point to the first byte not decoded.
                size_t idx = 0;
                //Only the valid part of the buffer is decoded. A sequence cut off
                //by the end of the stream then raises a decoding error instead of
                //consuming stale bytes and making len - idx underflow.
                const dchar result = decode(rawBuffer8_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);

                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer8_[idx .. len];
                rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case Encoding.UTF_16:
                //Temp buffer for moving data in rawBuffer16_.
                wchar[bufferLength16_] temp;
                //Words to read.
                size_t readWords = min(available_ / 2, bufferLength16_ - rawUsed_);
                available_ -= readWords * 2;
                //Length of data in rawBuffer16_ after reading.
                size_t len = rawUsed_;
                //Read the data.
                while(readWords > 0)
                {
                    //Due to a bug in std.stream, we have to use getcw here.
                    rawBuffer16_[len] = stream_.getcw();
                    --readWords;
                    ++len;
                }

                //After decoding, this will point to the first word not decoded.
                size_t idx = 0;
                //Decode only the valid part, for the same reason as in the UTF-8 case
                //(a lone high surrogate at the end of the stream).
                const dchar result = decode(rawBuffer16_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);

                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer16_[idx .. len];
                rawBuffer16_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case Encoding.UTF_32:
                dchar result;
                available_ -= 4;
                stream_.read(result);
                return result;
        }
    }

    const oldLength = buffer_.length;
    const oldPosition = stream_.position;

    //Preallocating memory to limit GC reallocations.
    bufferReserve(buffer_.length + chars);
    buffer_ = bufferAllocated_[0 .. buffer_.length + chars];
    scope(exit)
    {
        //chars now holds the number of characters that were NOT loaded,
        //so this trims buffer_ back to the data actually decoded.
        buffer_ = buffer_[0 .. $ - chars];
        enforce(printable(buffer_[oldLength .. $]),
                new ReaderException("Special unicode characters are not allowed"));
    }

    try for(uint c = 0; chars; --chars, ++c)
    {
        if(done){break;}
        buffer_[oldLength + c] = getDChar();
    }
    catch(UtfException e)
    {
        const position = stream_.position;
        //text() concatenates the pieces as-is; format() without format
        //specifiers would treat e.msg as a format string.
        throw new ReaderException(text("Unicode decoding error between bytes ",
                                       oldPosition, " and ", position, " : ", e.msg));
    }
    catch(ReadException e)
    {
        throw new ReaderException(e.msg);
    }
}
/**
 * Check that an array holds nothing but printable characters.
 *
 * Accepted are tab, line feed, carriage return, 0x85, and the ranges
 * 0x20-0x7E, 0xA0-0xD7FF and 0xE000-0xFFFD.
 *
 * Params:  chars = Characters to check.
 *
 * Returns: True if all the characters are printable, false otherwise.
 */
static bool printable(const ref dchar[] chars) pure
{
    foreach(c; chars)
    {
        const bool lineOrTab  = c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85;
        const bool asciiRange = c >= 0x20 && c <= 0x7E;
        const bool lowRange   = c >= 0xA0 && c <= '\uD7FF';
        const bool highRange  = c >= '\uE000' && c <= '\uFFFD';

        if(lineOrTab || asciiRange || lowRange || highRange){continue;}
        return false;
    }
    return true;
}
///Are we done reading? True once the stream has no bytes left and,
///for UTF-8 and UTF-16, the raw buffer has been fully decoded.
@property bool done() const
{
    if(available_ != 0){return false;}
    //UTF-32 is read straight from the stream, with no raw buffer to drain.
    if(encoding_ == Encoding.UTF_32){return true;}

    const bool usesRawBuffer = encoding_ == Encoding.UTF_8 ||
                               encoding_ == Encoding.UTF_16;
    return usesRawBuffer && rawUsed_ == 0;
}
/**
 * Ensure there is space for at least capacity characters in bufferAllocated_.
 *
 * Does nothing if the allocation is already large enough. Otherwise the
 * allocation is grown to exactly capacity characters and buffer_ is
 * re-pointed at the (possibly moved) memory, keeping its length.
 *
 * Params:  capacity = Minimum number of characters bufferAllocated_ must hold.
 *
 * Throws:  ReaderException if the memory can't be allocated.
 */
void bufferReserve(in size_t capacity)
{
    if(bufferAllocated_.length >= capacity){return;}

    auto newPtr = core.stdc.stdlib.realloc(bufferAllocated_.ptr,
                                           capacity * dchar.sizeof);
    //On failure realloc returns null and leaves the old block untouched, so
    //bufferAllocated_ must not be overwritten: that would leak the old block
    //and install a slice over a null pointer.
    enforce(newPtr !is null,
            new ReaderException("Unable to allocate memory for the character buffer"));

    bufferAllocated_ = (cast(dchar*)newPtr)[0 .. capacity];
    buffer_ = bufferAllocated_[0 .. buffer_.length];
}
//The UTF-16 byte order mark must select both the encoding and the stream endianness.
unittest
{
    writeln("D:YAML reader endian unittest");

    //Builds a Reader over bytes and checks what the constructor detected.
    void check(ubyte[] bytes, Encoding wantedEncoding, Endian wantedEndian)
    {
        auto r = new Reader(new MemoryStream(bytes));
        assert(r.encoding_ == wantedEncoding);
        assert(r.stream_.endian == wantedEndian);
    }

    //A BOM followed by the code unit 0x007A in the matching byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    check(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    check(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
//peek(), prefix() and forward() on a short UTF-8 stream with a BOM.
unittest
{
    writeln("D:YAML reader peek/prefix/forward unittest");

    ubyte[] bytes = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto r = new Reader(new MemoryStream(bytes));

    //The stream content is followed by a terminating '\0'.
    foreach(i, expected; "data\0"d)
    {
        assert(r.peek(i) == expected);
    }

    assert(r.prefix(4) == "data");
    //A prefix longer than the data is cut short after the terminator.
    assert(r.prefix(6) == "data\0");

    r.forward(2);
    assert(r.peek(1) == 'a');
    //Two characters were consumed, so offset 3 is now past the terminator.
    assert(collectException(r.peek(3)));
}
//The same text must read identically from UTF-8, UTF-16 and UTF-32 input.
unittest
{
    writeln("D:YAML reader UTF formats unittest");

    dchar[] sample = cast(dchar[])"data";

    //Prepends the given BOM to the raw code units and reads the text back.
    void checkEncoding(T)(T[] encoded, BOM mark)
    {
        ubyte[] payload = (cast(ubyte*)encoded.ptr)[0 .. encoded.length * T.sizeof];
        ubyte[] bytes = ByteOrderMarks[mark] ~ payload;
        auto r = new Reader(new MemoryStream(bytes));

        foreach(i, expected; "data"d)
        {
            assert(r.peek(i) == expected);
        }
    }

    //The code units are in native byte order, so the BOM must match the host.
    const bool bigEndianHost = endian == Endian.bigEndian;

    checkEncoding!char(to!(char[])(sample), BOM.UTF8);
    checkEncoding!wchar(to!(wchar[])(sample), bigEndianHost ? BOM.UTF16BE : BOM.UTF16LE);
    checkEncoding(sample, bigEndianHost ? BOM.UTF32BE : BOM.UTF32LE);
}
}