dyaml/dyaml/reader.d
Ferdinand Majerech 283c42bf8f Initial commit.
2011-08-16 14:53:13 +02:00

483 lines
17 KiB
D

// Copyright Ferdinand Majerech 2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
module dyaml.reader;
import core.stdc.string;
import std.algorithm;
import std.conv;
import std.exception;
import std.stdio;
import std.stream;
import std.string;
import std.system;
import std.utf;
import dyaml.exception;
package:
///Exception thrown when the Reader fails to read or decode its input.
class ReaderException : YAMLException
{
    /**
     * Construct a ReaderException.
     *
     * Params:  msg = Description of the error. It is appended to the fixed
     *                "Error reading YAML stream: " prefix.
     */
    this(string msg)
    {
        super("Error reading YAML stream: " ~ msg);
    }
}
///Reads data from a stream and converts it to UTF-32 (dchar) data.
final class Reader
{
private:
///Unicode encodings the Reader can decode.
enum UTF
{
    _8,  ///UTF-8.
    _16, ///UTF-16.
    _32  ///UTF-32.
}
///Input stream, wrapped in an EndianStream by the constructor.
EndianStream stream_;
///Buffer of decoded (UTF-32) characters loaded so far.
dchar[] buffer_;
///Index of the current character within buffer_. peek() reads relative to it
///and forward() advances it; updateBuffer() resets it to 0 after compacting.
uint bufferOffset_ = 0;
///Index of the current character in the stream (incremented by forward()).
size_t charIndex_ = 0;
///Encoding of the input stream. Set by the constructor; UTF-8 by default.
UTF utf_= UTF._8;
///Current line in file (incremented by forward() at every line break).
uint line_;
///Current column in file (reset to 0 by forward() at every line break).
uint column_;
///Capacity of the UTF-8 raw buffer, in chars.
static immutable bufferLength8_ = 8;
///Capacity of the UTF-16 raw buffer, in wchars (half of bufferLength8_).
static immutable bufferLength16_ = bufferLength8_ / 2;
//The two raw buffers are members of one union, so they overlap in memory;
//which one is in use is selected by utf_.
union
{
///Buffer to hold UTF-8 data before decoding.
char[bufferLength8_] rawBuffer8_;
///Buffer to hold UTF-16 data before decoding.
wchar[bufferLength16_] rawBuffer16_;
}
///Number of elements (chars or wchars) held in the used raw buffer.
uint rawUsed_ = 0;
public:
/**
 * Construct a Reader over a stream, detecting its encoding from the BOM.
 *
 * Streams with fewer than two bytes available, and streams for which
 * readBOM() reports no BOM, are read as UTF-8.
 *
 * Params:  stream = Input stream. Must be readable.
 *
 * Throws:  ReaderException if the stream is invalid: a UTF-16 stream with an
 *          odd number of bytes left, or a UTF-32 stream whose remaining byte
 *          count is not divisible by 4.
 */
this(Stream stream)
in
{
    assert(stream.readable, "Can't read YAML from a non-readable stream");
}
body
{
    stream_ = new EndianStream(stream);

    //handle files short enough not to have a BOM
    if(stream_.available < 2)
    {
        utf_ = UTF._8;
        return;
    }

    //readBOM will determine and set stream endianness
    const bom = stream_.readBOM(2);
    switch(bom)
    {
        //No BOM found: treat as UTF-8.
        case -1:
            //readBOM() eats two more bytes in this case so get them back
            //and store them, low byte first, as the start of the UTF-8 data.
            const wchar word = stream_.getcw();
            rawBuffer8_[0] = cast(char)(word & 0xFF);
            rawBuffer8_[1] = cast(char)(word >> 8);
            rawUsed_ = 2;
            utf_ = UTF._8;
            break;
        //UTF-8 BOM.
        case 0:
            utf_ = UTF._8;
            break;
        //UTF-16 BOM (either byte order).
        case 1, 2:
            //readBOM() eats two more bytes in this case so get them back
            utf_ = UTF._16;
            rawBuffer16_[0] = stream_.getcw();
            rawUsed_ = 1;
            enforce(stream_.available % 2 == 0,
                    new ReaderException("Odd number of bytes in an UTF-16 stream"));
            break;
        //UTF-32 BOM (either byte order).
        case 3, 4:
            enforce(stream_.available % 4 == 0,
                    new ReaderException("Number of bytes in an UTF-32 stream not divisible by 4"));
            utf_ = UTF._32;
            break;
        default:
            assert(false, "Unknown UTF BOM");
    }
}
///Destroy the Reader, releasing its reference to the character buffer.
~this()
{
    //Reset the buffer, then explicitly drop the reference to it.
    clear(buffer_);
    buffer_ = null;
}
/**
 * Look at a character ahead of the current position without consuming it.
 *
 * Params:  index = Offset of the wanted character from the current position
 *                  (0 is the current character).
 *
 * Returns: Character at the specified offset.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar peek(in size_t index = 0)
{
    updateBuffer(index + 1);

    //Must be computed after updateBuffer(), which may reset bufferOffset_.
    const position = bufferOffset_ + index;
    enforce(buffer_.length >= position + 1,
            new ReaderException("Trying to read past the end of the stream"));

    return buffer_[position];
}
/**
 * Get up to the specified number of characters starting at current position,
 * without consuming them.
 *
 * If fewer characters than requested are left, the result is shorter than
 * length and holds whatever is buffered up to the end of the stream.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Copy of the characters starting at current position.
 *
 * Throws:  ReaderException if invalid data is read.
 */
dstring prefix(in size_t length)
{
    if(length == 0){return "";}

    updateBuffer(length);

    const start = bufferOffset_;
    const stop  = min(buffer_.length, start + length);

    //The buffer is later overwritten in place (memmove in updateBuffer()),
    //so hand out an independent immutable copy, not a slice of it.
    return buffer_[start .. stop].idup;
}
/**
 * Consume and return the character at the current position.
 *
 * Returns: The character that was at the current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar get()
{
    //Read first, then advance; if peek() throws the position is unchanged.
    const next = peek();
    forward();
    return next;
}
/**
 * Consume and return the specified number of characters.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters that started at the current position, as returned
 *          by prefix(length).
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dstring get(in size_t length)
{
    //Copy the characters out first, then move past them.
    auto chars = prefix(length);
    forward(length);
    return chars;
}
/**
 * Move current position forward, keeping line/column counters up to date.
 *
 * A line break is any of LF, NEL (U+0085), LS (U+2028), PS (U+2029), or a CR
 * that is not immediately followed by LF (so CR LF counts once, at the LF).
 * A BOM character (U+FEFF) does not advance the column.
 *
 * Params:  length = Number of characters to move position forward.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
void forward(size_t length = 1)
{
    //One extra character so a CR can look at its successor.
    updateBuffer(length + 1);

    //updateBuffer() loads only as much as the stream has. Verify that it is
    //enough, so that we report the documented error rather than indexing
    //past the end of the buffer.
    enforce(buffer_.length >= bufferOffset_ + length,
            new ReaderException("Trying to read past the end of the stream"));

    while(length > 0)
    {
        const c = buffer_[bufferOffset_];
        ++bufferOffset_;
        ++charIndex_;

        //Plain comparisons; no per-character array allocation.
        const loneCR = c == '\r' &&
                       (bufferOffset_ >= buffer_.length ||
                        buffer_[bufferOffset_] != '\n');
        const lineBreak = c == '\n' || c == '\u0085' ||
                          c == '\u2028' || c == '\u2029' || loneCR;

        if(lineBreak)
        {
            ++line_;
            column_ = 0;
        }
        else if(c != '\uFEFF'){++column_;}

        --length;
    }
}
///Get a Mark built from the current line and column, used for error messages.
@property Mark mark() const
{
    return Mark(line_, column_);
}
///Get current line number.
@property uint line() const
{
    return line_;
}
///Get current column number.
@property uint column() const
{
    return column_;
}
///Get index of the current character in the stream.
@property size_t charIndex() const
{
    return charIndex_;
}
private:
/**
 * Make sure more than length characters are buffered after the buffer offset.
 *
 * Consumed characters are dropped from the front of the buffer, then more
 * are loaded from the stream. If the stream runs out first, everything that
 * is left is loaded and the buffer is terminated with a single '\0'.
 *
 * Params:  length = Number of characters we need to read.
 *
 * Throws:  ReaderException if invalid data is read.
 */
void updateBuffer(in size_t length)
{
    //Enough characters are buffered already.
    if(buffer_.length > bufferOffset_ + length){return;}

    //Get rid of consumed data by shifting the unread part to the front.
    if(bufferOffset_ > 0)
    {
        const size_t unread = buffer_.length - bufferOffset_;
        memmove(buffer_.ptr, buffer_.ptr + bufferOffset_, unread * dchar.sizeof);
        buffer_.length = unread;
        bufferOffset_  = 0;
    }

    //Load chars in batches of 16 (at most 64 bytes of decoded data).
    while(buffer_.length <= bufferOffset_ + length)
    {
        loadChars(16);
        if(!done){continue;}

        //End of input: make sure the buffer ends with exactly one '\0'.
        const terminated = buffer_.length > 0 && buffer_[$ - 1] == '\0';
        if(!terminated){buffer_ ~= '\0';}
        break;
    }
}
/**
 * Decode at most the specified number of characters from the stream and
 * append them to the character buffer.
 *
 * Stops early once done() reports that no input is left.
 *
 * Params:  chars = Maximum number of characters to load.
 *
 * Throws:  ReaderException on unicode decoding error,
 *          if nonprintable characters are detected, or
 *          if there is an error reading from the stream.
 */
void loadChars(in uint chars)
{
    //Characters appended by this call start here; only those get validated.
    const oldLength = buffer_.length;

    /**
     * Get next character from the stream.
     *
     * Params:  available = Bytes available in the stream.
     *
     * Returns: Next character in the stream.
     */
    dchar getDChar(in size_t available)
    {
        switch(utf_)
        {
            case UTF._8:
                //Temp buffer for moving data in rawBuffer8_.
                char[bufferLength8_] temp;
                //Shortcut for ASCII.
                if(rawUsed_ > 0 && rawBuffer8_[0] < 128)
                {
                    //Get the first byte (one char in ASCII).
                    const dchar result = rawBuffer8_[0];
                    --rawUsed_;
                    //Move the data.
                    temp[0 .. rawUsed_] = rawBuffer8_[1 .. rawUsed_ + 1];
                    rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                    return result;
                }
                //Bytes to read.
                const readBytes = min(available, bufferLength8_ - rawUsed_);
                //Length of data in rawBuffer8_ after reading.
                const len = rawUsed_ + readBytes;
                //Read the data.
                stream_.readExact(rawBuffer8_.ptr + rawUsed_, readBytes);
                //After decoding, this will point to the first byte not decoded.
                size_t idx = 0;
                //Decode only the filled part of the buffer. Anything past len
                //is stale, and must not be allowed to complete a sequence that
                //the stream truncated (that would also make len - idx underflow).
                const dchar result = decode(rawBuffer8_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);
                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer8_[idx .. len];
                rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case UTF._16:
                //Temp buffer for moving data in rawBuffer16_.
                wchar[bufferLength16_] temp;
                //Words to read.
                size_t readWords = min(available / 2, bufferLength16_ - rawUsed_);
                //Length of data in rawBuffer16_ after reading.
                size_t len = rawUsed_;
                //Read the data.
                while(readWords > 0)
                {
                    //Due to a bug in std.stream, we have to use getcw here.
                    rawBuffer16_[len] = stream_.getcw();
                    --readWords;
                    ++len;
                }
                //After decoding, this will point to the first word not decoded.
                size_t idx = 0;
                //As for UTF-8: decode only the words actually read, so a lone
                //surrogate at the end of the stream is reported as an error.
                const dchar result = decode(rawBuffer16_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);
                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer16_[idx .. len];
                rawBuffer16_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case UTF._32:
                //No raw buffer needed; the stream yields whole code points.
                dchar result;
                stream_.read(result);
                return result;
            default: assert(false);
        }
    }

    //Remembered for the error message below.
    const oldPosition = stream_.position;
    try
    {
        foreach(i; 0 .. chars)
        {
            if(done){break;}
            const available = stream_.available;
            buffer_ ~= getDChar(available);
        }
    }
    catch(UtfException e)
    {
        const position = stream_.position;
        throw new ReaderException("Unicode decoding error between bytes " ~
                                  to!string(oldPosition) ~ " and " ~
                                  to!string(position) ~ " " ~ e.msg);
    }
    catch(ReadException e)
    {
        throw new ReaderException("Error reading from the stream: " ~ e.msg);
    }

    //Validate only the characters appended by this call.
    enforce(printable(buffer_[oldLength .. $]),
            new ReaderException("Special unicode characters are not allowed"));
}
/**
 * Determine if all characters in an array are printable.
 *
 * A character is accepted if it is tab, LF, CR, NEL (U+0085), or lies in one
 * of the ranges U+0020..U+007E, U+00A0..U+D7FF, U+E000..U+FFFD or
 * U+10000..U+10FFFF (the YAML c-printable set). Other control characters,
 * surrogate code points, U+FFFE and U+FFFF are rejected.
 *
 * Params:  chars = Characters to check.
 *
 * Returns: True if all the characters are printable (including when chars
 *          is empty), false otherwise.
 */
static pure bool printable(const ref dchar[] chars)
{
    foreach(c; chars)
    {
        const allowed = c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85 ||
                        (c >= 0x20    && c <= 0x7E)   ||
                        (c >= 0xA0    && c <= 0xD7FF) ||
                        (c >= 0xE000  && c <= 0xFFFD) ||
                        //Supplementary planes are printable as well.
                        (c >= 0x10000 && c <= 0x10FFFF);
        if(!allowed){return false;}
    }
    return true;
}
///Are we done reading? True once the stream has no bytes left and, for
///UTF-8 and UTF-16, the raw buffer holds no undecoded data either.
@property bool done()
{
    if(stream_.available != 0){return false;}
    //UTF-32 is read straight from the stream and never uses a raw buffer.
    return utf_ == UTF._32 || rawUsed_ == 0;
}
unittest
{
    writeln("D:YAML reader endian unittest");

    //Check that a Reader over the given bytes detects the expected encoding
    //and byte order.
    void checkBOM(ubyte[] bytes, UTF expectedUTF, Endian expectedEndian)
    {
        auto reader = new Reader(new MemoryStream(bytes));
        assert(reader.utf_ == expectedUTF);
        assert(reader.stream_.endian == expectedEndian);
    }

    //A UTF-16 BOM followed by U+007A in the matching byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    checkBOM(utf16LE, UTF._16, Endian.LittleEndian);
    checkBOM(utf16BE, UTF._16, Endian.BigEndian);
}
unittest
{
    writeln("D:YAML reader peek/prefix/forward unittest");

    ubyte[] data = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new Reader(new MemoryStream(data));

    //The BOM is skipped, and a '\0' terminator follows the last character.
    assert(reader.peek()  == 'd');
    foreach(i, dchar expected; "ata\0"d)
    {
        assert(reader.peek(i + 1) == expected);
    }

    //prefix() is truncated at the end of the stream instead of throwing.
    assert(reader.prefix(4) == "data");
    assert(reader.prefix(6) == "data\0");

    //After consuming "da", only "ta\0" is left.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    assert(collectException(reader.peek(3)));
}
unittest
{
    writeln("D:YAML reader UTF formats unittest");

    dchar[] data = cast(dchar[])"data";

    //Prepend the BOM to the raw bytes of text (in native byte order) and
    //check that a Reader decodes the characters back.
    void checkDecoding(T)(T[] text, BOM bom)
    {
        auto raw = (cast(ubyte*)text.ptr)[0 .. text.length * T.sizeof];
        ubyte[] bytes = ByteOrderMarks[bom] ~ raw;
        auto reader = new Reader(new MemoryStream(bytes));
        assert(reader.peek()  == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    //Pick the BOMs matching the byte order of this machine.
    const bigEndian = endian == Endian.BigEndian;
    const bom16 = bigEndian ? BOM.UTF16BE : BOM.UTF16LE;
    const bom32 = bigEndian ? BOM.UTF32BE : BOM.UTF32LE;

    checkDecoding!char(to!(char[])(data), BOM.UTF8);
    checkDecoding!wchar(to!(wchar[])(data), bom16);
    checkDecoding(data, bom32);
}
}