// Copyright Ferdinand Majerech 2011.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
module dyaml.reader;
|
|
|
|
|
|
import core.stdc.string;
|
|
|
|
import std.algorithm;
|
|
import std.conv;
|
|
import std.exception;
|
|
import std.stdio;
|
|
import std.stream;
|
|
import std.string;
|
|
import std.system;
|
|
import std.utf;
|
|
|
|
import dyaml.exception;
|
|
|
|
|
|
package:
|
|
|
|
///Exception thrown when the Reader fails to read or decode its input.
class ReaderException : YAMLException
{
    /**
     * Construct a ReaderException.
     *
     * Params:  msg = Description of the error. It is appended to the fixed
     *                "Error reading YAML stream: " prefix.
     */
    this(string msg)
    {
        super("Error reading YAML stream: " ~ msg);
    }
}
|
|
|
|
|
|
///Reads data from a stream and converts it to UTF-32 (dchar) data.
|
|
final class Reader
|
|
{
|
|
private:
|
|
///Unicode encodings the Reader distinguishes between.
enum UTF
{
    _8,  ///UTF-8.
    _16, ///UTF-16.
    _32  ///UTF-32.
}
|
|
|
|
///Stream the input is read from.
EndianStream stream_;
///Decoded characters that are currently loaded.
dchar[] buffer_;
///Offset of the current character in buffer_. Only data from this offset on can be read.
uint bufferOffset_;
///Index of the current character in the stream.
size_t charIndex_;
///Encoding of the input stream.
UTF utf_ = UTF._8;
///Current line in file.
uint line_;
///Current column in file.
uint column_;

///Capacity of rawBuffer8_, in chars.
static immutable bufferLength8_ = 8;
///Capacity of rawBuffer16_, in wchars (half of bufferLength8_).
static immutable bufferLength16_ = bufferLength8_ / 2;

//Both raw buffers are members of one anonymous union, so they share storage;
//which one is used depends on utf_.
union
{
    ///Holds UTF-8 code units that were read but not decoded yet.
    char[bufferLength8_] rawBuffer8_;
    ///Holds UTF-16 code units that were read but not decoded yet.
    wchar[bufferLength16_] rawBuffer16_;
}
///Number of elements currently held in the raw buffer in use.
uint rawUsed_;
|
|
|
|
public:
|
|
/**
 * Construct a Reader.
 *
 * Wraps the stream in an EndianStream and picks the encoding based on the
 * result of readBOM(). A stream with fewer than 2 bytes available is
 * treated as UTF-8 without looking for a BOM.
 *
 * Params:  stream = Input stream. Must be readable.
 *
 * Throws:  ReaderException if the stream is invalid.
 */
this(Stream stream)
in
{
    assert(stream.readable, "Can't read YAML from a non-readable stream");
}
body
{
    stream_ = new EndianStream(stream);

    //Too short to contain a BOM.
    if(stream_.available < 2)
    {
        utf_ = UTF._8;
        return;
    }

    //readBOM will determine and set stream endianness
    const bom = stream_.readBOM(2);
    switch(bom)
    {
        case -1:
            //readBOM() eats two more bytes in this case so get them back
            //and keep them as the first raw UTF-8 bytes (low byte first).
            const wchar word = stream_.getcw();
            rawBuffer8_[0] = cast(char)(word & 0xFF);
            rawBuffer8_[1] = cast(char)(word >> 8);
            rawUsed_ = 2;
            utf_ = UTF._8;
            break;
        case 0:
            utf_ = UTF._8;
            break;
        case 1, 2:
            //readBOM() eats two more bytes in this case so get them back
            utf_ = UTF._16;
            rawBuffer16_[0] = stream_.getcw();
            rawUsed_ = 1;
            enforce(stream_.available % 2 == 0,
                    new ReaderException("Odd number of bytes in an UTF-16 stream"));
            break;
        case 3, 4:
            enforce(stream_.available % 4 == 0,
                    new ReaderException("Number of bytes in an UTF-32 stream not divisible by 4"));
            utf_ = UTF._32;
            break;
        default:
            assert(false, "Unknown UTF BOM");
    }
}
|
|
|
|
///Destroy the Reader, clearing its character buffer and dropping the reference to it.
~this()
{
    clear(buffer_);
    buffer_ = null;
}
|
|
|
|
/**
 * Look at a character without moving the current position.
 *
 * Params:  index = Index of the character to get, relative to the current
 *                  position in the stream.
 *
 * Returns: Character at specified position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar peek(in size_t index = 0)
{
    updateBuffer(index + 1);

    //Position of the requested character within buffer_.
    const position = bufferOffset_ + index;
    enforce(position + 1 <= buffer_.length,
            new ReaderException("Trying to read past the end of the stream"));
    return buffer_[position];
}
|
|
|
|
/**
 * Get a copy of the characters starting at the current position.
 *
 * If fewer than length characters are loaded after updating the buffer,
 * the result is shorter than requested.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters starting at current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dstring prefix(in size_t length)
{
    if(length > 0)
    {
        updateBuffer(length);
        const wanted = bufferOffset_ + length;
        const stop = wanted < buffer_.length ? wanted : buffer_.length;
        //The buffer content is moved around in place (see updateBuffer()),
        //so return a copy instead of a slice that could end up referencing changed data.
        dchar[] copy = buffer_[bufferOffset_ .. stop].dup;
        return cast(dstring)copy;
    }
    return "";
}
|
|
|
|
/**
 * Read the character at the current position and move past it.
 *
 * Returns: Next character.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar get()
{
    const c = peek();
    forward();
    return c;
}
|
|
|
|
/**
 * Read the specified number of characters and move the position beyond them.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters starting at current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dstring get(in size_t length)
{
    auto chars = prefix(length);
    forward(length);
    return chars;
}
|
|
|
|
/**
 * Move current position forward, updating the line, column and character index.
 *
 * A line break ('\n', NEL, LS, PS, or a '\r' that is not followed by '\n')
 * increments the line and resets the column. Any other character except the
 * byte order mark (U+FEFF) increments the column.
 *
 * Params:  length = Number of characters to move position forward.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
void forward(size_t length = 1)
{
    updateBuffer(length + 1);

    //updateBuffer() loads only as much as the stream has. Make sure that was
    //enough, so the loop below never indexes past the end of buffer_.
    enforce(buffer_.length >= bufferOffset_ + length,
            new ReaderException("Trying to read past the end of the stream"));

    while(length > 0)
    {
        const c = buffer_[bufferOffset_];
        ++bufferOffset_;
        ++charIndex_;

        //In "\r\n" only the '\n' counts as the line break; a lone '\r' counts itself.
        const bool loneCR = c == '\r' &&
                            (bufferOffset_ >= buffer_.length ||
                             buffer_[bufferOffset_] != '\n');

        //new line
        if(c == '\n' || c == '\u0085' || c == '\u2028' || c == '\u2029' || loneCR)
        {
            ++line_;
            column_ = 0;
        }
        //The byte order mark does not take up a column.
        else if(c != '\uFEFF'){++column_;}
        --length;
    }
}
|
|
|
|
///Get a Mark describing current stream position (line and column), used for error messages.
@property Mark mark() const
{
    return Mark(line_, column_);
}
|
|
|
|
///Get the line the current position is on.
@property uint line() const
{
    return line_;
}
|
|
|
|
///Get current column number.
@property uint column() const
{
    return column_;
}
|
|
|
|
///Get the index, within the stream, of the character at the current position.
@property size_t charIndex() const
{
    return charIndex_;
}
|
|
|
|
private:
|
|
/**
 * Make sure length characters after the buffer offset can be read.
 *
 * Already consumed characters are dropped from the front of the buffer and
 * new ones are loaded from the stream. If there are not enough characters
 * in the stream, as many as possible are loaded and a single '\0' is
 * appended to the buffer once the input is exhausted.
 *
 * Params:  length = Number of characters we need to read.
 *
 * Throws:  ReaderException if loading characters fails (see loadChars()).
 */
void updateBuffer(in size_t length)
{
    //Enough characters are loaded already.
    if(bufferOffset_ + length < buffer_.length){return;}

    //Drop the consumed part by moving the unread tail to the start of the buffer.
    if(bufferOffset_ > 0)
    {
        const size_t remaining = buffer_.length - bufferOffset_;
        memmove(buffer_.ptr, buffer_.ptr + bufferOffset_, remaining * dchar.sizeof);
        buffer_.length = remaining;
        bufferOffset_ = 0;
    }

    //Load characters in batches of at most 16 until there are enough of them.
    while(bufferOffset_ + length >= buffer_.length)
    {
        loadChars(16);

        if(!done){continue;}

        //End of input: terminate the buffer with exactly one '\0'.
        if(buffer_.length == 0 || buffer_[$ - 1] != '\0')
        {
            buffer_ ~= '\0';
        }
        break;
    }
}
|
|
|
|
/**
 * Load at most specified number of characters.
 *
 * Characters are decoded from the stream according to utf_ and appended to
 * buffer_. Loading stops early once the input is exhausted (see done()).
 *
 * Params:  chars = Maximum number of characters to load.
 *
 * Throws:  ReaderException on unicode decoding error,
 *          if nonprintable characters are detected, or
 *          if there is an error reading from the stream.
 */
void loadChars(in uint chars)
{
    const oldLength = buffer_.length;

    /**
     * Get next character from the stream.
     *
     * Params:  available = Bytes available in the stream.
     *
     * Returns: Next character in the stream.
     */
    dchar getDChar(in size_t available)
    {
        switch(utf_)
        {
            case UTF._8:
                //Temp buffer for moving data in rawBuffer8_.
                char[bufferLength8_] temp;
                //Shortcut for ASCII.
                if(rawUsed_ > 0 && rawBuffer8_[0] < 128)
                {
                    //Get the first byte (one char in ASCII).
                    const dchar result = rawBuffer8_[0];
                    --rawUsed_;
                    //Move the data.
                    temp[0 .. rawUsed_] = rawBuffer8_[1 .. rawUsed_ + 1];
                    rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                    return result;
                }

                //Bytes to read.
                const readBytes = min(available, bufferLength8_ - rawUsed_);
                //Length of data in rawBuffer8_ after reading.
                const len = rawUsed_ + readBytes;
                //Read the data.
                stream_.readExact(rawBuffer8_.ptr + rawUsed_, readBytes);

                //After decoding, this will point to the first byte not decoded.
                size_t idx = 0;
                //Decode only the valid part of the buffer. Bytes past len are
                //leftovers of earlier reads; a sequence truncated at the end of
                //the stream must be reported as an error, not completed with them.
                const dchar result = decode(rawBuffer8_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);

                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer8_[idx .. len];
                rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case UTF._16:
                //Temp buffer for moving data in rawBuffer16_.
                wchar[bufferLength16_] temp;
                //Words to read.
                size_t readWords = min(available / 2, bufferLength16_ - rawUsed_);
                //Length of data in rawBuffer16_ after reading.
                size_t len = rawUsed_;
                //Read the data.
                while(readWords > 0)
                {
                    //Due to a bug in std.stream, we have to use getcw here.
                    rawBuffer16_[len] = stream_.getcw();
                    --readWords;
                    ++len;
                }

                //A single trailing byte can't form a code unit; without this
                //check there would be nothing to decode below.
                enforce(len > 0,
                        new ReaderException("Incomplete UTF-16 code unit at the end of the stream"));

                //After decoding, this will point to the first word not decoded.
                size_t idx = 0;
                //Decode only the valid part of the buffer (see the UTF-8 case).
                const dchar result = decode(rawBuffer16_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);

                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer16_[idx .. len];
                rawBuffer16_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case UTF._32:
                //UTF-32 needs no decoding, read the character directly.
                dchar result;
                stream_.read(result);
                return result;
            default: assert(false);
        }
    }

    //Remembered for the error message in case decoding fails.
    const oldPosition = stream_.position;
    try
    {
        foreach(i; 0 .. chars)
        {
            if(done){break;}
            const available = stream_.available;
            buffer_ ~= getDChar(available);
        }
    }
    catch(UtfException e)
    {
        const position = stream_.position;
        throw new ReaderException("Unicode decoding error between bytes " ~
                                  to!string(oldPosition) ~ " and " ~
                                  to!string(position) ~ " " ~ e.msg);
    }
    catch(ReadException e)
    {
        throw new ReaderException("Error reading from the stream: " ~ e.msg);
    }

    //Only the newly loaded characters need to be checked.
    enforce(printable(buffer_[oldLength .. $]),
            new ReaderException("Special unicode characters are not allowed"));
}
|
|
|
|
/**
 * Determine if all characters in an array are printable.
 *
 * The accepted characters are tab, line feed, carriage return, NEL (0x85)
 * and the ranges 0x20-0x7E, 0xA0-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
 *
 * Params:  chars = Characters to check.
 *
 * Returns: True if all the characters are printable, false otherwise.
 */
static pure bool printable(const ref dchar[] chars)
{
    foreach(c; chars)
    {
        const accepted = (c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
                         (c >= 0x20 && c <= 0x7E) ||
                         (c >= 0xA0 && c <= '\uD7FF') ||
                         (c >= '\uE000' && c <= '\uFFFD') ||
                         //Characters outside the basic multilingual plane.
                         (c >= 0x10000 && c <= 0x10FFFF);
        if(!accepted){return false;}
    }
    return true;
}
|
|
|
|
///Are we done reading? True when neither the stream nor the raw buffer holds more input.
@property bool done()
{
    //There are still bytes in the stream itself.
    if(stream_.available != 0){return false;}
    //UTF-32 is read directly from the stream, the raw buffers are not involved.
    if(utf_ == UTF._32){return true;}
    //UTF-8 and UTF-16 may still have undecoded data in the raw buffer.
    return rawUsed_ == 0;
}
|
|
|
|
unittest
{
    writeln("D:YAML reader endian unittest");

    //Build a Reader over the given bytes and check what encoding and
    //endianness were detected.
    void checkDetection(ubyte[] input, UTF expectedUtf, Endian expectedEndian)
    {
        auto reader = new Reader(new MemoryStream(input));
        assert(reader.utf_ == expectedUtf);
        assert(reader.stream_.endian == expectedEndian);
    }

    //UTF-16 BOM followed by one code unit, in each byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];
    checkDetection(utf16LE, UTF._16, Endian.LittleEndian);
    checkDetection(utf16BE, UTF._16, Endian.BigEndian);
}
|
|
unittest
{
    writeln("D:YAML reader peek/prefix/forward unittest");
    ubyte[] input = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new Reader(new MemoryStream(input));

    //Peeking does not move the position; index 4 is the '\0' added at the end of input.
    foreach(i, expected; "data\0"d)
    {
        assert(reader.peek(i) == expected);
    }

    //Asking for more than there is returns what is loaded, including the '\0'.
    assert(reader.prefix(4) == "data");
    assert(reader.prefix(6) == "data\0");

    //After skipping "da", "ta\0" is left, so index 3 is past the end.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    assert(collectException(reader.peek(3)));
}
|
|
unittest
{
    writeln("D:YAML reader UTF formats unittest");
    dchar[] text = cast(dchar[])"data";

    //Prepend the BOM to the raw bytes of the encoded text and check that
    //the Reader decodes the same four characters.
    void checkDecoding(T)(T[] encoded, BOM bom)
    {
        ubyte[] bytes = ByteOrderMarks[bom] ~
                        (cast(ubyte*)encoded.ptr)[0 .. encoded.length * T.sizeof];
        auto reader = new Reader(new MemoryStream(bytes));
        assert(reader.peek() == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    //The encoded arrays are in native byte order, so use the matching BOM.
    checkDecoding!char(to!(char[])(text), BOM.UTF8);
    checkDecoding!wchar(to!(wchar[])(text), endian == Endian.BigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    checkDecoding(text, endian == Endian.BigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}
|
|
}
|