531 lines
19 KiB
D
531 lines
19 KiB
D
|
|
// Copyright Ferdinand Majerech 2011.
|
|
// Distributed under the Boost Software License, Version 1.0.
|
|
// (See accompanying file LICENSE_1_0.txt or copy at
|
|
// http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
module dyaml.reader;
|
|
|
|
|
|
import core.stdc.string;
|
|
|
|
import std.algorithm;
|
|
import std.conv;
|
|
import std.exception;
|
|
import std.stdio;
|
|
import std.stream;
|
|
import std.string;
|
|
import std.system;
|
|
import std.utf;
|
|
|
|
import dyaml.fastcharsearch;
|
|
import dyaml.encoding;
|
|
import dyaml.exception;
|
|
|
|
|
|
package:
|
|
|
|
/**
 * Exception thrown when the Reader fails to read or decode its input.
 *
 * The message passed by the thrower is prefixed with "Error reading stream: ".
 */
class ReaderException : YAMLException
{
    /**
     * Construct a ReaderException.
     *
     * Params:  msg  = Description of the error.
     *          file = Source file the exception is thrown from.
     *          line = Source line the exception is thrown from.
     */
    this(string msg, string file = __FILE__, int line = __LINE__)
    {
        string fullMessage = "Error reading stream: " ~ msg;
        super(fullMessage, file, line);
    }
}
|
|
|
|
///Reads data from a stream and converts it to UTF-32 (dchar) data.
|
|
final class Reader
|
|
{
|
|
private:
|
|
///Stream the input is read from.
EndianStream stream_;

///Backing storage that buffer_ is sliced from.
dchar[] bufferAllocated_;
///Decoded characters that are currently loaded.
dchar[] buffer_;
///Read position inside buffer_; only characters from this index onwards can be read.
uint bufferOffset_;

///Index of the current character, counted from the start of the stream.
size_t charIndex_;
///Current line in the input.
uint line_;
///Current column in the input.
uint column_;

///Encoding the input stream was detected to use.
Encoding encoding_;
///Number of bytes in the stream that have not been read yet.
size_t available_;

///Capacity of the raw UTF-8 buffer, in bytes.
static immutable bufferLength8_ = 8;
///Capacity of the raw UTF-16 buffer, in 16-bit words.
static immutable bufferLength16_ = bufferLength8_ / 2;

//Only one of the raw buffers is used by a Reader (depending on encoding_),
//so they share their storage.
union
{
    ///Raw UTF-8 bytes waiting to be decoded.
    char[bufferLength8_ + 1] rawBuffer8_;
    ///Raw UTF-16 words waiting to be decoded.
    wchar[bufferLength16_ + 1] rawBuffer16_;
}
///Number of elements currently stored in the raw buffer in use.
uint rawUsed_;
|
|
|
|
public:
|
|
/**
 * Construct a Reader, detecting the encoding of the stream from its BOM.
 *
 * A stream with no BOM (or one too short to have a BOM) is read as UTF-8.
 *
 * Params:  stream = Input stream. Must be readable and seekable.
 *
 * Throws:  ReaderException if the byte count of the stream does not match
 *          its detected encoding (UTF-16 or UTF-32).
 */
this(Stream stream)
in
{
    assert(stream.readable && stream.seekable,
           "Can't read YAML from a stream that is not readable and seekable");
}
body
{
    stream_    = new EndianStream(stream);
    available_ = stream_.available;

    //A stream this short can't have a BOM; read it as UTF-8.
    if(available_ < 2)
    {
        encoding_ = Encoding.UTF_8;
        return;
    }

    //readBOM() also determines and sets the endianness of stream_.
    const bom = stream_.readBOM(2);
    switch(bom)
    {
        case 3, 4:
            enforce(available_ % 4 == 0,
                    new ReaderException("Byte count in an UTF-32 stream not divisible by 4"));
            encoding_ = Encoding.UTF_32;
            break;
        case 1, 2:
            //readBOM() consumed two bytes after the BOM; fetch them back as one word.
            encoding_ = Encoding.UTF_16;
            rawBuffer16_[0] = stream_.getcw();
            rawUsed_ = 1;
            enforce(available_ % 2 == 0,
                    new ReaderException("Odd byte count in an UTF-16 stream"));
            break;
        case -1:
            //No BOM found. readBOM() consumed two bytes; fetch them back
            //and store them as the first two raw UTF-8 bytes.
            const wchar word = stream_.getcw();
            rawBuffer8_[0] = cast(char)(word & 0xFF);
            rawBuffer8_[1] = cast(char)(word >> 8);
            rawUsed_ = 2;
            goto case 0;
        case 0:
            encoding_ = Encoding.UTF_8;
            break;
        default:
            assert(false, "Unknown UTF BOM");
    }
    //Bytes consumed during BOM detection are no longer available.
    available_ = stream_.available;

    //Initial storage for decoded characters; bufferReserve() grows it on demand.
    enum initialCapacity = 256;
    auto storage = cast(dchar*)core.stdc.stdlib.malloc(dchar.sizeof * initialCapacity);
    bufferAllocated_ = storage[0 .. initialCapacity];
}
|
|
|
|
///Destroy the Reader, releasing the manually allocated character buffer.
~this()
{
    core.stdc.stdlib.free(bufferAllocated_.ptr);
    //Drop both slices so nothing refers to the freed memory.
    bufferAllocated_ = null;
    buffer_ = null;
}
|
|
|
|
/**
 * Look at a character without moving the current position.
 *
 * Params:  index = Offset of the character from the current position
 *                  in the stream (0 is the current character).
 *
 * Returns: Character at the specified offset.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar peek(in size_t index = 0)
{
    //Load more data if the requested character (plus one) is not buffered yet.
    if(bufferOffset_ + index + 1 >= buffer_.length){updateBuffer(index + 1);}

    //updateBuffer() may reset bufferOffset_, so the position is computed after it.
    const position = bufferOffset_ + index;
    if(position >= buffer_.length)
    {
        throw new ReaderException("Trying to read past the end of the stream");
    }
    return buffer_[position];
}
|
|
|
|
/**
 * Get a view of the first length characters at the current position.
 *
 * Note: The result is only a "view" into the internal buffer,
 *       which WILL get invalidated after other Reader calls.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters starting at current position,
 *          or an empty slice if out of bounds.
 */
const(dstring) prefix(in size_t length)
{
    //A prefix is simply a slice starting at the current position.
    const(dstring) view = slice(0, length);
    return view;
}
|
|
|
|
/**
 * Get a view of a range of characters relative to the current position.
 *
 * Note: The result is only a "view" into the internal buffer,
 *       which WILL get invalidated after other Reader calls.
 *
 * Params:  start = Start of the slice relative to current position.
 *          end   = End of the slice relative to current position.
 *
 * Returns: Slice into the internal buffer (cut short at the end of the
 *          buffered data), or an empty slice if out of bounds.
 */
const(dstring) slice(size_t start, size_t end)
{
    //Load more data if the end of the slice is not buffered yet.
    if(bufferOffset_ + end >= buffer_.length){updateBuffer(end);}

    //Translate to absolute buffer indices, clamping the end to the loaded data.
    const first = bufferOffset_ + start;
    const last  = min(buffer_.length, bufferOffset_ + end);
    if(last <= first){return "";}

    return cast(dstring)buffer_[first .. last];
}
|
|
|
|
/**
 * Read the character at the current position and move past it.
 *
 * Returns: Next character.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dchar get()
{
    //Advance only once the character has been read successfully.
    scope(success){forward();}
    return peek();
}
|
|
|
|
/**
 * Read the specified number of characters and move the position past them.
 *
 * Unlike prefix(), the result is a copy and stays valid after other Reader calls.
 *
 * Params:  length = Number of characters to get.
 *
 * Returns: Characters starting at current position.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
dstring get(in size_t length)
{
    //Copy first; forward() may invalidate views into the buffer.
    auto copy = slice(0, length).dup;
    forward(length);
    return cast(dstring)copy;
}
|
|
|
|
/**
 * Move current position forward, keeping track of line and column.
 *
 * Params:  length = Number of characters to move position forward.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
void forward(size_t length = 1)
{
    //Characters that always end a line.
    mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

    //One character more than we skip is needed to tell "\r\n" from a lone '\r'.
    if(buffer_.length <= bufferOffset_ + length + 1)
    {
        updateBuffer(length + 1);
    }

    //Without this check the loop below would index buffer_ out of bounds
    //when asked to move past the end of the stream.
    if(buffer_.length < bufferOffset_ + length)
    {
        throw new ReaderException("Trying to read past the end of the stream");
    }

    while(length > 0)
    {
        const c = buffer_[bufferOffset_];
        ++bufferOffset_;
        ++charIndex_;
        //New line. In "\r\n", only the '\n' is counted as the line break.
        if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
        {
            ++line_;
            column_ = 0;
        }
        //A byte order mark does not take up a column.
        else if(c != '\uFEFF'){++column_;}
        --length;
    }
}
|
|
|
|
///Get a Mark of the current line and column, used for error messages.
@property Mark mark() const
{
    return Mark(line_, column_);
}
|
|
|
|
///Get the current line number.
@property uint line() const
{
    return line_;
}
|
|
|
|
///Get the current column number.
@property uint column() const
{
    return column_;
}
|
|
|
|
///Get the index of the current character, counted from the start of the stream.
@property size_t charIndex() const
{
    return charIndex_;
}
|
|
|
|
///Get the encoding detected for the input stream.
@property Encoding encoding() const
{
    return encoding_;
}
|
|
|
|
private:
|
|
/**
 * Make sure length characters after the buffer offset are loaded.
 *
 * Characters before the buffer offset are discarded. If the stream does not
 * contain enough characters, as many as possible are loaded and the buffer
 * is terminated with a '\0' character.
 *
 * Params:  length = Number of characters we need to read.
 *
 * Throws:  ReaderException if trying to read past the end of the stream
 *          or if invalid data is read.
 */
void updateBuffer(in size_t length)
{
    //Drop already consumed characters by moving the rest to the buffer start.
    if(bufferOffset_ > 0)
    {
        const remaining = buffer_.length - bufferOffset_;
        memmove(buffer_.ptr, buffer_.ptr + bufferOffset_, remaining * dchar.sizeof);
        buffer_ = buffer_[0 .. remaining];
        bufferOffset_ = 0;
    }

    //Load chars in batches of at most 1024 bytes (256 chars).
    while(buffer_.length <= bufferOffset_ + length)
    {
        loadChars(256);
        if(!done){continue;}

        //End of stream: terminate the buffer with '\0' unless it already is.
        const terminated = buffer_.length > 0 && buffer_[$ - 1] == '\0';
        if(!terminated)
        {
            const newLength = buffer_.length + 1;
            bufferReserve(newLength);
            buffer_ = bufferAllocated_[0 .. newLength];
            buffer_[$ - 1] = '\0';
        }
        break;
    }
}
|
|
|
|
/**
 * Load at most specified number of characters.
 *
 * Decoded characters are appended to buffer_. Loading stops early when the
 * end of the stream is reached.
 *
 * Params:  chars = Maximum number of characters to load.
 *
 * Throws:  ReaderException on Unicode decoding error,
 *          if nonprintable characters are detected, or
 *          if there is an error reading from the stream.
 */
void loadChars(size_t chars)
{
    ///Get next character from the stream.
    dchar getDChar()
    {
        final switch(encoding_)
        {
            case Encoding.UTF_8:
                //Temp buffer for moving data in rawBuffer8_.
                char[bufferLength8_] temp;
                //Shortcut for ASCII.
                if(rawUsed_ > 0 && rawBuffer8_[0] < 128)
                {
                    //Get the first byte (one char in ASCII).
                    const dchar result = rawBuffer8_[0];
                    --rawUsed_;
                    //Move the data one byte towards the start of the buffer
                    //(8 bytes at once; rawBuffer8_ has bufferLength8_ + 1 elements).
                    *(cast(ulong*)temp.ptr) = *(cast(ulong*)(rawBuffer8_.ptr + 1));
                    *(cast(ulong*)rawBuffer8_.ptr) = *(cast(ulong*)temp.ptr);
                    return result;
                }

                //Bytes to read.
                const readBytes = min(available_, bufferLength8_ - rawUsed_);
                available_ -= readBytes;
                //Length of data in rawBuffer8_ after reading.
                const len = rawUsed_ + readBytes;
                //Read the data.
                stream_.readExact(rawBuffer8_.ptr + rawUsed_, readBytes);

                //After decoding, this will point to the first byte not decoded.
                size_t idx = 0;
                //Decode only the bytes that were actually loaded. Passing the whole
                //buffer would let a sequence truncated at the end of the stream
                //be completed from stale bytes, and len - idx could underflow.
                const dchar result = decode(rawBuffer8_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);

                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer8_[idx .. len];
                rawBuffer8_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case Encoding.UTF_16:
                //Temp buffer for moving data in rawBuffer16_.
                wchar[bufferLength16_] temp;
                //Words to read.
                size_t readWords = min(available_ / 2, bufferLength16_ - rawUsed_);
                available_ -= readWords * 2;
                //Length of data in rawBuffer16_ after reading.
                size_t len = rawUsed_;
                //Read the data.
                while(readWords > 0)
                {
                    //Due to a bug in std.stream, we have to use getcw here.
                    rawBuffer16_[len] = stream_.getcw();
                    --readWords;
                    ++len;
                }

                //After decoding, this will point to the first word not decoded.
                size_t idx = 0;
                //Decode only the words that were actually loaded, so an unpaired
                //surrogate at the end of the stream is reported as an error
                //instead of being paired with stale data.
                const dchar result = decode(rawBuffer16_[0 .. len], idx);
                rawUsed_ = cast(uint)(len - idx);

                //Move the data.
                temp[0 .. rawUsed_] = rawBuffer16_[idx .. len];
                rawBuffer16_[0 .. rawUsed_] = temp[0 .. rawUsed_];
                return result;
            case Encoding.UTF_32:
                //UTF-32 needs no decoding; every character is one 4-byte unit.
                dchar result;
                available_ -= 4;
                stream_.read(result);
                return result;
        }
    }

    const oldLength = buffer_.length;
    const oldPosition = stream_.position;

    //Preallocating memory to limit reallocations.
    bufferReserve(buffer_.length + chars);
    buffer_ = bufferAllocated_[0 .. buffer_.length + chars];
    scope(exit)
    {
        //chars now holds the number of characters that were NOT loaded;
        //trim the buffer to what was really read and validate the new data.
        buffer_ = buffer_[0 .. $ - chars];
        enforce(printable(buffer_[oldLength .. $]),
                new ReaderException("Special unicode characters are not allowed"));
    }

    try for(uint c = 0; chars; --chars, ++c)
    {
        if(done){break;}
        buffer_[oldLength + c] = getDChar();
    }
    catch(UtfException e)
    {
        const position = stream_.position;
        //text() concatenates its arguments as-is; unlike format(), it never
        //interprets the exception message as a format string.
        throw new ReaderException(text("Unicode decoding error between bytes ",
                                       oldPosition, " and ", position, " : ", e.msg));
    }
    catch(ReadException e)
    {
        throw new ReaderException(e.msg);
    }
}
|
|
|
|
/**
 * Determine if all characters in an array are printable.
 *
 * Printable characters are tab, line feed, carriage return, next line (0x85)
 * and the ranges 0x20-0x7E, 0xA0-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF
 * (the printable character set of the YAML specification).
 *
 * Params:  chars = Characters to check.
 *
 * Returns: True if all the characters are printable, false otherwise.
 *          An empty array is considered printable.
 */
static bool printable(const ref dchar[] chars) pure
{
    foreach(c; chars)
    {
        if(!((c == 0x09 || c == 0x0A || c == 0x0D || c == 0x85) ||
             (c >= 0x20 && c <= 0x7E) ||
             (c >= 0xA0 && c <= '\uD7FF') ||
             (c >= '\uE000' && c <= '\uFFFD') ||
             //Supplementary planes are printable as well.
             (c >= '\U00010000' && c <= '\U0010FFFF')))
        {
            return false;
        }
    }
    return true;
}
|
|
|
|
///Are we done reading? True once the stream is exhausted and no raw data awaits decoding.
@property bool done() const
{
    //Bytes left in the stream; definitely not done.
    if(available_ != 0){return false;}
    //UTF-32 is read directly from the stream, without a raw buffer.
    if(encoding_ == Encoding.UTF_32){return true;}
    //UTF-8 and UTF-16 may still have undecoded data in their raw buffer.
    const usesRawBuffer = encoding_ == Encoding.UTF_8 || encoding_ == Encoding.UTF_16;
    return usesRawBuffer && rawUsed_ == 0;
}
|
|
|
|
/**
 * Ensure there is space for at least capacity characters in bufferAllocated_.
 *
 * Does nothing if bufferAllocated_ is already large enough. Otherwise the
 * storage is reallocated and buffer_ is updated to point into the new storage,
 * keeping its length.
 *
 * Params:  capacity = Required capacity, in characters.
 *
 * Throws:  ReaderException if the memory could not be allocated.
 */
void bufferReserve(in size_t capacity)
{
    if(bufferAllocated_.length >= capacity){return;}

    auto newPtr = core.stdc.stdlib.realloc(bufferAllocated_.ptr,
                                           capacity * dchar.sizeof);
    //On failure realloc returns null and leaves the old block untouched;
    //fail before bufferAllocated_ is overwritten so the old block is neither
    //lost nor replaced by a slice over a null pointer.
    enforce(newPtr !is null,
            new ReaderException("Out of memory while growing the character buffer"));
    bufferAllocated_ = (cast(dchar*)newPtr)[0 .. capacity];
    //The storage might have moved; re-slice the loaded characters.
    buffer_ = bufferAllocated_[0 .. buffer_.length];
}
|
|
|
|
//Test that the encoding and endianness of UTF-16 streams are detected from the BOM.
unittest
{
    writeln("D:YAML reader endian unittest");

    //Construct a Reader from raw bytes and check what it detected.
    void checkDetection(ubyte[] bytes, Encoding expectedEncoding, Endian expectedEndian)
    {
        auto reader = new Reader(new MemoryStream(bytes));
        assert(reader.encoding_ == expectedEncoding);
        assert(reader.stream_.endian == expectedEndian);
    }

    //BOM followed by the character 'z' (0x7A) in the matching byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];
    checkDetection(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    checkDetection(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
|
|
//Test peek(), prefix() and forward() on a short UTF-8 stream.
unittest
{
    writeln("D:YAML reader peek/prefix/forward unittest");
    ubyte[] bytes = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto reader = new Reader(new MemoryStream(bytes));

    //Every character can be peeked; the stream is terminated by '\0'.
    assert(reader.peek()  == 'd');
    assert(reader.peek(1) == 'a');
    assert(reader.peek(2) == 't');
    assert(reader.peek(3) == 'a');
    assert(reader.peek(4) == '\0');

    //A prefix longer than the data is cut at the end of the data.
    assert(reader.prefix(4) == "data");
    assert(reader.prefix(6) == "data\0");

    //After moving forward, offsets are relative to the new position
    //and peeking past the end throws.
    reader.forward(2);
    assert(reader.peek(1) == 'a');
    assert(collectException(reader.peek(3)));
}
|
|
//Test that the same text is read correctly from UTF-8, UTF-16 and UTF-32 streams.
unittest
{
    writeln("D:YAML reader UTF formats unittest");
    dchar[] data = cast(dchar[])"data";

    //Prepend a BOM to the raw bytes of text and check the characters read back.
    void checkEncoding(T)(T[] text, BOM bom)
    {
        ubyte[] rawText = (cast(ubyte*)text.ptr)[0 .. text.length * T.sizeof];
        ubyte[] bytes = ByteOrderMarks[bom] ~ rawText;
        auto reader = new Reader(new MemoryStream(bytes));
        assert(reader.peek()  == 'd');
        assert(reader.peek(1) == 'a');
        assert(reader.peek(2) == 't');
        assert(reader.peek(3) == 'a');
    }

    //The in-memory text is in native byte order, so use the matching BOMs.
    const bigEndian = endian == Endian.bigEndian;
    checkEncoding!char(to!(char[])(data), BOM.UTF8);
    checkEncoding!wchar(to!(wchar[])(data), bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    checkEncoding(data, bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}
|
|
}
|