Reader now reads from a MemoryStream and uses tinyendian to fix endianness issues.

This commit is contained in:
Ferdinand Majerech 2014-07-22 01:09:27 +02:00
parent e9a18c62b0
commit 72afe53ab8

View file

@ -65,7 +65,7 @@ final class Reader
{ {
private: private:
// Input stream. // Input stream.
EndianStream stream_; MemoryStream memStream_;
// Allocated space for buffer_. // Allocated space for buffer_.
dchar[] bufferAllocated_ = null; dchar[] bufferAllocated_ = null;
// Buffer of currently loaded characters. // Buffer of currently loaded characters.
@ -81,6 +81,12 @@ final class Reader
// Decoder reading data from file and decoding it to UTF-32. // Decoder reading data from file and decoding it to UTF-32.
UTFFastDecoder decoder_; UTFFastDecoder decoder_;
version(unittest)
{
// Endianness of the input before it was converted (for testing)
Endian endian_;
}
public: public:
/// Construct a Reader. /// Construct a Reader.
/// ///
@ -88,15 +94,18 @@ final class Reader
/// ///
/// Throws: ReaderException if the stream is invalid. /// Throws: ReaderException if the stream is invalid.
this(Stream stream) @trusted //!nothrow this(Stream stream) @trusted //!nothrow
in
{ {
assert(stream.readable && stream.seekable, auto streamBytes = streamToBytesGC(stream);
"Can't read YAML from a stream that is not readable and seekable"); auto result = fixUTFByteOrder(streamBytes);
if(result.bytesStripped > 0)
{
throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned "
"to 2 or 4 bytes, respectively");
} }
body
{ version(unittest) { endian_ = result.endian; }
stream_ = new EndianStream(stream); memStream_ = new MemoryStream(result.array);
decoder_ = UTFFastDecoder(stream_); decoder_ = UTFFastDecoder(memStream_, result.encoding);
} }
@trusted nothrow @nogc ~this() @trusted nothrow @nogc ~this()
@ -294,7 +303,7 @@ final class Reader
void loadChars(size_t chars) @system void loadChars(size_t chars) @system
{ {
const oldLength = buffer_.length; const oldLength = buffer_.length;
const oldPosition = stream_.position; const oldPosition = memStream_.position;
bufferReserve(buffer_.length + chars); bufferReserve(buffer_.length + chars);
buffer_ = bufferAllocated_[0 .. buffer_.length + chars]; buffer_ = bufferAllocated_[0 .. buffer_.length + chars];
@ -324,7 +333,7 @@ final class Reader
try{throw e;} try{throw e;}
catch(UTFException e) catch(UTFException e)
{ {
const position = stream_.position; const position = memStream_.position;
throw new ReaderException(format("Unicode decoding error between bytes %s and %s : %s", throw new ReaderException(format("Unicode decoding error between bytes %s and %s : %s",
oldPosition, position, e.msg)); oldPosition, position, e.msg));
} }
@ -398,7 +407,7 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
// Bytes available in the stream. // Bytes available in the stream.
size_t available_; size_t available_;
// Input stream. // Input stream.
EndianStream stream_; MemoryStream stream_;
// Buffer used to store raw UTF-8 or UTF-16 code points. // Buffer used to store raw UTF-8 or UTF-16 code points.
union union
@ -416,59 +425,17 @@ struct UTFBlockDecoder(size_t bufferSize_) if (bufferSize_ % 2 == 0)
public: public:
/// Construct a UTFBlockDecoder decoding a stream. /// Construct a UTFBlockDecoder decoding a stream.
this(EndianStream stream) @trusted //!nothrow this(MemoryStream stream, UTFEncoding encoding) @trusted
{ {
stream_ = stream; stream_ = stream;
available_ = stream_.available; available_ = stream_.available;
encoding_ = encoding;
//Handle files short enough not to have a BOM. final switch(encoding_)
if(available_ < 2)
{ {
encoding_ = Encoding.UTF_8; case UTFEncoding.UTF_8: maxChars_ = available_; break;
maxChars_ = 0; case UTFEncoding.UTF_16: maxChars_ = available_ / 2; break;
// UTF-32 code units are 4 bytes wide, so at most available_ / 4 characters fit.
case UTFEncoding.UTF_32: maxChars_ = available_ / 4; break;
if(available_ == 1)
{
bufferSpace_[0] = stream_.getc();
buffer_ = bufferSpace_[0 .. 1];
maxChars_ = 1;
} }
return;
}
char[] rawBuffer8;
wchar[] rawBuffer16;
//readBOM will determine and set stream endianness.
switch(stream_.readBOM(2))
{
case -1:
//readBOM() eats two more bytes in this case so get them back.
const wchar bytes = stream_.getcw();
rawBuffer8_[0 .. 2] = [cast(ubyte)(bytes % 256), cast(ubyte)(bytes / 256)];
rawUsed_ = 2;
goto case 0;
case 0:
maxChars_ = available_;
encoding_ = Encoding.UTF_8;
break;
case 1, 2:
maxChars_ = available_ / 2;
//readBOM() eats two more bytes in this case so get them back.
encoding_ = Encoding.UTF_16;
rawBuffer16_[0] = stream_.getcw();
rawUsed_ = 1;
enforce(available_ % 2 == 0,
new ReaderException("Odd byte count in an UTF-16 stream"));
break;
case 3, 4:
maxChars_ = available_ / 4;
encoding_ = Encoding.UTF_32;
enforce(available_ % 4 == 0,
new ReaderException("Byte count in an UTF-32 stream not divisible by 4"));
break;
default: assert(false, "Unknown UTF BOM");
}
available_ = stream_.available;
} }
/// Get maximum number of characters that might be in the stream. /// Get maximum number of characters that might be in the stream.
@ -658,7 +625,7 @@ void testEndian(R)()
{ {
auto reader = new R(new MemoryStream(data)); auto reader = new R(new MemoryStream(data));
assert(reader.encoding == encoding_expected); assert(reader.encoding == encoding_expected);
assert(reader.stream_.endian == endian_expected); assert(reader.endian_ == endian_expected);
} }
ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00]; ubyte[] little_endian_utf_16 = [0xFF, 0xFE, 0x7A, 0x00];
ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A]; ubyte[] big_endian_utf_16 = [0xFE, 0xFF, 0x00, 0x7A];