2011-08-16 12:53:13 +00:00
|
|
|
|
2014-07-21 22:21:42 +00:00
|
|
|
// Copyright Ferdinand Majerech 2011-2014.
|
2011-08-16 12:53:13 +00:00
|
|
|
// Distributed under the Boost Software License, Version 1.0.
|
|
|
|
// (See accompanying file LICENSE_1_0.txt or copy at
|
|
|
|
// http://www.boost.org/LICENSE_1_0.txt)
|
|
|
|
|
|
|
|
module dyaml.reader;
|
|
|
|
|
|
|
|
|
2011-11-16 02:10:29 +00:00
|
|
|
import core.stdc.stdlib;
|
2011-08-16 12:53:13 +00:00
|
|
|
import core.stdc.string;
|
2011-11-16 02:10:29 +00:00
|
|
|
import core.thread;
|
2011-08-16 12:53:13 +00:00
|
|
|
|
|
|
|
import std.algorithm;
|
2014-07-22 22:09:40 +00:00
|
|
|
import std.array;
|
2011-08-16 12:53:13 +00:00
|
|
|
import std.conv;
|
|
|
|
import std.exception;
|
|
|
|
import std.stdio;
|
|
|
|
import std.string;
|
|
|
|
import std.system;
|
|
|
|
import std.utf;
|
|
|
|
|
2014-07-21 22:23:15 +00:00
|
|
|
import tinyendian;
|
|
|
|
|
2011-10-24 18:36:26 +00:00
|
|
|
import dyaml.fastcharsearch;
|
2011-10-11 13:58:23 +00:00
|
|
|
import dyaml.encoding;
|
2011-08-16 12:53:13 +00:00
|
|
|
import dyaml.exception;
|
2014-07-21 23:04:44 +00:00
|
|
|
import dyaml.streamcompat;
|
|
|
|
|
2011-08-16 12:53:13 +00:00
|
|
|
|
|
|
|
|
|
|
|
package:
|
|
|
|
|
2014-07-21 22:33:17 +00:00
|
|
|
//XXX VIM STUFF:
|
|
|
|
//XXX THE f/t COLORING PLUGIN, AND TRY TO REMOVE THE f/t AUTOREPEAT PLUGIN
|
|
|
|
//     (AND MAYBE DO THE REPEAT WITH ALT-T/ALT-F)
|
|
|
|
//XXX DDOC snippets such as $D, $BIGOH, anything else
|
|
|
|
// OR MAYBE JUST $ - EXPANDING TO $(${1} ${2})
|
|
|
|
// WHERE DEFAULT ${1} IS 'D' AND SPECIAL SNIPPETS FOR SPECIFIC DDOC MACROS
|
|
|
|
// (E.G. XREF HAS 2 ARGS)
|
|
|
|
// XXX DON'T FORGET TO COMMIT DSNIPS CHANGES
|
|
|
|
// XXX SNIPPETS: WHY CAN'T WE USE NEW IN NEW? FIX!
|
|
|
|
// XXX ALSO WRITELN VISUAL! (print whatever we have selected)
|
|
|
|
// XXX AND ``fun`` VISUAL TOO!
|
|
|
|
// XXX snippet to print variable along its name AND
|
|
|
|
// OR MULTIPLE VARS - USE std.format!
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2011-08-16 12:53:13 +00:00
|
|
|
/// Exception type raised for errors detected by the Reader.
class ReaderException : YAMLException
{
    /// Construct a ReaderException.
    ///
    /// Params:  msg  = Description of the error; prefixed with "Reader error: ".
    ///          file = Source file the exception is reported from.
    ///          line = Source line the exception is reported from.
    this(string msg, string file = __FILE__, int line = __LINE__) @safe pure nothrow
    {
        immutable fullMessage = "Reader error: " ~ msg;
        super(fullMessage, file, line);
    }
}
|
|
|
|
|
2014-07-22 00:40:14 +00:00
|
|
|
/// Gives character-level access to a decoded (UTF-32) copy of the input.
///
/// The constructor decodes the input into an internal buffer. The buffer is not
/// guaranteed to be zero-terminated; instead every read at or past the end of the
/// buffer yields '\0', which callers can use as the end-of-input marker.
final class Reader
{
    private:
        // Buffer of decoded characters.
        dchar[] buffer_ = null;
        // Current position within buffer. Only data after this position can be read.
        uint bufferOffset_ = 0;
        // Index of the current character in the buffer.
        size_t charIndex_ = 0;
        // Current line in file.
        uint line_;
        // Current column in file.
        uint column_;
        // Original Unicode encoding of the data.
        Encoding encoding_;

        version(unittest)
        {
            // Endianness of the input before it was converted (for testing)
            Endian endian_;
        }

    public:
        import std.stream;

        /// Construct a Reader.
        ///
        /// Params:  stream = Input stream. Must be readable and seekable.
        ///
        /// Throws:  ReaderException if the size of UTF-16/UTF-32 input is not aligned
        ///          to its code unit size, on a UTF decoding error or if there are
        ///          nonprintable unicode characters illegal in YAML.
        this(Stream stream) @trusted //!nothrow
        {
            auto streamBytes  = streamToBytesGC(stream);
            auto endianResult = fixUTFByteOrder(streamBytes);
            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned " ~
                                          "to 2 or 4 bytes, respectively");
            }

            version(unittest) { endian_ = endianResult.endian; }
            encoding_ = endianResult.encoding;

            auto decodeResult = decodeUTF(endianResult.array, endianResult.encoding);

            const msg = decodeResult.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("UTF decoding error: " ~ msg);
            }

            buffer_ = decodeResult.decoded;
            // The part of buffer_ excluding trailing zeroes; trailing zeroes are
            // exempt from the printable check below.
            auto noZeros = buffer_;
            while(!noZeros.empty && noZeros.back == '\0') { noZeros.popBack(); }
            enforce(printable(noZeros[]),
                    new ReaderException("Special unicode characters are not allowed"));
        }

        /// Get character at specified index relative to current position.
        ///
        /// Params:  index = Index of the character to get relative to current position
        ///                  in the buffer.
        ///
        /// Returns: Character at specified position, or '\0' if the position is at or
        ///          past the end of the buffer.
        dchar peek(size_t index = 0) @safe pure nothrow const @nogc
        {
            if(buffer_.length <= bufferOffset_ + index)
            {
                // XXX This is risky; revert this and the 'risky' change in UTF decoder
                // if any bugs are introduced. We rely on the assumption that Reader
                // only uses peek() to detect the end of buffer. The test suite passes.
                // throw new ReaderException("Trying to read past the end of the buffer");
                return '\0';
            }

            return buffer_[bufferOffset_ + index];
        }

        /// Get specified number of characters starting at current position.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which WILL get invalidated after other Reader calls.
        ///
        /// Params:  length = Number of characters to get. May reach past the end of the
        ///                   buffer; in that case the returned slice will be shorter.
        ///
        /// Returns: Characters starting at current position or an empty slice if out of bounds.
        const(dstring) prefix(size_t length) @safe pure nothrow const @nogc
        {
            return slice(0, length);
        }

        /// Get a slice view of the internal buffer.
        ///
        /// Note: This gets only a "view" into the internal buffer,
        ///       which WILL get invalidated after other Reader calls.
        ///
        /// Params:  start = Start of the slice relative to current position.
        ///          end   = End of the slice relative to current position. May reach
        ///                  past the end of the buffer; in that case the returned
        ///                  slice will be shorter.
        ///
        /// Returns: Slice into the internal buffer or an empty slice if out of bounds.
        const(dstring) slice(size_t start, size_t end) @trusted pure nothrow const @nogc
        {
            end   += bufferOffset_;
            start += bufferOffset_;
            // Clamp the end so the slice never reaches past the buffer.
            end    = min(buffer_.length, end);

            return end > start ? cast(dstring)buffer_[start .. end] : "";
        }

        /// Get the next character, moving buffer position beyond it.
        ///
        /// Returns: Next character, or '\0' if the current position is at or past the
        ///          end of the buffer.
        dchar get() @safe pure nothrow @nogc
        {
            const result = peek();
            forward();
            return result;
        }

        /// Get specified number of characters, moving buffer position beyond them.
        ///
        /// Params:  length = Number of characters to get.
        ///
        /// Returns: Characters starting at current position. Shorter than length if
        ///          the request reaches past the end of the buffer.
        dstring get(size_t length) @safe pure nothrow @nogc
        {
            auto result = prefix(length);
            forward(length);
            return result;
        }

        /// Move current position forward.
        ///
        /// Positions at or past the end of the buffer are treated as containing
        /// '\0' (same as peek()), so moving past the end never indexes out of bounds.
        ///
        /// Params:  length = Number of characters to move position forward.
        void forward(size_t length = 1) @safe pure nothrow @nogc
        {
            mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;

            for(; length > 0; --length)
            {
                // Read through peek(): the buffer has no guaranteed terminator, so
                // indexing buffer_ directly would go out of bounds at the end.
                const c = peek();
                ++bufferOffset_;
                ++charIndex_;
                // New line. A '\r' directly followed by '\n' is not counted here;
                // the line is counted when the '\n' itself is passed.
                if(search.canFind(c) || (c == '\r' && peek() != '\n'))
                {
                    ++line_;
                    column_ = 0;
                }
                // A byte order mark does not advance the column.
                else if(c != '\uFEFF') { ++column_; }
            }
        }

        /// Get a Mark describing current buffer position, used for error messages.
        final Mark mark() @safe pure nothrow const @nogc { return Mark(line_, column_); }

        /// Get current line number.
        final uint line() @safe pure nothrow const @nogc { return line_; }

        /// Get current column number.
        final uint column() @safe pure nothrow const @nogc { return column_; }

        /// Get index of the current character in the buffer.
        final size_t charIndex() @safe pure nothrow const @nogc { return charIndex_; }

        /// Get encoding of the input buffer.
        final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
2014-07-22 23:36:09 +00:00
|
|
|
// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
//
// Params:
//
// input    = The UTF-8/16/32 buffer to decode.
// encoding = Encoding of input.
//
// Returns:
//
// A struct with the following members:
//
// $(D string errorMessage) In case of a decoding error, the error message is stored
//                          here. If there was no error, errorMessage is NULL. Always
//                          check this first before using the other members.
// $(D dchar[] decoded)     Buffer with decoded UTF-32 characters. For UTF-8/16 input
//                          this is GC-allocated; for UTF-32 input it is the input
//                          buffer itself reinterpreted as dchar[].
// $(D size_t maxChars)     Upper bound on the number of decoded characters (number
//                          of code units in input). XXX reserved for future
auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
{
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;

        dchar[] decoded;

        // Number of code units in the input; no encoding can decode to more
        // characters than it has code units.
        size_t maxChars;
    }

    Result result;

    // One code unit is 1, 2 or 4 bytes for UTF-8, UTF-16 and UTF-32 respectively.
    final switch(encoding)
    {
        case UTFEncoding.UTF_8:  result.maxChars = input.length;     break;
        case UTFEncoding.UTF_16: result.maxChars = input.length / 2; break;
        case UTFEncoding.UTF_32: result.maxChars = input.length / 4; break;
    }

    // Decode input if it's encoded as UTF-8 or UTF-16.
    //
    // Params:
    //
    // input  = The input buffer to decode.
    // result = A Result struct to put decoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
    {
        // End of part of input that contains complete characters that can be decoded.
        const size_t end = endOfLastUTFSequence(input);
        // If end is short of the input length, the input ends with an incomplete
        // UTF sequence (if end is 0, there are no full chars at all).
        if(end < input.length)
        {
            result.errorMessage = "Invalid UTF character at the end of input";
            return;
        }

        const srclength = input.length;
        try for(size_t srcpos = 0; srcpos < srclength;)
        {
            const c = input[srcpos];
            // Fast path: a code unit below 0x80 is a complete character by itself.
            if(c < 0x80)
            {
                result.decoded ~= c;
                ++srcpos;
            }
            else
            {
                // std.utf.decode advances srcpos past the decoded sequence.
                result.decoded ~= std.utf.decode(input, srcpos);
            }
        }
        catch(UTFException e)
        {
            result.errorMessage = e.msg;
            return;
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in decode(): " ~ e.msg);
        }
    }

    final switch(encoding)
    {
        case UTFEncoding.UTF_8: decode(cast(char[])input, result); break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            decode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0,
                   "UTF-32 buffer size must be a multiple of 4");
            // No need to decode anything
            result.decoded = cast(dchar[])input;
            break;
    }

    if(result.errorMessage !is null) { return result; }

    // XXX This is risky. We rely on the assumption that the scanner only uses
    // peek() to detect the end of the buffer. Should this cause any bugs,
    // revert.
    //
    // The buffer must be zero terminated for scanner to detect its end.
    // if(result.decoded.empty || result.decoded.back() != '\0')
    // {
    //     result.decoded ~= cast(dchar)'\0';
    // }

    return result;
}
|
|
|
|
|
2014-07-22 23:36:09 +00:00
|
|
|
|
2014-07-19 13:38:09 +00:00
|
|
|
/// Check whether an array consists solely of printable characters.
///
/// A character counts as printable if it is a tab, line feed, carriage return or
/// NEL (0x85), or lies in one of the ranges 0x20-0x7E, 0xA0-0xD7FF, 0xE000-0xFFFD.
///
/// Params:  chars = Characters to check.
///
/// Returns: True if all the characters are printable (also for an empty array),
///          false otherwise.
bool printable(const dchar[] chars) @safe pure nothrow @nogc
{
    foreach(const dchar ch; chars)
    {
        const bool allowedControl = ch == 0x09 || ch == 0x0A || ch == 0x0D || ch == 0x85;
        const bool asciiRange     = ch >= 0x20 && ch <= 0x7E;
        const bool lowerRange     = ch >= 0xA0 && ch <= '\uD7FF';
        const bool upperRange     = ch >= '\uE000' && ch <= '\uFFFD';

        if(allowedControl || asciiRange || lowerRange || upperRange) { continue; }

        // First character outside every accepted set decides the result.
        return false;
    }
    return true;
}
|
|
|
|
|
2014-07-22 23:36:09 +00:00
|
|
|
// Find where the last complete UTF-8 or UTF-16 sequence in a raw buffer ends.
//
// Params:  buffer = Code units to examine (char for UTF-8, wchar for UTF-16).
//
// Returns: Index one past the last complete sequence. This is buffer.length unless
//          the buffer ends with a truncated sequence. Validity of the sequences
//          themselves is not checked here.
size_t endOfLastUTFSequence(C)(const C[] buffer)
    @safe pure nothrow @nogc
{
    // UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
    static immutable ubyte[256] utf8Stride =
    [
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
        4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
    ];

    static if(is(C == char))
    {
        // Walk backwards to the last code unit that can start a sequence.
        for(size_t idx = buffer.length; idx-- > 0;)
        {
            const seqLength = utf8Stride[buffer[idx]];
            if(seqLength == 0xFF) { continue; }

            // Code units available from the lead unit to the end of the buffer.
            const available = buffer.length - idx;
            // A sequence needing more units than available is truncated, so the
            // complete data ends where it starts. Otherwise the last sequence
            // ends at buffer.length (invalid code points are caught at decoding).
            return seqLength > available ? idx : buffer.length;
        }
        // No code unit in the buffer can start a sequence.
        return 0;
    }
    else static if(is(C == wchar))
    {
        // TODO this is O(N), which is slow. Find out if we can somehow go
        // from the end backwards with UTF-16.
        size_t pos = 0;
        for(size_t step; pos < buffer.length; pos += step)
        {
            step = stride(buffer, pos);
            // Stop before a sequence that would run past the end of the buffer.
            if(pos + step > buffer.length) { break; }
        }
        return pos;
    }
}
|
|
|
|
|
2014-07-22 01:33:50 +00:00
|
|
|
// Unittests.
|
2011-11-16 02:10:29 +00:00
|
|
|
|
2014-07-22 00:40:44 +00:00
|
|
|
import std.stream;
|
2011-11-16 02:10:29 +00:00
|
|
|
// Check that a reader of type R detects the encoding and byte order of
// BOM-prefixed UTF-16 input.
void testEndian(R)()
{
    writeln(typeid(R).toString() ~ ": endian unittest");

    // Build a reader over raw bytes and compare what it detected with expectations.
    void check(ubyte[] bytes, Encoding wantedEncoding, Endian wantedEndian)
    {
        auto r = new R(new MemoryStream(bytes));
        assert(r.encoding == wantedEncoding);
        assert(r.endian_ == wantedEndian);
    }

    // BOM followed by 'z' (0x7A) in the respective byte order.
    ubyte[] utf16LE = [0xFF, 0xFE, 0x7A, 0x00];
    ubyte[] utf16BE = [0xFE, 0xFF, 0x00, 0x7A];

    check(utf16LE, Encoding.UTF_16, Endian.littleEndian);
    check(utf16BE, Encoding.UTF_16, Endian.bigEndian);
}
|
|
|
|
|
|
|
|
// Exercise peek(), prefix() and forward() of a reader of type R on the
// BOM-prefixed UTF-8 input "data".
void testPeekPrefixForward(R)()
{
    writeln(typeid(R).toString() ~ ": peek/prefix/forward unittest");

    ubyte[] bytes = ByteOrderMarks[BOM.UTF8] ~ cast(ubyte[])"data";
    auto r = new R(new MemoryStream(bytes));

    // Every character of the input, followed by '\0' one past the end.
    foreach(i, expected; "data\0"d)
    {
        assert(r.peek(i) == expected);
    }
    assert(r.prefix(4) == "data");
    // assert(reader.prefix(6) == "data\0");

    r.forward(2);
    assert(r.peek(1) == 'a');
    // assert(collectException(reader.peek(3)));
}
|
|
|
|
|
|
|
|
// Check that a reader of type R decodes the text "data" from UTF-8, UTF-16 and
// UTF-32 input (the latter two in the byte order of the machine running the test).
void testUTF(R)()
{
    writeln(typeid(R).toString() ~ ": UTF formats unittest");
    dchar[] data = cast(dchar[])"data";

    // Prefix the raw code units with the given BOM and verify the reader's output.
    void utf_test(T)(T[] data, BOM bom)
    {
        ubyte[] raw   = (cast(ubyte*)data.ptr)[0 .. data.length * T.sizeof];
        ubyte[] bytes = ByteOrderMarks[bom] ~ raw;
        auto r = new R(new MemoryStream(bytes));

        foreach(i, expected; "data"d)
        {
            assert(r.peek(i) == expected);
        }
    }

    const bool bigEndian = endian == Endian.bigEndian;

    utf_test!char(to!(char[])(data), BOM.UTF8);
    utf_test!wchar(to!(wchar[])(data), bigEndian ? BOM.UTF16BE : BOM.UTF16LE);
    utf_test(data, bigEndian ? BOM.UTF32BE : BOM.UTF32LE);
}
|
|
|
|
|
2014-07-21 07:53:58 +00:00
|
|
|
// Check that a reader of type R handles input consisting of a single byte.
void test1Byte(R)()
{
    writeln(typeid(R).toString() ~ ": 1 byte file unittest");

    // A lone 'a' (97) with no BOM.
    ubyte[] singleByte = [97];
    auto r = new R(new MemoryStream(singleByte));

    assert(r.peek() == 'a');
    // One past the only character reads as '\0'.
    assert(r.peek(1) == '\0');
    // assert(collectException(reader.peek(2)));
}
|
|
|
|
|
2011-11-16 02:10:29 +00:00
|
|
|
// Run every Reader test helper against the Reader class.
unittest
{
    alias Tested = Reader;

    testEndian!Tested();
    testPeekPrefixForward!Tested();
    testUTF!Tested();
    test1Byte!Tested();
}
|