remove dyaml.unused and rest of dyaml.nogcutil in favour of std.utf functions
This commit is contained in:
parent
d7f51a8225
commit
1dac1f39f3
|
@ -24,7 +24,6 @@ dyaml_src = [
|
||||||
'source/dyaml/linebreak.d',
|
'source/dyaml/linebreak.d',
|
||||||
'source/dyaml/loader.d',
|
'source/dyaml/loader.d',
|
||||||
'source/dyaml/node.d',
|
'source/dyaml/node.d',
|
||||||
'source/dyaml/nogcutil.d',
|
|
||||||
'source/dyaml/package.d',
|
'source/dyaml/package.d',
|
||||||
'source/dyaml/parser.d',
|
'source/dyaml/parser.d',
|
||||||
'source/dyaml/queue.d',
|
'source/dyaml/queue.d',
|
||||||
|
@ -46,8 +45,7 @@ dyaml_src = [
|
||||||
'source/dyaml/test/representer.d',
|
'source/dyaml/test/representer.d',
|
||||||
'source/dyaml/test/resolver.d',
|
'source/dyaml/test/resolver.d',
|
||||||
'source/dyaml/test/tokens.d',
|
'source/dyaml/test/tokens.d',
|
||||||
'source/dyaml/token.d',
|
'source/dyaml/token.d'
|
||||||
'source/dyaml/unused.d'
|
|
||||||
]
|
]
|
||||||
install_subdir('source/dyaml', install_dir: 'include/d/yaml/')
|
install_subdir('source/dyaml', install_dir: 'include/d/yaml/')
|
||||||
|
|
||||||
|
|
|
@ -1,242 +0,0 @@
|
||||||
// Copyright Ferdinand Majerech 2014, Digital Mars 2000-2012, Andrei Alexandrescu 2008- and Jonathan M Davis 2011-.
|
|
||||||
// Distributed under the Boost Software License, Version 1.0.
|
|
||||||
// (See accompanying file LICENSE_1_0.txt or copy at
|
|
||||||
// http://www.boost.org/LICENSE_1_0.txt)
|
|
||||||
|
|
||||||
|
|
||||||
/// @nogc versions of or alternatives to Phobos functions that are not yet @nogc and
|
|
||||||
/// wrappers to simplify their use.
|
|
||||||
module dyaml.nogcutil;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import std.traits;
|
|
||||||
import std.typecons;
|
|
||||||
import std.typetuple;
|
|
||||||
import std.range;
|
|
||||||
|
|
||||||
|
|
||||||
/// Result of a validateUTF8NoGC call.
|
|
||||||
struct ValidateResult
|
|
||||||
{
|
|
||||||
/// Is the validated string valid?
|
|
||||||
bool valid;
|
|
||||||
/// Number of characters in the string.
|
|
||||||
///
|
|
||||||
/// If the string is not valid, this is the number of valid characters before
|
|
||||||
/// hitting the first invalid sequence.
|
|
||||||
size_t characterCount;
|
|
||||||
/// If the string is not valid, error message with details is here.
|
|
||||||
string msg;
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Validate a UTF-8 string, checking if it is well-formed Unicode.
|
|
||||||
///
|
|
||||||
/// See_Also: ValidateResult
|
|
||||||
ValidateResult validateUTF8NoGC(const(char[]) str) @safe pure nothrow @nogc
|
|
||||||
{
|
|
||||||
immutable len = str.length;
|
|
||||||
size_t characterCount;
|
|
||||||
outer: for (size_t index = 0; index < len; )
|
|
||||||
{
|
|
||||||
if(str[index] < 0x80)
|
|
||||||
{
|
|
||||||
++index;
|
|
||||||
++characterCount;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto decoded = decodeUTF8NoGC!(No.validated)(str, index);
|
|
||||||
if(decoded.errorMessage !is null)
|
|
||||||
{
|
|
||||||
return ValidateResult(false, characterCount, decoded.errorMessage);
|
|
||||||
}
|
|
||||||
++characterCount;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ValidateResult(true, characterCount);
|
|
||||||
}
|
|
||||||
|
|
||||||
/// @nogc version of std.utf.decode() for char[].
|
|
||||||
///
|
|
||||||
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
|
|
||||||
/// force code using this function to be efficient.
|
|
||||||
///
|
|
||||||
/// Params:
|
|
||||||
///
|
|
||||||
/// validated = If ture, assume str is a valid UTF-8 string and don't generate any
|
|
||||||
/// error-checking code. If validated is true, str $(B must) be a valid
|
|
||||||
/// character, otherwise undefined behavior will occur. Also affects the
|
|
||||||
/// return type.
|
|
||||||
/// str = Will decode the first code point from this string.
|
|
||||||
/// index = Index in str where the code point starts. Will be updated to point to
|
|
||||||
/// the next code point.
|
|
||||||
///
|
|
||||||
/// Returns: If validated is true, the decoded character.
|
|
||||||
/// Otherwise a struct with a 'decoded' member - the decoded character, and a
|
|
||||||
/// 'string errorMessage' member that is null on success and otherwise stores
|
|
||||||
/// the error message.
|
|
||||||
auto decodeUTF8NoGC(Flag!"validated" validated)(const(char[]) str, ref size_t index)
|
|
||||||
{
|
|
||||||
static if(!validated) struct Result
|
|
||||||
{
|
|
||||||
dchar decoded;
|
|
||||||
string errorMessage;
|
|
||||||
}
|
|
||||||
else alias Result = dchar;
|
|
||||||
|
|
||||||
/// Dchar bitmask for different numbers of UTF-8 code units.
|
|
||||||
enum bitMask = tuple((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
|
|
||||||
|
|
||||||
auto pstr = str[index..$];
|
|
||||||
|
|
||||||
immutable length = str.length - index;
|
|
||||||
ubyte fst = pstr[0];
|
|
||||||
|
|
||||||
assert(fst & 0x80);
|
|
||||||
enum invalidUTFMsg = "Invalid UTF-8 sequence";
|
|
||||||
static if(!validated) { enum invalidUTF = Result(cast(dchar)int.max, invalidUTFMsg); }
|
|
||||||
|
|
||||||
// starter must have at least 2 first bits set
|
|
||||||
static if(validated)
|
|
||||||
{
|
|
||||||
assert((fst & 0b1100_0000) == 0b1100_0000, invalidUTFMsg);
|
|
||||||
}
|
|
||||||
else if((fst & 0b1100_0000) != 0b1100_0000)
|
|
||||||
{
|
|
||||||
return invalidUTF;
|
|
||||||
}
|
|
||||||
|
|
||||||
ubyte tmp = void;
|
|
||||||
dchar d = fst; // upper control bits are masked out later
|
|
||||||
fst <<= 1;
|
|
||||||
|
|
||||||
|
|
||||||
foreach (i; TypeTuple!(1, 2, 3))
|
|
||||||
{
|
|
||||||
static if(validated) { assert(i != length, "Decoding out of bounds"); }
|
|
||||||
else if(i == length) { return Result(cast(dchar)int.max, "Decoding out of bounds"); }
|
|
||||||
|
|
||||||
tmp = pstr[i];
|
|
||||||
static if(validated) { assert((tmp & 0xC0) == 0x80, invalidUTFMsg); }
|
|
||||||
else if((tmp & 0xC0) != 0x80) { return invalidUTF; }
|
|
||||||
|
|
||||||
d = (d << 6) | (tmp & 0x3F);
|
|
||||||
fst <<= 1;
|
|
||||||
|
|
||||||
if (!(fst & 0x80)) // no more bytes
|
|
||||||
{
|
|
||||||
d &= bitMask[i]; // mask out control bits
|
|
||||||
|
|
||||||
// overlong, could have been encoded with i bytes
|
|
||||||
static if(validated) { assert((d & ~bitMask[i - 1]) != 0, invalidUTFMsg); }
|
|
||||||
else if((d & ~bitMask[i - 1]) == 0) { return invalidUTF; }
|
|
||||||
|
|
||||||
// check for surrogates only needed for 3 bytes
|
|
||||||
static if (i == 2)
|
|
||||||
{
|
|
||||||
static if(validated) { assert(isValidDchar(d), invalidUTFMsg); }
|
|
||||||
else if(!isValidDchar(d)) { return invalidUTF; }
|
|
||||||
}
|
|
||||||
|
|
||||||
index += i + 1;
|
|
||||||
static if (i == 3)
|
|
||||||
{
|
|
||||||
static if(validated) { assert(d <= dchar.max, invalidUTFMsg); }
|
|
||||||
else if(d > dchar.max) { return invalidUTF; }
|
|
||||||
}
|
|
||||||
|
|
||||||
return Result(d);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
static if(validated) { assert(false, invalidUTFMsg); }
|
|
||||||
else { return invalidUTF; }
|
|
||||||
}
|
|
||||||
|
|
||||||
/// ditto
|
|
||||||
alias decodeValidUTF8NoGC = decodeUTF8NoGC!(Yes.validated);
|
|
||||||
|
|
||||||
/// @nogc version of std.utf.encode() for char[].
|
|
||||||
///
|
|
||||||
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
|
|
||||||
/// force code using this function to be efficient.
|
|
||||||
///
|
|
||||||
/// Params:
|
|
||||||
/// validated = If true, asssume c is a valid, non-surrogate UTF-32 code point and don't
|
|
||||||
/// generate any error-checking code. If validated is true, c $(B must) be
|
|
||||||
/// a valid character, otherwise undefined behavior will occur. Also affects
|
|
||||||
/// the return type.
|
|
||||||
/// buf = Buffer to write the encoded result to.
|
|
||||||
/// c = Character to encode.
|
|
||||||
///
|
|
||||||
/// Returns: If validated is true, number of bytes the encoded character takes up in buf.
|
|
||||||
/// Otherwise a struct with a 'bytes' member specifying the number of bytes of
|
|
||||||
/// the endocded character, and a 'string errorMessage' member that is null
|
|
||||||
/// if there was no error and otherwise stores the error message.
|
|
||||||
auto encodeCharNoGC(Flag!"validated" validated)(ref char[4] buf, dchar c)
|
|
||||||
@safe pure nothrow @nogc
|
|
||||||
{
|
|
||||||
static if(!validated) struct Result
|
|
||||||
{
|
|
||||||
size_t bytes;
|
|
||||||
string errorMessage;
|
|
||||||
}
|
|
||||||
else alias Result = size_t;
|
|
||||||
|
|
||||||
// Force the caller to optimize ASCII (the 1-byte case)
|
|
||||||
assert(c >= 0x80, "Caller should explicitly handle ASCII chars");
|
|
||||||
if (c <= 0x7FF)
|
|
||||||
{
|
|
||||||
assert(isValidDchar(c));
|
|
||||||
buf[0] = cast(char)(0xC0 | (c >> 6));
|
|
||||||
buf[1] = cast(char)(0x80 | (c & 0x3F));
|
|
||||||
return Result(2);
|
|
||||||
}
|
|
||||||
if (c <= 0xFFFF)
|
|
||||||
{
|
|
||||||
static if(validated)
|
|
||||||
{
|
|
||||||
assert(0xD800 > c || c > 0xDFFF,
|
|
||||||
"Supposedly valid code point is a surrogate code point");
|
|
||||||
}
|
|
||||||
else if(0xD800 <= c && c <= 0xDFFF)
|
|
||||||
{
|
|
||||||
return Result(size_t.max, "Can't encode a surrogate code point in UTF-8");
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(isValidDchar(c));
|
|
||||||
buf[0] = cast(char)(0xE0 | (c >> 12));
|
|
||||||
buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
||||||
buf[2] = cast(char)(0x80 | (c & 0x3F));
|
|
||||||
return Result(3);
|
|
||||||
}
|
|
||||||
if (c <= 0x10FFFF)
|
|
||||||
{
|
|
||||||
assert(isValidDchar(c));
|
|
||||||
buf[0] = cast(char)(0xF0 | (c >> 18));
|
|
||||||
buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
|
|
||||||
buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
|
|
||||||
buf[3] = cast(char)(0x80 | (c & 0x3F));
|
|
||||||
return Result(4);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert(!isValidDchar(c));
|
|
||||||
static if(!validated)
|
|
||||||
{
|
|
||||||
return Result(size_t.max, "Can't encode an invalid code point in UTF-8");
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
assert(false, "Supposedly valid code point is invalid");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// ditto
|
|
||||||
alias encodeValidCharNoGC = encodeCharNoGC!(Yes.validated);
|
|
||||||
|
|
||||||
/// @nogc version of std.utf.isValidDchar
|
|
||||||
bool isValidDchar(dchar c) @safe pure nothrow @nogc
|
|
||||||
{
|
|
||||||
return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF);
|
|
||||||
}
|
|
|
@ -538,7 +538,6 @@ final class Parser
|
||||||
{
|
{
|
||||||
string notInPlace;
|
string notInPlace;
|
||||||
bool inEscape = false;
|
bool inEscape = false;
|
||||||
import dyaml.nogcutil;
|
|
||||||
auto appender = appender!(char[])();
|
auto appender = appender!(char[])();
|
||||||
for(char[] oldValue = tokenValue; !oldValue.empty();)
|
for(char[] oldValue = tokenValue; !oldValue.empty();)
|
||||||
{
|
{
|
||||||
|
|
|
@ -26,7 +26,6 @@ import tinyendian;
|
||||||
import dyaml.fastcharsearch;
|
import dyaml.fastcharsearch;
|
||||||
import dyaml.encoding;
|
import dyaml.encoding;
|
||||||
import dyaml.exception;
|
import dyaml.exception;
|
||||||
import dyaml.nogcutil;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -134,7 +133,7 @@ final class Reader
|
||||||
///
|
///
|
||||||
// XXX removed; search for 'risky' to find why.
|
// XXX removed; search for 'risky' to find why.
|
||||||
// Throws: ReaderException if trying to read past the end of the buffer.
|
// Throws: ReaderException if trying to read past the end of the buffer.
|
||||||
dchar peek(const size_t index) @safe pure nothrow @nogc
|
dchar peek(const size_t index) @safe pure
|
||||||
{
|
{
|
||||||
if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
|
if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; }
|
||||||
if(characterCount_ <= charIndex_ + index)
|
if(characterCount_ <= charIndex_ + index)
|
||||||
|
@ -160,7 +159,7 @@ final class Reader
|
||||||
++lastDecodedBufferOffset_;
|
++lastDecodedBufferOffset_;
|
||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
|
return decode(buffer_, lastDecodedBufferOffset_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 'Slow' path where we decode everything up to the requested character.
|
// 'Slow' path where we decode everything up to the requested character.
|
||||||
|
@ -177,7 +176,7 @@ final class Reader
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Optimized version of peek() for the case where peek index is 0.
|
/// Optimized version of peek() for the case where peek index is 0.
|
||||||
dchar peek() @safe pure nothrow @nogc
|
dchar peek() @safe pure
|
||||||
{
|
{
|
||||||
if(upcomingASCII_ > 0) { return buffer_[bufferOffset_]; }
|
if(upcomingASCII_ > 0) { return buffer_[bufferOffset_]; }
|
||||||
if(characterCount_ <= charIndex_) { return '\0'; }
|
if(characterCount_ <= charIndex_) { return '\0'; }
|
||||||
|
@ -217,7 +216,7 @@ final class Reader
|
||||||
/// slice will be shorter.
|
/// slice will be shorter.
|
||||||
///
|
///
|
||||||
/// Returns: Characters starting at current position or an empty slice if out of bounds.
|
/// Returns: Characters starting at current position or an empty slice if out of bounds.
|
||||||
char[] prefix(const size_t length) @safe pure nothrow @nogc
|
char[] prefix(const size_t length) @safe pure
|
||||||
{
|
{
|
||||||
return slice(length);
|
return slice(length);
|
||||||
}
|
}
|
||||||
|
@ -250,7 +249,7 @@ final class Reader
|
||||||
/// be shorter.
|
/// be shorter.
|
||||||
///
|
///
|
||||||
/// Returns: Slice into the internal buffer or an empty slice if out of bounds.
|
/// Returns: Slice into the internal buffer or an empty slice if out of bounds.
|
||||||
char[] slice(const size_t end) @safe pure nothrow @nogc
|
char[] slice(const size_t end) @safe pure
|
||||||
{
|
{
|
||||||
// Fast path in case the caller has already peek()ed all the way to end.
|
// Fast path in case the caller has already peek()ed all the way to end.
|
||||||
if(end == lastDecodedCharOffset_)
|
if(end == lastDecodedCharOffset_)
|
||||||
|
@ -278,7 +277,7 @@ final class Reader
|
||||||
///
|
///
|
||||||
/// Throws: ReaderException if trying to read past the end of the buffer
|
/// Throws: ReaderException if trying to read past the end of the buffer
|
||||||
/// or if invalid data is read.
|
/// or if invalid data is read.
|
||||||
dchar get() @safe pure nothrow @nogc
|
dchar get() @safe pure
|
||||||
{
|
{
|
||||||
const result = peek();
|
const result = peek();
|
||||||
forward();
|
forward();
|
||||||
|
@ -290,7 +289,7 @@ final class Reader
|
||||||
/// Params: length = Number or characters (code points, not bytes) to get.
|
/// Params: length = Number or characters (code points, not bytes) to get.
|
||||||
///
|
///
|
||||||
/// Returns: Characters starting at current position.
|
/// Returns: Characters starting at current position.
|
||||||
char[] get(const size_t length) @safe pure nothrow @nogc
|
char[] get(const size_t length) @safe pure
|
||||||
{
|
{
|
||||||
auto result = slice(length);
|
auto result = slice(length);
|
||||||
forward(length);
|
forward(length);
|
||||||
|
@ -300,7 +299,7 @@ final class Reader
|
||||||
/// Move current position forward.
|
/// Move current position forward.
|
||||||
///
|
///
|
||||||
/// Params: length = Number of characters to move position forward.
|
/// Params: length = Number of characters to move position forward.
|
||||||
void forward(size_t length) @safe pure nothrow @nogc
|
void forward(size_t length) @safe pure
|
||||||
{
|
{
|
||||||
mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;
|
mixin FastCharSearch!"\n\u0085\u2028\u2029"d search;
|
||||||
|
|
||||||
|
@ -337,7 +336,7 @@ final class Reader
|
||||||
"ASCII must be handled by preceding code");
|
"ASCII must be handled by preceding code");
|
||||||
|
|
||||||
++charIndex_;
|
++charIndex_;
|
||||||
const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);
|
const c = decode(buffer_, bufferOffset_);
|
||||||
|
|
||||||
// New line. (can compare with '\n' without decoding since it's ASCII)
|
// New line. (can compare with '\n' without decoding since it's ASCII)
|
||||||
if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
|
if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
|
||||||
|
@ -355,7 +354,7 @@ final class Reader
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Move current position forward by one character.
|
/// Move current position forward by one character.
|
||||||
void forward() @safe pure nothrow @nogc
|
void forward() @safe pure
|
||||||
{
|
{
|
||||||
++charIndex_;
|
++charIndex_;
|
||||||
lastDecodedBufferOffset_ = bufferOffset_;
|
lastDecodedBufferOffset_ = bufferOffset_;
|
||||||
|
@ -384,7 +383,7 @@ final class Reader
|
||||||
assert(buffer_[bufferOffset_] >= 0x80,
|
assert(buffer_[bufferOffset_] >= 0x80,
|
||||||
"ASCII must be handled by preceding code");
|
"ASCII must be handled by preceding code");
|
||||||
|
|
||||||
const c = decodeValidUTF8NoGC(buffer_, bufferOffset_);
|
const c = decode(buffer_, bufferOffset_);
|
||||||
|
|
||||||
// New line. (can compare with '\n' without decoding since it's ASCII)
|
// New line. (can compare with '\n' without decoding since it's ASCII)
|
||||||
if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
|
if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n'))
|
||||||
|
@ -426,7 +425,7 @@ private:
|
||||||
// lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
|
// lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them.
|
||||||
//
|
//
|
||||||
// Does not advance the buffer position. Used in peek() and slice().
|
// Does not advance the buffer position. Used in peek() and slice().
|
||||||
dchar decodeNext() @safe pure nothrow @nogc
|
dchar decodeNext() @safe pure
|
||||||
{
|
{
|
||||||
assert(lastDecodedBufferOffset_ < buffer_.length,
|
assert(lastDecodedBufferOffset_ < buffer_.length,
|
||||||
"Attempted to decode past the end of YAML buffer");
|
"Attempted to decode past the end of YAML buffer");
|
||||||
|
@ -439,7 +438,7 @@ private:
|
||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
|
|
||||||
return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_);
|
return decode(buffer_, lastDecodedBufferOffset_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -559,7 +558,7 @@ public:
|
||||||
/// Data can only be written up to the current position in the Reader buffer.
|
/// Data can only be written up to the current position in the Reader buffer.
|
||||||
///
|
///
|
||||||
/// See_Also: begin
|
/// See_Also: begin
|
||||||
void write(dchar c) @safe pure nothrow @nogc
|
void write(dchar c) @safe pure
|
||||||
{
|
{
|
||||||
assert(inProgress, "write called without begin");
|
assert(inProgress, "write called without begin");
|
||||||
if(c < 0x80)
|
if(c < 0x80)
|
||||||
|
@ -570,7 +569,7 @@ public:
|
||||||
|
|
||||||
// We need to encode a non-ASCII dchar into UTF-8
|
// We need to encode a non-ASCII dchar into UTF-8
|
||||||
char[4] encodeBuf;
|
char[4] encodeBuf;
|
||||||
const bytes = encodeValidCharNoGC(encodeBuf, c);
|
const bytes = encode(encodeBuf, c);
|
||||||
reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
|
reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes];
|
||||||
end_ += bytes;
|
end_ += bytes;
|
||||||
}
|
}
|
||||||
|
@ -586,7 +585,7 @@ public:
|
||||||
/// position = Position to insert the character at in code units, not code points.
|
/// position = Position to insert the character at in code units, not code points.
|
||||||
/// Must be less than slice length(); a previously returned length()
|
/// Must be less than slice length(); a previously returned length()
|
||||||
/// can be used.
|
/// can be used.
|
||||||
void insert(const dchar c, const size_t position) @safe pure nothrow @nogc
|
void insert(const dchar c, const size_t position) @safe pure
|
||||||
{
|
{
|
||||||
assert(inProgress, "insert called without begin");
|
assert(inProgress, "insert called without begin");
|
||||||
assert(start_ + position <= end_, "Trying to insert after the end of the slice");
|
assert(start_ + position <= end_, "Trying to insert after the end of the slice");
|
||||||
|
@ -597,7 +596,7 @@ public:
|
||||||
// Encode c into UTF-8
|
// Encode c into UTF-8
|
||||||
char[4] encodeBuf;
|
char[4] encodeBuf;
|
||||||
if(c < 0x80) { encodeBuf[0] = cast(char)c; }
|
if(c < 0x80) { encodeBuf[0] = cast(char)c; }
|
||||||
const size_t bytes = c < 0x80 ? 1 : encodeValidCharNoGC(encodeBuf, c);
|
const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c);
|
||||||
|
|
||||||
if(movedLength > 0)
|
if(movedLength > 0)
|
||||||
{
|
{
|
||||||
|
@ -764,13 +763,8 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c);
|
std.utf.encode(encodeBuf, c);
|
||||||
if(encodeResult.errorMessage !is null)
|
const bytes = codeLength!char(c);
|
||||||
{
|
|
||||||
result.errorMessage = encodeResult.errorMessage;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const bytes = encodeResult.bytes;
|
|
||||||
utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
|
utf8[length .. length + bytes] = encodeBuf[0 .. bytes];
|
||||||
length += bytes;
|
length += bytes;
|
||||||
}
|
}
|
||||||
|
@ -788,14 +782,8 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
|
||||||
{
|
{
|
||||||
case UTFEncoding.UTF_8:
|
case UTFEncoding.UTF_8:
|
||||||
result.utf8 = cast(char[])input;
|
result.utf8 = cast(char[])input;
|
||||||
const validateResult = result.utf8.validateUTF8NoGC();
|
result.utf8.validate();
|
||||||
if(!validateResult.valid)
|
result.characterCount = std.utf.count(result.utf8);
|
||||||
{
|
|
||||||
result.errorMessage = "UTF-8 validation error after character #" ~
|
|
||||||
validateResult.characterCount.to!string ~ ": " ~
|
|
||||||
validateResult.msg;
|
|
||||||
}
|
|
||||||
result.characterCount = validateResult.characterCount;
|
|
||||||
break;
|
break;
|
||||||
case UTFEncoding.UTF_16:
|
case UTFEncoding.UTF_16:
|
||||||
assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
|
assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
|
||||||
|
@ -817,7 +805,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Determine if all characters (code points, not bytes) in a string are printable.
|
/// Determine if all characters (code points, not bytes) in a string are printable.
|
||||||
bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
|
bool isPrintableValidUTF8(const char[] chars) @safe pure
|
||||||
{
|
{
|
||||||
// This is oversized (only 128 entries are necessary) simply because having 256
|
// This is oversized (only 128 entries are necessary) simply because having 256
|
||||||
// entries improves performance... for some reason (alignment?)
|
// entries improves performance... for some reason (alignment?)
|
||||||
|
@ -917,7 +905,7 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
|
||||||
if(index == chars.length) { break; }
|
if(index == chars.length) { break; }
|
||||||
|
|
||||||
// Not ASCII, need to decode.
|
// Not ASCII, need to decode.
|
||||||
const dchar c = decodeValidUTF8NoGC(chars, index);
|
const dchar c = decode(chars, index);
|
||||||
// We now c is not ASCII, so only check for printable non-ASCII chars.
|
// We now c is not ASCII, so only check for printable non-ASCII chars.
|
||||||
if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
|
if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
|
||||||
(c >= '\uE000' && c <= '\uFFFD') ||
|
(c >= '\uE000' && c <= '\uFFFD') ||
|
||||||
|
|
|
@ -20,11 +20,11 @@ import std.exception;
|
||||||
import std.string;
|
import std.string;
|
||||||
import std.typecons;
|
import std.typecons;
|
||||||
import std.traits : Unqual;
|
import std.traits : Unqual;
|
||||||
|
import std.utf;
|
||||||
|
|
||||||
import dyaml.fastcharsearch;
|
import dyaml.fastcharsearch;
|
||||||
import dyaml.escapes;
|
import dyaml.escapes;
|
||||||
import dyaml.exception;
|
import dyaml.exception;
|
||||||
import dyaml.nogcutil;
|
|
||||||
import dyaml.queue;
|
import dyaml.queue;
|
||||||
import dyaml.reader;
|
import dyaml.reader;
|
||||||
import dyaml.style;
|
import dyaml.style;
|
||||||
|
@ -1552,7 +1552,7 @@ final class Scanner
|
||||||
for(size_t i = oldSliceLength; i < slice.length;)
|
for(size_t i = oldSliceLength; i < slice.length;)
|
||||||
{
|
{
|
||||||
// slice is UTF-8 - need to decode
|
// slice is UTF-8 - need to decode
|
||||||
const ch = slice[i] < 0x80 ? slice[i++] : decodeValidUTF8NoGC(slice, i);
|
const ch = slice[i] < 0x80 ? slice[i++] : decode(slice, i);
|
||||||
if(search.canFind(ch)) { break outer; }
|
if(search.canFind(ch)) { break outer; }
|
||||||
++numCodePoints;
|
++numCodePoints;
|
||||||
}
|
}
|
||||||
|
@ -1985,9 +1985,7 @@ final class Scanner
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
const decoded = decodeUTF8NoGC!(No.validated)(bytes[], nextChar);
|
c = decode(bytes[], nextChar);
|
||||||
if(decoded.errorMessage !is null) { return size_t.max; }
|
|
||||||
c = decoded.decoded;
|
|
||||||
}
|
}
|
||||||
reader_.sliceBuilder.write(c);
|
reader_.sliceBuilder.write(c);
|
||||||
if(bytes.length - nextChar > 0)
|
if(bytes.length - nextChar > 0)
|
||||||
|
|
|
@ -1,161 +0,0 @@
|
||||||
// Copyright Ferdinand Majerech 2014.
|
|
||||||
// Distributed under the Boost Software License, Version 1.0.
|
|
||||||
// (See accompanying file LICENSE_1_0.txt or copy at
|
|
||||||
// http://www.boost.org/LICENSE_1_0.txt)
|
|
||||||
|
|
||||||
|
|
||||||
// Code that is currently unused but may be useful for future D:YAML releases
|
|
||||||
module dyaml.unused;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
import std.utf;
|
|
||||||
|
|
||||||
import tinyendian;
|
|
||||||
|
|
||||||
// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
|
|
||||||
//
|
|
||||||
// Params:
|
|
||||||
//
|
|
||||||
// input = The UTF-8/16/32 buffer to decode.
|
|
||||||
// encoding = Encoding of input.
|
|
||||||
//
|
|
||||||
// Returns:
|
|
||||||
//
|
|
||||||
// A struct with the following members:
|
|
||||||
//
|
|
||||||
// $(D string errorMessage) In case of a decoding error, the error message is stored
|
|
||||||
// here. If there was no error, errorMessage is NULL. Always
|
|
||||||
// check this first before using the other members.
|
|
||||||
// $(D dchar[] decoded) A GC-allocated buffer with decoded UTF-32 characters.
|
|
||||||
auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
|
|
||||||
{
|
|
||||||
// Documented in function ddoc.
|
|
||||||
struct Result
|
|
||||||
{
|
|
||||||
string errorMessage;
|
|
||||||
dchar[] decoded;
|
|
||||||
}
|
|
||||||
|
|
||||||
Result result;
|
|
||||||
|
|
||||||
// Decode input_ if it's encoded as UTF-8 or UTF-16.
|
|
||||||
//
|
|
||||||
// Params:
|
|
||||||
//
|
|
||||||
// buffer = The input buffer to decode.
|
|
||||||
// result = A Result struct to put decoded result and any error messages to.
|
|
||||||
//
|
|
||||||
// On error, result.errorMessage will be set.
|
|
||||||
static void decode(C)(C[] input, ref Result result)
|
|
||||||
{
|
|
||||||
// End of part of input that contains complete characters that can be decoded.
|
|
||||||
const size_t end = endOfLastUTFSequence(input);
|
|
||||||
// If end is 0, there are no full chars.
|
|
||||||
// This can happen at the end of file if there is an incomplete UTF sequence.
|
|
||||||
if(end < input.length)
|
|
||||||
{
|
|
||||||
result.errorMessage = "Invalid UTF character at the end of input";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
const srclength = input.length;
|
|
||||||
try for(size_t srcpos = 0; srcpos < srclength;)
|
|
||||||
{
|
|
||||||
const c = input[srcpos];
|
|
||||||
if(c < 0x80)
|
|
||||||
{
|
|
||||||
result.decoded ~= c;
|
|
||||||
++srcpos;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
result.decoded ~= std.utf.decode(input, srcpos);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch(UTFException e)
|
|
||||||
{
|
|
||||||
result.errorMessage = e.msg;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
catch(Exception e)
|
|
||||||
{
|
|
||||||
assert(false, "Unexpected exception in decode(): " ~ e.msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
final switch(encoding)
|
|
||||||
{
|
|
||||||
case UTFEncoding.UTF_8: decode(cast(char[])input, result); break;
|
|
||||||
case UTFEncoding.UTF_16:
|
|
||||||
assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
|
|
||||||
decode(cast(wchar[])input, result);
|
|
||||||
break;
|
|
||||||
case UTFEncoding.UTF_32:
|
|
||||||
assert(input.length % 4 == 0,
|
|
||||||
"UTF-32 buffer size must be a multiple of 4");
|
|
||||||
// No need to decode anything
|
|
||||||
result.decoded = cast(dchar[])input;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(result.errorMessage !is null) { return result; }
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
|
|
||||||
size_t endOfLastUTFSequence(C)(const C[] buffer)
|
|
||||||
{
|
|
||||||
static if(is(C == char))
|
|
||||||
{
|
|
||||||
for(long end = buffer.length - 1; end >= 0; --end)
|
|
||||||
{
|
|
||||||
const stride = utf8Stride[buffer[cast(size_t)end]];
|
|
||||||
if(stride != 0xFF)
|
|
||||||
{
|
|
||||||
// If stride goes beyond end of the buffer, return end.
|
|
||||||
// Otherwise the last sequence ends at buffer.length, so we can
|
|
||||||
// return that. (Unless there is an invalid code unit, which is
|
|
||||||
// caught at decoding)
|
|
||||||
return (stride > buffer.length - end) ? cast(size_t)end : buffer.length;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
else static if(is(C == wchar))
|
|
||||||
{
|
|
||||||
// TODO this is O(N), which is slow. Find out if we can somehow go
|
|
||||||
// from the end backwards with UTF-16.
|
|
||||||
size_t end = 0;
|
|
||||||
while(end < buffer.length)
|
|
||||||
{
|
|
||||||
const s = stride(buffer, end);
|
|
||||||
if(s + end > buffer.length) { break; }
|
|
||||||
end += s;
|
|
||||||
}
|
|
||||||
return end;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
|
|
||||||
immutable ubyte[256] utf8Stride =
|
|
||||||
[
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
|
|
||||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
||||||
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
|
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
|
|
||||||
4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
|
|
||||||
];
|
|
Loading…
Reference in a new issue