From 1dac1f39f3da14dc63db93169067dbb65e56e35b Mon Sep 17 00:00:00 2001 From: Cameron Ross Date: Fri, 20 Apr 2018 20:42:34 -0300 Subject: [PATCH] remove dyaml.unused and rest of dyaml.nogcutil in favour of std.utf functions --- meson.build | 4 +- source/dyaml/nogcutil.d | 242 ---------------------------------------- source/dyaml/parser.d | 1 - source/dyaml/reader.d | 58 ++++------ source/dyaml/scanner.d | 8 +- source/dyaml/unused.d | 161 -------------------------- 6 files changed, 27 insertions(+), 447 deletions(-) delete mode 100644 source/dyaml/nogcutil.d delete mode 100644 source/dyaml/unused.d diff --git a/meson.build b/meson.build index f99e9ce..053912b 100644 --- a/meson.build +++ b/meson.build @@ -24,7 +24,6 @@ dyaml_src = [ 'source/dyaml/linebreak.d', 'source/dyaml/loader.d', 'source/dyaml/node.d', - 'source/dyaml/nogcutil.d', 'source/dyaml/package.d', 'source/dyaml/parser.d', 'source/dyaml/queue.d', @@ -46,8 +45,7 @@ dyaml_src = [ 'source/dyaml/test/representer.d', 'source/dyaml/test/resolver.d', 'source/dyaml/test/tokens.d', - 'source/dyaml/token.d', - 'source/dyaml/unused.d' + 'source/dyaml/token.d' ] install_subdir('source/dyaml', install_dir: 'include/d/yaml/') diff --git a/source/dyaml/nogcutil.d b/source/dyaml/nogcutil.d deleted file mode 100644 index 322fb04..0000000 --- a/source/dyaml/nogcutil.d +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright Ferdinand Majerech 2014, Digital Mars 2000-2012, Andrei Alexandrescu 2008- and Jonathan M Davis 2011-. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - - -/// @nogc versions of or alternatives to Phobos functions that are not yet @nogc and -/// wrappers to simplify their use. -module dyaml.nogcutil; - - - -import std.traits; -import std.typecons; -import std.typetuple; -import std.range; - - -/// Result of a validateUTF8NoGC call. -struct ValidateResult -{ - /// Is the validated string valid? - bool valid; - /// Number of characters in the string. - /// - /// If the string is not valid, this is the number of valid characters before - /// hitting the first invalid sequence. - size_t characterCount; - /// If the string is not valid, error message with details is here. - string msg; -} - -/// Validate a UTF-8 string, checking if it is well-formed Unicode. -/// -/// See_Also: ValidateResult -ValidateResult validateUTF8NoGC(const(char[]) str) @safe pure nothrow @nogc -{ - immutable len = str.length; - size_t characterCount; - outer: for (size_t index = 0; index < len; ) - { - if(str[index] < 0x80) - { - ++index; - ++characterCount; - continue; - } - - auto decoded = decodeUTF8NoGC!(No.validated)(str, index); - if(decoded.errorMessage !is null) - { - return ValidateResult(false, characterCount, decoded.errorMessage); - } - ++characterCount; - } - - return ValidateResult(true, characterCount); -} - -/// @nogc version of std.utf.decode() for char[]. -/// -/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to -/// force code using this function to be efficient. -/// -/// Params: -/// -/// validated = If ture, assume str is a valid UTF-8 string and don't generate any -/// error-checking code. If validated is true, str $(B must) be a valid -/// character, otherwise undefined behavior will occur. Also affects the -/// return type. -/// str = Will decode the first code point from this string. -/// index = Index in str where the code point starts. Will be updated to point to -/// the next code point. -/// -/// Returns: If validated is true, the decoded character. -/// Otherwise a struct with a 'decoded' member - the decoded character, and a -/// 'string errorMessage' member that is null on success and otherwise stores -/// the error message. -auto decodeUTF8NoGC(Flag!"validated" validated)(const(char[]) str, ref size_t index) -{ - static if(!validated) struct Result - { - dchar decoded; - string errorMessage; - } - else alias Result = dchar; - - /// Dchar bitmask for different numbers of UTF-8 code units. - enum bitMask = tuple((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1); - - auto pstr = str[index..$]; - - immutable length = str.length - index; - ubyte fst = pstr[0]; - - assert(fst & 0x80); - enum invalidUTFMsg = "Invalid UTF-8 sequence"; - static if(!validated) { enum invalidUTF = Result(cast(dchar)int.max, invalidUTFMsg); } - - // starter must have at least 2 first bits set - static if(validated) - { - assert((fst & 0b1100_0000) == 0b1100_0000, invalidUTFMsg); - } - else if((fst & 0b1100_0000) != 0b1100_0000) - { - return invalidUTF; - } - - ubyte tmp = void; - dchar d = fst; // upper control bits are masked out later - fst <<= 1; - - - foreach (i; TypeTuple!(1, 2, 3)) - { - static if(validated) { assert(i != length, "Decoding out of bounds"); } - else if(i == length) { return Result(cast(dchar)int.max, "Decoding out of bounds"); } - - tmp = pstr[i]; - static if(validated) { assert((tmp & 0xC0) == 0x80, invalidUTFMsg); } - else if((tmp & 0xC0) != 0x80) { return invalidUTF; } - - d = (d << 6) | (tmp & 0x3F); - fst <<= 1; - - if (!(fst & 0x80)) // no more bytes - { - d &= bitMask[i]; // mask out control bits - - // overlong, could have been encoded with i bytes - static if(validated) { assert((d & ~bitMask[i - 1]) != 0, invalidUTFMsg); } - else if((d & ~bitMask[i - 1]) == 0) { return invalidUTF; } - - // check for surrogates only needed for 3 bytes - static if (i == 2) - { - static if(validated) { assert(isValidDchar(d), invalidUTFMsg); } - else if(!isValidDchar(d)) { return invalidUTF; } - } - - index += i + 1; - static if (i == 3) - { - static if(validated) { assert(d <= dchar.max, invalidUTFMsg); } - else if(d > dchar.max) { return invalidUTF; } - } - - return Result(d); - } - } - - static if(validated) { assert(false, invalidUTFMsg); } - else { return invalidUTF; } -} - -/// ditto -alias decodeValidUTF8NoGC = decodeUTF8NoGC!(Yes.validated); - -/// @nogc version of std.utf.encode() for char[]. -/// -/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to -/// force code using this function to be efficient. -/// -/// Params: -/// validated = If true, asssume c is a valid, non-surrogate UTF-32 code point and don't -/// generate any error-checking code. If validated is true, c $(B must) be -/// a valid character, otherwise undefined behavior will occur. Also affects -/// the return type. -/// buf = Buffer to write the encoded result to. -/// c = Character to encode. -/// -/// Returns: If validated is true, number of bytes the encoded character takes up in buf. -/// Otherwise a struct with a 'bytes' member specifying the number of bytes of -/// the endocded character, and a 'string errorMessage' member that is null -/// if there was no error and otherwise stores the error message. -auto encodeCharNoGC(Flag!"validated" validated)(ref char[4] buf, dchar c) - @safe pure nothrow @nogc -{ - static if(!validated) struct Result - { - size_t bytes; - string errorMessage; - } - else alias Result = size_t; - - // Force the caller to optimize ASCII (the 1-byte case) - assert(c >= 0x80, "Caller should explicitly handle ASCII chars"); - if (c <= 0x7FF) - { - assert(isValidDchar(c)); - buf[0] = cast(char)(0xC0 | (c >> 6)); - buf[1] = cast(char)(0x80 | (c & 0x3F)); - return Result(2); - } - if (c <= 0xFFFF) - { - static if(validated) - { - assert(0xD800 > c || c > 0xDFFF, - "Supposedly valid code point is a surrogate code point"); - } - else if(0xD800 <= c && c <= 0xDFFF) - { - return Result(size_t.max, "Can't encode a surrogate code point in UTF-8"); - } - - assert(isValidDchar(c)); - buf[0] = cast(char)(0xE0 | (c >> 12)); - buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); - buf[2] = cast(char)(0x80 | (c & 0x3F)); - return Result(3); - } - if (c <= 0x10FFFF) - { - assert(isValidDchar(c)); - buf[0] = cast(char)(0xF0 | (c >> 18)); - buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); - buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); - buf[3] = cast(char)(0x80 | (c & 0x3F)); - return Result(4); - } - - assert(!isValidDchar(c)); - static if(!validated) - { - return Result(size_t.max, "Can't encode an invalid code point in UTF-8"); - } - else - { - assert(false, "Supposedly valid code point is invalid"); - } -} - -/// ditto -alias encodeValidCharNoGC = encodeCharNoGC!(Yes.validated); - -/// @nogc version of std.utf.isValidDchar -bool isValidDchar(dchar c) @safe pure nothrow @nogc -{ - return c < 0xD800 || (c > 0xDFFF && c <= 0x10FFFF); -} diff --git a/source/dyaml/parser.d b/source/dyaml/parser.d index 199eb0a..50c5543 100644 --- a/source/dyaml/parser.d +++ b/source/dyaml/parser.d @@ -538,7 +538,6 @@ final class Parser { string notInPlace; bool inEscape = false; - import dyaml.nogcutil; auto appender = appender!(char[])(); for(char[] oldValue = tokenValue; !oldValue.empty();) { diff --git a/source/dyaml/reader.d b/source/dyaml/reader.d index c197e60..725cd8b 100644 --- a/source/dyaml/reader.d +++ b/source/dyaml/reader.d @@ -26,7 +26,6 @@ import tinyendian; import dyaml.fastcharsearch; import dyaml.encoding; import dyaml.exception; -import dyaml.nogcutil; @@ -134,7 +133,7 @@ final class Reader /// // XXX removed; search for 'risky' to find why. // Throws: ReaderException if trying to read past the end of the buffer. - dchar peek(const size_t index) @safe pure nothrow @nogc + dchar peek(const size_t index) @safe pure { if(index < upcomingASCII_) { return buffer_[bufferOffset_ + index]; } if(characterCount_ <= charIndex_ + index) @@ -160,7 +159,7 @@ final class Reader ++lastDecodedBufferOffset_; return b; } - return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_); + return decode(buffer_, lastDecodedBufferOffset_); } // 'Slow' path where we decode everything up to the requested character. @@ -177,7 +176,7 @@ final class Reader } /// Optimized version of peek() for the case where peek index is 0. - dchar peek() @safe pure nothrow @nogc + dchar peek() @safe pure { if(upcomingASCII_ > 0) { return buffer_[bufferOffset_]; } if(characterCount_ <= charIndex_) { return '\0'; } @@ -217,7 +216,7 @@ final class Reader /// slice will be shorter. /// /// Returns: Characters starting at current position or an empty slice if out of bounds. - char[] prefix(const size_t length) @safe pure nothrow @nogc + char[] prefix(const size_t length) @safe pure { return slice(length); } @@ -250,7 +249,7 @@ final class Reader /// be shorter. /// /// Returns: Slice into the internal buffer or an empty slice if out of bounds. - char[] slice(const size_t end) @safe pure nothrow @nogc + char[] slice(const size_t end) @safe pure { // Fast path in case the caller has already peek()ed all the way to end. if(end == lastDecodedCharOffset_) @@ -278,7 +277,7 @@ final class Reader /// /// Throws: ReaderException if trying to read past the end of the buffer /// or if invalid data is read. - dchar get() @safe pure nothrow @nogc + dchar get() @safe pure { const result = peek(); forward(); @@ -290,7 +289,7 @@ final class Reader /// Params: length = Number or characters (code points, not bytes) to get. /// /// Returns: Characters starting at current position. - char[] get(const size_t length) @safe pure nothrow @nogc + char[] get(const size_t length) @safe pure { auto result = slice(length); forward(length); @@ -300,7 +299,7 @@ final class Reader /// Move current position forward. /// /// Params: length = Number of characters to move position forward. - void forward(size_t length) @safe pure nothrow @nogc + void forward(size_t length) @safe pure { mixin FastCharSearch!"\n\u0085\u2028\u2029"d search; @@ -337,7 +336,7 @@ final class Reader "ASCII must be handled by preceding code"); ++charIndex_; - const c = decodeValidUTF8NoGC(buffer_, bufferOffset_); + const c = decode(buffer_, bufferOffset_); // New line. (can compare with '\n' without decoding since it's ASCII) if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n')) @@ -355,7 +354,7 @@ final class Reader } /// Move current position forward by one character. - void forward() @safe pure nothrow @nogc + void forward() @safe pure { ++charIndex_; lastDecodedBufferOffset_ = bufferOffset_; @@ -384,7 +383,7 @@ final class Reader assert(buffer_[bufferOffset_] >= 0x80, "ASCII must be handled by preceding code"); - const c = decodeValidUTF8NoGC(buffer_, bufferOffset_); + const c = decode(buffer_, bufferOffset_); // New line. (can compare with '\n' without decoding since it's ASCII) if(search.canFind(c) || (c == '\r' && buffer_[bufferOffset_] != '\n')) @@ -426,7 +425,7 @@ private: // lastDecodedCharOffset_/lastDecodedBufferOffset_ and update them. // // Does not advance the buffer position. Used in peek() and slice(). - dchar decodeNext() @safe pure nothrow @nogc + dchar decodeNext() @safe pure { assert(lastDecodedBufferOffset_ < buffer_.length, "Attempted to decode past the end of YAML buffer"); @@ -439,7 +438,7 @@ private: return b; } - return decodeValidUTF8NoGC(buffer_, lastDecodedBufferOffset_); + return decode(buffer_, lastDecodedBufferOffset_); } } @@ -559,7 +558,7 @@ public: /// Data can only be written up to the current position in the Reader buffer. /// /// See_Also: begin - void write(dchar c) @safe pure nothrow @nogc + void write(dchar c) @safe pure { assert(inProgress, "write called without begin"); if(c < 0x80) @@ -570,7 +569,7 @@ public: // We need to encode a non-ASCII dchar into UTF-8 char[4] encodeBuf; - const bytes = encodeValidCharNoGC(encodeBuf, c); + const bytes = encode(encodeBuf, c); reader_.buffer_[end_ .. end_ + bytes] = encodeBuf[0 .. bytes]; end_ += bytes; } @@ -586,7 +585,7 @@ public: /// position = Position to insert the character at in code units, not code points. /// Must be less than slice length(); a previously returned length() /// can be used. - void insert(const dchar c, const size_t position) @safe pure nothrow @nogc + void insert(const dchar c, const size_t position) @safe pure { assert(inProgress, "insert called without begin"); assert(start_ + position <= end_, "Trying to insert after the end of the slice"); @@ -597,7 +596,7 @@ public: // Encode c into UTF-8 char[4] encodeBuf; if(c < 0x80) { encodeBuf[0] = cast(char)c; } - const size_t bytes = c < 0x80 ? 1 : encodeValidCharNoGC(encodeBuf, c); + const size_t bytes = c < 0x80 ? 1 : encode(encodeBuf, c); if(movedLength > 0) { @@ -764,13 +763,8 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow continue; } - const encodeResult = encodeCharNoGC!(No.validated)(encodeBuf, c); - if(encodeResult.errorMessage !is null) - { - result.errorMessage = encodeResult.errorMessage; - return; - } - const bytes = encodeResult.bytes; + std.utf.encode(encodeBuf, c); + const bytes = codeLength!char(c); utf8[length .. length + bytes] = encodeBuf[0 .. bytes]; length += bytes; } @@ -788,14 +782,8 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow { case UTFEncoding.UTF_8: result.utf8 = cast(char[])input; - const validateResult = result.utf8.validateUTF8NoGC(); - if(!validateResult.valid) - { - result.errorMessage = "UTF-8 validation error after character #" ~ - validateResult.characterCount.to!string ~ ": " ~ - validateResult.msg; - } - result.characterCount = validateResult.characterCount; + result.utf8.validate(); + result.characterCount = std.utf.count(result.utf8); break; case UTFEncoding.UTF_16: assert(input.length % 2 == 0, "UTF-16 buffer size must be even"); @@ -817,7 +805,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow } /// Determine if all characters (code points, not bytes) in a string are printable. -bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc +bool isPrintableValidUTF8(const char[] chars) @safe pure { // This is oversized (only 128 entries are necessary) simply because having 256 // entries improves performance... for some reason (alignment?) @@ -917,7 +905,7 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc if(index == chars.length) { break; } // Not ASCII, need to decode. - const dchar c = decodeValidUTF8NoGC(chars, index); + const dchar c = decode(chars, index); // We now c is not ASCII, so only check for printable non-ASCII chars. if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') || (c >= '\uE000' && c <= '\uFFFD') || diff --git a/source/dyaml/scanner.d b/source/dyaml/scanner.d index f833230..d9be4c5 100644 --- a/source/dyaml/scanner.d +++ b/source/dyaml/scanner.d @@ -20,11 +20,11 @@ import std.exception; import std.string; import std.typecons; import std.traits : Unqual; +import std.utf; import dyaml.fastcharsearch; import dyaml.escapes; import dyaml.exception; -import dyaml.nogcutil; import dyaml.queue; import dyaml.reader; import dyaml.style; @@ -1552,7 +1552,7 @@ final class Scanner for(size_t i = oldSliceLength; i < slice.length;) { // slice is UTF-8 - need to decode - const ch = slice[i] < 0x80 ? slice[i++] : decodeValidUTF8NoGC(slice, i); + const ch = slice[i] < 0x80 ? slice[i++] : decode(slice, i); if(search.canFind(ch)) { break outer; } ++numCodePoints; } @@ -1985,9 +1985,7 @@ final class Scanner } else { - const decoded = decodeUTF8NoGC!(No.validated)(bytes[], nextChar); - if(decoded.errorMessage !is null) { return size_t.max; } - c = decoded.decoded; + c = decode(bytes[], nextChar); } reader_.sliceBuilder.write(c); if(bytes.length - nextChar > 0) diff --git a/source/dyaml/unused.d b/source/dyaml/unused.d deleted file mode 100644 index 73864f7..0000000 --- a/source/dyaml/unused.d +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright Ferdinand Majerech 2014. -// Distributed under the Boost Software License, Version 1.0. -// (See accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) - - -// Code that is currently unused but may be useful for future D:YAML releases -module dyaml.unused; - - - -import std.utf; - -import tinyendian; - -// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing). -// -// Params: -// -// input = The UTF-8/16/32 buffer to decode. -// encoding = Encoding of input. -// -// Returns: -// -// A struct with the following members: -// -// $(D string errorMessage) In case of a decoding error, the error message is stored -// here. If there was no error, errorMessage is NULL. Always -// check this first before using the other members. -// $(D dchar[] decoded) A GC-allocated buffer with decoded UTF-32 characters. -auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow -{ - // Documented in function ddoc. - struct Result - { - string errorMessage; - dchar[] decoded; - } - - Result result; - - // Decode input_ if it's encoded as UTF-8 or UTF-16. - // - // Params: - // - // buffer = The input buffer to decode. - // result = A Result struct to put decoded result and any error messages to. - // - // On error, result.errorMessage will be set. - static void decode(C)(C[] input, ref Result result) - { - // End of part of input that contains complete characters that can be decoded. - const size_t end = endOfLastUTFSequence(input); - // If end is 0, there are no full chars. - // This can happen at the end of file if there is an incomplete UTF sequence. - if(end < input.length) - { - result.errorMessage = "Invalid UTF character at the end of input"; - return; - } - - const srclength = input.length; - try for(size_t srcpos = 0; srcpos < srclength;) - { - const c = input[srcpos]; - if(c < 0x80) - { - result.decoded ~= c; - ++srcpos; - } - else - { - result.decoded ~= std.utf.decode(input, srcpos); - } - } - catch(UTFException e) - { - result.errorMessage = e.msg; - return; - } - catch(Exception e) - { - assert(false, "Unexpected exception in decode(): " ~ e.msg); - } - } - - final switch(encoding) - { - case UTFEncoding.UTF_8: decode(cast(char[])input, result); break; - case UTFEncoding.UTF_16: - assert(input.length % 2 == 0, "UTF-16 buffer size must be even"); - decode(cast(wchar[])input, result); - break; - case UTFEncoding.UTF_32: - assert(input.length % 4 == 0, - "UTF-32 buffer size must be a multiple of 4"); - // No need to decode anything - result.decoded = cast(dchar[])input; - break; - } - - if(result.errorMessage !is null) { return result; } - - return result; -} - - -// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer. -size_t endOfLastUTFSequence(C)(const C[] buffer) -{ - static if(is(C == char)) - { - for(long end = buffer.length - 1; end >= 0; --end) - { - const stride = utf8Stride[buffer[cast(size_t)end]]; - if(stride != 0xFF) - { - // If stride goes beyond end of the buffer, return end. - // Otherwise the last sequence ends at buffer.length, so we can - // return that. (Unless there is an invalid code unit, which is - // caught at decoding) - return (stride > buffer.length - end) ? cast(size_t)end : buffer.length; - } - } - return 0; - } - else static if(is(C == wchar)) - { - // TODO this is O(N), which is slow. Find out if we can somehow go - // from the end backwards with UTF-16. - size_t end = 0; - while(end < buffer.length) - { - const s = stride(buffer, end); - if(s + end > buffer.length) { break; } - end += s; - } - return end; - } -} - -// UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence). -immutable ubyte[256] utf8Stride = -[ - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF, -];