dyaml/source/dyaml/nogcutil.d

// Copyright Ferdinand Majerech 2014, Digital Mars 2000-2012, Andrei Alexandrescu 2008- and Jonathan M Davis 2011-.
// Distributed under the Boost Software License, Version 1.0.
//    (See accompanying file LICENSE_1_0.txt or copy at
//          http://www.boost.org/LICENSE_1_0.txt)


/// @nogc versions of or alternatives to Phobos functions that are not yet @nogc and
/// wrappers to simplify their use.
module dyaml.nogcutil;


import std.traits;
import std.typecons;
import std.range;


/// A @nogc counterpart of std.conv.parse for integer types.
///
/// Parses the longest prefix of s consisting of digits valid in the given radix (for
/// radices above 10, letters are accepted in either case). No sign is recognized.
///
/// Differences from std.conv.parse:
///    overflow parameter - bool set to true if there was integer overflow.
///    Asserts that at least one character was parsed instead of throwing an exception.
///    s is only read; it is not advanced past the parsed digits.
///    The caller must validate the inputs before calling parseNoGC.
///
/// Params:
///
/// s        = Characters to parse. Must start with at least one valid digit.
/// radix    = Base of the number; must be between 2 and 36 (inclusive).
/// overflow = Set to true if the parsed number does not fit into Target, false
///            otherwise.
///
/// Returns: The parsed value, or Target.max if an overflow was detected.
Target parseNoGC(Target, Source)(ref Source s, uint radix, out bool overflow)
    @safe pure nothrow @nogc
    if (isSomeChar!(ElementType!Source) &&
        isIntegral!Target && !is(Target == enum))
in { assert(radix >= 2 && radix <= 36); }
body
{
    // First character value that is NOT a digit in this radix (in the '0'.. range for
    // radix < 10, in the lowercase 'a'.. range otherwise).
    immutable uint beyond = (radix < 10 ? '0' : 'a'-10) + radix;
    // Largest representable result, widened so the overflow check can't itself wrap.
    immutable ulong limit = Target.max;

    Target v = 0;
    bool atStart = true;

    // We can safely iterate over individual code units.
    // Even with UTF-8 any digit is ASCII and anything not ASCII (such as the start of
    // a UTF-8 sequence) is not a digit.
    foreach(i; 0 .. s.length)
    {
        dchar c = s[i];
        // We can just take a char instead of decoding because anything non-ASCII is not
        // going to be a decodable digit, i.e. we will end at such a byte.
        if (c < '0' || c >= 0x80)
            break;
        if (radix < 10)
        {
            if (c >= beyond)
                break;
        }
        else
        {
            if (c > '9')
            {
                c |= 0x20;//poorman's tolower
                if (c < 'a' || c >= beyond) { break; }
                // Map 'a'.. onto the values directly following '9'.
                c -= 'a'-10-'0';
            }
        }

        immutable uint digit = c - '0';
        // Exact overflow test done BEFORE multiplying:
        //   v * radix + digit > limit  <=>  v > (limit - digit) / radix
        // v is never negative (no sign is parsed) and digit < radix <= 36 < limit, so
        // none of the operations below can wrap around.
        if (cast(ulong)v > (limit - digit) / radix)
        {
            overflow = true;
            return Target.max;
        }
        v = cast(Target)(v * radix + digit);
        atStart = false;
    }
    assert(!atStart, "Nothing to parse in parse()");
    return v;
}


/// Writes a message into a caller-provided buffer, in the spirit of writef/writefln,
/// but without using the GC.
///
/// C snprintf would be better, but it isn't pure.
/// formattedWrite isn't completely @nogc yet (although it isn't GC-heavy).
///
/// The user has to ensure buffer is long enough - an assert checks that we don't run
/// out of space. Currently only strings (char[] and string) and dchars can be written;
/// any other argument type is rejected at compile time.
///
/// Returns: The written part of buffer.
char[] printNoGC(S...)(char[] buffer, S args) @safe pure nothrow @nogc
{
    auto output = appenderNoGC(buffer);

    foreach(piece; args)
    {
        alias Piece = typeof(piece);
        enum isText = is(Piece == char[]) || is(Piece == string);
        enum isChar = is(Unqual!Piece == dchar);
        static assert(isText || isChar, "printNoGC does not support " ~ Piece.stringof);

        static if(isText) { output.put(piece); }
        else              { output.putDChar(piece); }
    }

    return output.data;
}


/// A UFCS utility function that appends a dchar, encoded as UTF-8, to an
/// AppenderNoGCFixed of chars.
///
/// The char $(B must) be a valid dchar.
void putDChar(ref AppenderNoGCFixed!(char[], char) appender, dchar c)
    @safe pure nothrow @nogc
{
    char[4] encoded;
    size_t used;

    if(c >= 0x80)
    {
        // Should be safe to use as the first thing Reader does is validate everything.
        used = encodeValidCharNoGC(encoded, c);
    }
    else
    {
        // ASCII is a single code unit; the encoder requires us to handle it ourselves.
        encoded[0] = cast(char)c;
        used = 1;
    }

    appender.put(encoded[0 .. used]);
}

/// Convenience function that creates an $(D AppenderNoGCFixed) which uses $(D array)
/// as its storage.
AppenderNoGCFixed!(E[]) appenderNoGC(A : E[], E)(A array)
{
    alias Result = typeof(return);
    return Result(array);
}

/// A gutted, NoGC version of std.array.appender.
///
/// Works on a fixed-size buffer supplied by the user. The buffer is never reallocated;
/// trying to write more than it can hold is a programming error that stops the program
/// (also in release builds) instead of writing past the end of the buffer.
struct AppenderNoGCFixed(A : T[], T)
{
    private struct Data
    {
        // Total number of elements the buffer can hold.
        size_t capacity;
        // The written part of the buffer. Always starts at the start of the buffer and
        // is never longer than capacity.
        Unqual!T[] arr;
    }

    private Data _data;

    @nogc:

    /// Construct an appender that will work with given buffer.
    ///
    /// Data written to the appender will overwrite the buffer from the start.
    this(T[] arr) @trusted pure nothrow
    {
        // Start with an empty slice at the beginning of the given array.
        _data.arr = cast(Unqual!T[])arr[0 .. 0]; //trusted
        _data.capacity = arr.length;
    }

    /**
     * Returns the total number of elements the underlying buffer can hold (the length
     * of the array passed to the constructor). This includes elements already written.
     */
    @property size_t capacity() const @safe pure nothrow
    {
        return _data.capacity;
    }

    /**
     * Returns the managed array, i.e. the part of the buffer written so far.
     */
    @property inout(T)[] data() inout @trusted pure nothrow
    {
        /* @trusted operation:
         * casting Unqual!T[] to inout(T)[]
         */
        return cast(typeof(return))(_data.arr);
    }

    // Ensure we can add nelems elements. The buffer can't grow, so running out of space
    // is fatal.
    private void ensureAddable(size_t nelems) @safe pure nothrow
    {
        // The written length never exceeds the capacity, so the subtraction can't
        // underflow (unlike length + nelems, which could wrap around).
        if(nelems > _data.capacity - _data.arr.length)
        {
            // assert(false) is kept in release builds, unlike an ordinary assert.
            // put() relies on this check before slicing a raw pointer.
            assert(false, "AppenderFixed ran out of space");
        }
    }

    /// Append items to the managed array.
    ///
    /// There must be enough space left in the buffer to store all of the items.
    void put(U)(U[] items) if (is(Unqual!U == T))
    {
        // make sure we have enough space, then add the items
        ensureAddable(items.length);
        immutable len = _data.arr.length;
        immutable newlen = len + items.length;

        // @trusted: ensureAddable() guarantees newlen is within the buffer.
        auto bigDataFun() @trusted nothrow { return _data.arr.ptr[0 .. newlen];}
        auto bigData = bigDataFun();

        bigData[len .. newlen] = items[];

        //We do this at the end, in case of exceptions
        _data.arr = bigData;
    }

    // only allow overwriting data on non-immutable and non-const data
    static if (isMutable!T)
    {
        /**
         * Clears the managed array.  This allows the elements of the array to be reused
         * for appending.
         *
         * Note that clear is disabled for immutable or const element types, due to the
         * possibility that $(D AppenderNoGCFixed) might overwrite immutable data.
         */
        void clear() @safe pure nothrow
        {
            _data.arr = ()@trusted{ return _data.arr.ptr[0 .. 0]; }();
        }
    }
    else
    {
        /// Clear is not available for const/immutable data.
        @disable void clear();
    }
}
unittest
{
    // Append a string, an ASCII dchar and a multi-byte dchar; check the combined output.
    char[256] storage;
    auto sink = appenderNoGC(storage[]);

    sink.put("found unsupported escape character: ");
    sink.putDChar('a');
    sink.putDChar('á');

    immutable expected = "found unsupported escape character: aá";
    assert(sink.data == expected);
}


/// Outcome of a validateUTF8NoGC call.
struct ValidateResult
{
    /// True if the validated string is valid.
    bool valid;

    /// Number of characters in the string.
    ///
    /// For an invalid string, this is the number of valid characters found before
    /// the first invalid sequence.
    size_t characterCount;

    /// Error message with details if the string is not valid.
    string msg;
}

/// Check that a UTF-8 string is well-formed Unicode and count its characters.
///
/// Validation stops at the first invalid sequence.
///
/// See_Also: ValidateResult
ValidateResult validateUTF8NoGC(const(char[]) str) @trusted pure nothrow @nogc
{
    size_t validChars = 0;
    size_t pos = 0;

    while(pos < str.length)
    {
        if(str[pos] >= 0x80)
        {
            // Multi-byte sequence; the decoder advances pos past it on success.
            auto result = decodeUTF8NoGC!(No.validated)(str, pos);
            if(result.errorMessage !is null)
            {
                return ValidateResult(false, validChars, result.errorMessage);
            }
        }
        else
        {
            // ASCII takes one code unit and must be handled here, not by the decoder.
            ++pos;
        }
        ++validChars;
    }

    return ValidateResult(true, validChars);
}

/// @nogc version of std.utf.decode() for char[].
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// validated = If true, assume str is a valid UTF-8 string and don't generate any
///             error-checking code (the checks become asserts). If validated is true,
///             the code units at index $(B must) form a valid UTF-8 sequence,
///             otherwise undefined behavior will occur. Also affects the return type.
/// str       = Will decode the code point starting at index from this string.
/// index     = Index in str where the code point starts. Must be less than str.length
///             (this is not checked). Will be updated to point to the next code point
///             if decoding succeeds.
///
/// Returns: If validated is true, the decoded character.
///          Otherwise a struct with a 'decoded' member - the decoded character, and a
///          'string errorMessage' member that is null on success and otherwise stores
///          the error message.
auto decodeUTF8NoGC(Flag!"validated" validated)(const(char[]) str, ref size_t index)
    @trusted pure nothrow @nogc
{
    // Without validation the result carries an error message; otherwise it is just the
    // decoded character.
    static if(!validated) struct Result
    {
        dchar decoded;
        string errorMessage;
    }
    else alias Result = dchar;

    /// Dchar bitmask for different numbers of UTF-8 code units.
    enum bitMask     = tuple((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);

    // Raw pointer access: nothing below is bounds-checked against str.
    auto pstr = str.ptr + index;

    // Number of code units available from index to the end of str.
    immutable length = str.length - index;
    ubyte fst = pstr[0];

    // ASCII must be handled by the caller.
    assert(fst & 0x80);
    enum invalidUTFMsg = "Invalid UTF-8 sequence";
    // Error result; int.max is used as a placeholder for the decoded character.
    static if(!validated) { enum invalidUTF = Result(cast(dchar)int.max, invalidUTFMsg); }

    // starter must have at least 2 first bits set
    static if(validated)
    {
        assert((fst & 0b1100_0000) == 0b1100_0000, invalidUTFMsg);
    }
    else if((fst & 0b1100_0000) != 0b1100_0000)
    {
        return invalidUTF;
    }

    // Deliberately uninitialized; always assigned in the loop before it is read.
    ubyte tmp = void;
    dchar d = fst; // upper control bits are masked out later
    // Shift the length marker bits of the first byte left; after each continuation byte
    // the top bit tells whether another byte is expected.
    fst <<= 1;


    // i is a compile-time constant here (it is used in static if and as a tuple index
    // below); i is the index of the continuation byte being processed.
    foreach (i; TypeTuple!(1, 2, 3))
    {
        // The sequence must not continue past the end of str.
        static if(validated) { assert(i != length, "Decoding out of bounds"); }
        else if(i == length) { return Result(cast(dchar)int.max, "Decoding out of bounds"); }

        tmp = pstr[i];
        // Continuation bytes must have the form 10xxxxxx.
        static if(validated)          { assert((tmp & 0xC0) == 0x80, invalidUTFMsg); }
        else if((tmp & 0xC0) != 0x80) { return invalidUTF; }

        // Append the 6 payload bits of the continuation byte.
        d = (d << 6) | (tmp & 0x3F);
        fst <<= 1;

        if (!(fst & 0x80)) // no more bytes
        {
            d &= bitMask[i]; // mask out control bits

            // overlong, could have been encoded with i bytes
            static if(validated) { assert((d & ~bitMask[i - 1]) != 0, invalidUTFMsg); }
            else if((d & ~bitMask[i - 1]) == 0) { return invalidUTF; }

            // check for surrogates only needed for 3 bytes
            static if (i == 2)
            {
                static if(validated)      { assert(isValidDchar(d), invalidUTFMsg); }
                else if(!isValidDchar(d)) { return invalidUTF; }
            }

            index += i + 1;
            // Only 4-byte sequences can encode values above dchar.max.
            // NOTE: index has already been advanced if this check fails.
            static if (i == 3)
            {
                static if(validated)   { assert(d <= dchar.max, invalidUTFMsg); }
                else if(d > dchar.max) { return invalidUTF; }
            }

            return Result(d);
        }
    }

    // The first byte announced more than 4 code units, which is never valid.
    static if(validated) { assert(false, invalidUTFMsg); }
    else                 { return invalidUTF; }
}

/// @nogc version of std.utf.decode() for char[], but assumes str is valid UTF-8.
///
/// This is decodeUTF8NoGC with validation disabled: it returns the decoded dchar
/// directly, without any error information.
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// str   = Will decode the code point starting at index from this string. Must be valid
///         UTF-8, otherwise undefined behavior WILL occur.
/// index = Index in str where the code point starts. Will be updated to point to the
///         next code point.
///
/// Returns: The decoded character.
alias decodeValidUTF8NoGC = decodeUTF8NoGC!(Yes.validated);

/// @nogc version of std.utf.encode() for char[].
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
/// validated = If true, assume c is a valid, non-surrogate UTF-32 code point and don't
///             generate any error-checking code. If validated is true, c $(B must) be
///             a valid character, otherwise undefined behavior will occur. Also affects
///             the return type.
/// buf       = Buffer to write the encoded result to.
/// c         = Character to encode.
///
/// Returns: If validated is true, number of bytes the encoded character takes up in buf.
///          Otherwise a struct with a 'bytes' member specifying the number of bytes of
///          the encoded character, and a 'string errorMessage' member that is null
///          if there was no error and otherwise stores the error message.
auto encodeCharNoGC(Flag!"validated" validated)(ref char[4] buf, dchar c)
    @safe pure nothrow @nogc
{
    static if(validated)
    {
        alias Result = size_t;
    }
    else
    {
        struct Result
        {
            size_t bytes;
            string errorMessage;
        }
    }

    // Builds a continuation byte (10xxxxxx) from the low 6 bits of the argument.
    static char trail(uint bits) @safe pure nothrow @nogc
    {
        return cast(char)(0x80 | (bits & 0x3F));
    }

    // Force the caller to optimize ASCII (the 1-byte case)
    assert(c >= 0x80, "Caller should explicitly handle ASCII chars");

    if (c <= 0x7FF)
    {
        // 2 bytes: 110xxxxx 10xxxxxx
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xC0 | (c >> 6));
        buf[1] = trail(c);
        return Result(2);
    }
    else if (c <= 0xFFFF)
    {
        // Surrogates are the only invalid code points in this range.
        immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
        static if(validated)
        {
            assert(!isSurrogate,
                   "Supposedly valid code point is a surrogate code point");
        }
        else
        {
            if(isSurrogate)
            {
                return Result(size_t.max, "Can't encode a surrogate code point in UTF-8");
            }
        }

        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xE0 | (c >> 12));
        buf[1] = trail(c >> 6);
        buf[2] = trail(c);
        return Result(3);
    }
    else if (c <= 0x10FFFF)
    {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        assert(isValidDchar(c));
        buf[0] = cast(char)(0xF0 | (c >> 18));
        buf[1] = trail(c >> 12);
        buf[2] = trail(c >> 6);
        buf[3] = trail(c);
        return Result(4);
    }
    else
    {
        // Anything above 0x10FFFF is not a code point at all.
        assert(!isValidDchar(c));
        static if(validated)
        {
            assert(false, "Supposedly valid code point is invalid");
        }
        else
        {
            return Result(size_t.max, "Can't encode an invalid code point in UTF-8");
        }
    }
}

/// @nogc version of std.utf.encode() for char[], but assumes c is a valid UTF-32 char.
///
/// This is encodeCharNoGC with validation disabled: it returns the byte count directly,
/// without any error information.
///
/// The caller $(B must) handle ASCII (< 0x80) characters manually; this is asserted to
/// force code using this function to be efficient.
///
/// Params:
///
/// buf = Buffer to write the encoded result to.
/// c   = Character to encode. Must be valid UTF-32, otherwise undefined behavior
///       $(B will) occur.
///
/// Returns: Number of bytes the encoded character takes up in buf.
alias encodeValidCharNoGC = encodeCharNoGC!(Yes.validated);

/// @nogc version of std.utf.isValidDchar
///
/// Returns: true if c is at most 0x10FFFF and is not a surrogate (0xD800 to 0xDFFF).
bool isValidDchar(dchar c) @safe pure nothrow @nogc
{
    if(c > 0x10FFFF) { return false; }

    immutable isSurrogate = 0xD800 <= c && c <= 0xDFFF;
    return !isSurrogate;
}