Moved unused, but potentially useful Reader code to dyaml.unused.

2014-07-30 18:38:27 +02:00 · 2014-07-30 18:38:27 +02:00 · 6c15bd95cc
commit 6c15bd95cc
parent e58b092fe1
2 changed files with 162 additions and 157 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@ -604,107 +604,6 @@ private:
 private:
 // Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
 //
 // Params:
 //
 // input    = The UTF-8/16/32 buffer to decode.
 // encoding = Encoding of input.
 //
 // Returns:
 //
 // A struct with the following members:
 //
 // $(D string errorMessage) In case of a decoding error, the error message is stored
 //                          here. If there was no error, errorMessage is NULL. Always
 //                          check this first before using the other members.
 // $(D dchar[] decoded)     A GC-allocated buffer with decoded UTF-32 characters.
 auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
 {
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        dchar[] decoded;
    }
    Result result;
    // Decode input_ if it's encoded as UTF-8 or UTF-16.
    //
    // Params:
    //
    // buffer = The input buffer to decode.
    // result = A Result struct to put decoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
    {
        // End of part of input that contains complete characters that can be decoded.
        const size_t end = endOfLastUTFSequence(input);
        // If end is 0, there are no full chars.
        // This can happen at the end of file if there is an incomplete UTF sequence.
        if(end < input.length)
        {
            result.errorMessage = "Invalid UTF character at the end of input";
            return;
        }
        const srclength = input.length;
        try for(size_t srcpos = 0; srcpos < srclength;)
        {
            const c = input[srcpos];
            if(c < 0x80)
            {
                result.decoded ~= c;
                ++srcpos;
            }
            else
            {
                result.decoded ~= std.utf.decode(input, srcpos);
            }
        }
        catch(UTFException e)
        {
            result.errorMessage = e.msg;
            return;
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in decode(): " ~ e.msg);
        }
    }
    final switch(encoding)
    {
        case UTFEncoding.UTF_8:  decode(cast(char[])input, result); break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            decode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0,
                    "UTF-32 buffer size must be a multiple of 4");
            // No need to decode anything
            result.decoded = cast(dchar[])input;
            break;
    }
    if(result.errorMessage !is null) { return result; }
    // XXX This is risky. We rely on the assumption that the scanner only uses
    // peek() to detect the end of the buffer. Should this cause any bugs, revert.
    //
    // The buffer must be zero terminated for scanner to detect its end.
    // if(result.decoded.empty || result.decoded.back() != '\0')
    // {
    //     result.decoded ~= cast(dchar)'\0';
    // }
    return result;
 }
 // Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
 //
 // Params:
@ -826,62 +725,6 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
    return true;
 }
 // Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
 size_t endOfLastUTFSequence(C)(const C[] buffer)
    @safe pure nothrow @nogc
 {
    static if(is(C == char))
    {
        for(long end = buffer.length - 1; end >= 0; --end)
        {
            const stride = utf8Stride[buffer[cast(size_t)end]];
            if(stride != 0xFF)
            {
                // If stride goes beyond end of the buffer, return end.
                // Otherwise the last sequence ends at buffer.length, so we can
                // return that. (Unless there is an invalid code unit, which is
                // caught at decoding)
                return (stride > buffer.length - end) ? cast(size_t)end : buffer.length;
            }
        }
        return 0;
    }
    else static if(is(C == wchar))
    {
        // TODO this is O(N), which is slow. Find out if we can somehow go
        // from the end backwards with UTF-16.
        size_t end = 0;
        while(end < buffer.length)
        {
            const s = stride(buffer, end);
            if(s + end > buffer.length) { break; }
            end += s;
        }
        return end;
    }
 }
 // UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
 immutable ubyte[256] utf8Stride =
 [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
 ];
 // Unittests.
 import std.stream;
--- a/source/dyaml/unused.d
+++ b/source/dyaml/unused.d
@ -0,0 +1,162 @@
 //          Copyright Ferdinand Majerech 2014.
 // Distributed under the Boost Software License, Version 1.0.
 //    (See accompanying file LICENSE_1_0.txt or copy at
 //          http://www.boost.org/LICENSE_1_0.txt)
 // Code that is currently unused but may be useful for future D:YAML releases
 module dyaml.unused;
 import std.utf;
 import tinyendian;
 // Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
 //
 // Params:
 //
 // input    = The UTF-8/16/32 buffer to decode.
 // encoding = Encoding of input.
 //
 // Returns:
 //
 // A struct with the following members:
 //
 // $(D string errorMessage) In case of a decoding error, the error message is stored
 //                          here. If there was no error, errorMessage is NULL. Always
 //                          check this first before using the other members.
 // $(D dchar[] decoded)     A GC-allocated buffer with decoded UTF-32 characters.
 auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
 {
    // Documented in function ddoc.
    struct Result
    {
        string errorMessage;
        dchar[] decoded;
    }
    Result result;
    // Decode input_ if it's encoded as UTF-8 or UTF-16.
    //
    // Params:
    //
    // buffer = The input buffer to decode.
    // result = A Result struct to put decoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
    {
        // End of part of input that contains complete characters that can be decoded.
        const size_t end = endOfLastUTFSequence(input);
        // If end is 0, there are no full chars.
        // This can happen at the end of file if there is an incomplete UTF sequence.
        if(end < input.length)
        {
            result.errorMessage = "Invalid UTF character at the end of input";
            return;
        }
        const srclength = input.length;
        try for(size_t srcpos = 0; srcpos < srclength;)
        {
            const c = input[srcpos];
            if(c < 0x80)
            {
                result.decoded ~= c;
                ++srcpos;
            }
            else
            {
                result.decoded ~= std.utf.decode(input, srcpos);
            }
        }
        catch(UTFException e)
        {
            result.errorMessage = e.msg;
            return;
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in decode(): " ~ e.msg);
        }
    }
    final switch(encoding)
    {
        case UTFEncoding.UTF_8:  decode(cast(char[])input, result); break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            decode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0,
                    "UTF-32 buffer size must be a multiple of 4");
            // No need to decode anything
            result.decoded = cast(dchar[])input;
            break;
    }
    if(result.errorMessage !is null) { return result; }
    return result;
 }
 // Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
 size_t endOfLastUTFSequence(C)(const C[] buffer)
    @safe pure nothrow @nogc
 {
    static if(is(C == char))
    {
        for(long end = buffer.length - 1; end >= 0; --end)
        {
            const stride = utf8Stride[buffer[cast(size_t)end]];
            if(stride != 0xFF)
            {
                // If stride goes beyond end of the buffer, return end.
                // Otherwise the last sequence ends at buffer.length, so we can
                // return that. (Unless there is an invalid code unit, which is
                // caught at decoding)
                return (stride > buffer.length - end) ? cast(size_t)end : buffer.length;
            }
        }
        return 0;
    }
    else static if(is(C == wchar))
    {
        // TODO this is O(N), which is slow. Find out if we can somehow go
        // from the end backwards with UTF-16.
        size_t end = 0;
        while(end < buffer.length)
        {
            const s = stride(buffer, end);
            if(s + end > buffer.length) { break; }
            end += s;
        }
        return end;
    }
 }
 // UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
 immutable ubyte[256] utf8Stride =
 [
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
 ];