Rewritten UTFDecoder into a function.

2014-07-23 01:36:09 +02:00 · 2014-07-23 01:36:09 +02:00 · 0a625ec7a7
parent b060699950
commit 0a625ec7a7
1 changed files with 162 additions and 174 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@ -75,8 +75,9 @@ final class Reader
        uint line_;
        // Current column in file.
        uint column_;
-        // Decoder reading data from file and decoding it to UTF-32.
+
-        UTFFastDecoder decoder_;
+        // Original Unicode encoding of the data.
        Encoding encoding_;
        version(unittest)
        {
@ -95,26 +96,26 @@ final class Reader
        this(Stream stream) @trusted //!nothrow
        {
            auto streamBytes = streamToBytesGC(stream);
-            auto result = fixUTFByteOrder(streamBytes);
+            auto endianResult = fixUTFByteOrder(streamBytes);
-            if(result.bytesStripped > 0)
+            if(endianResult.bytesStripped > 0)
            {
                throw new ReaderException("Size of UTF-16 or UTF-32 input not aligned "
                                          "to 2 or 4 bytes, respectively");
            }
-            version(unittest) { endian_ = result.endian; }
+            version(unittest) { endian_ = endianResult.endian; }
-            decoder_ = UTFFastDecoder(result.array, result.encoding);
+            encoding_ = endianResult.encoding;
            decoder_.decodeAll();
            const msg = decoder_.getAndClearErrorMessage();
            auto decodeResult = decodeUTF(endianResult.array, endianResult.encoding);
            const msg = decodeResult.errorMessage;
            if(msg !is null)
            {
                throw new ReaderException("UTF decoding error: " ~ msg);
            }
-            buffer_ = decoder_.decoded;
+            buffer_ = decodeResult.decoded;
-
+            // The part of buffer_ excluding trailing zeroes.
            // The part of buffer excluding trailing zeroes.
            auto noZeros = buffer_;
            while(!noZeros.empty && noZeros.back == '\0') { noZeros.popBack(); }
            enforce(printable(noZeros[]),
@ -238,191 +239,122 @@ final class Reader
        final size_t charIndex() @safe pure nothrow const @nogc { return charIndex_; }
        /// Get encoding of the input buffer.
-        final Encoding encoding() @safe pure nothrow const @nogc { return decoder_.encoding; }
+        final Encoding encoding() @safe pure nothrow const @nogc { return encoding_; }
 }
 private:
-alias UTFDecoder UTFFastDecoder;
+// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
-
+//
-struct UTFDecoder
+// Params:
 //
 // input    = The UTF-8/16/32 buffer to decode.
 // encoding = Encoding of input.
 // 
 // Returns:
 //
 // A struct with the following members:
 //
 // $(D string errorMessage) In case of a decoding error, the error message is stored
 //                          here. If there was no error, errorMessage is NULL. Always
 //                          check this first before using the other members.
 // $(D dchar[] decoded)     A GC-allocated buffer with decoded UTF-32 characters.
 // $(D size_t maxChars)     XXX reserved for future
 auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
 {
-    private:
+    // Documented in function ddoc.
-        // UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
+    struct Result
-        static immutable ubyte[256] utf8Stride =
+    {
-        [
+        string errorMessage;
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
            2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
            2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
            3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
            4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
        ];
-        // Encoding of the input buffer.
+        dchar[] decoded;
-        UTFEncoding encoding_;
+    }
        // Maximum number of characters that might be in the buffer.
        size_t maxChars_;
        // Part of the input buffer that has not yet been decoded.
        ubyte[] input_;
-        // Decoded (UTF-32) version of the entire input_. If input is UTF-32, this is
+    Result result;
        // just a reference to input_.
        dchar[] decoded_;
-        // Current error message.
+    final switch(encoding)
-        //
+    {
-        // To be fully nothrow, we use return values and the user (Reader) can check
+        case UTFEncoding.UTF_8:  result.maxChars = input.length;     break;
-        // for a detailed error message if they get an error return.
+        case UTFEncoding.UTF_16: result.maxChars = input.length / 2; break;
-        string errorMessage_;
+        case UTFEncoding.UTF_32: result.maxChars = input.length / 2; break;
    }
-    public:
+    // Decode input_ if it's encoded as UTF-8 or UTF-16.
-        /// Construct a UTFBlockDecoder decoding data from a buffer.
+    //
-        this(ubyte[] buffer, UTFEncoding encoding) @safe pure nothrow @nogc
+    // Params:
    //
    // buffer = The input buffer to decode.
    // result = A Result struct to put decoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
    {
        // End of part of input that contains complete characters that can be decoded.
        const size_t end = endOfLastUTFSequence(input);
        // If end is 0, there are no full chars.
        // This can happen at the end of file if there is an incomplete UTF sequence.
        if(end < input.length)
        {
-            input_    = buffer;
+            result.errorMessage = "Invalid UTF character at the end of input";
-            encoding_ = encoding;
+            return;
            final switch(encoding_)
            {
                case UTFEncoding.UTF_8:  maxChars_ = input_.length;     break;
                case UTFEncoding.UTF_16: maxChars_ = input_.length / 2; break;
                case UTFEncoding.UTF_32: maxChars_ = input_.length / 2; break;
            }
        }
-        /// Decode all data passed to the constructor.
+        const srclength = input.length;
-        ///
+        try for(size_t srcpos = 0; srcpos < srclength;)
        /// On error, getAndClearErrorMessage() will return a non-null string.
        void decodeAll() @safe pure nothrow
        {
-            assert(decoded_ is null, "Calling decodeAll more than once");
+            const c = input[srcpos];
-
+            if(c < 0x80)
            final switch(encoding_)
            {
-                case UTFEncoding.UTF_8:  decode(cast(char[])input_); break;
+                result.decoded ~= c;
-                case UTFEncoding.UTF_16:
+                ++srcpos;
                    assert(input_.length % 2 == 0, "UTF-16 buffer size must be even");
                    decode(cast(wchar[])input_);
                    break;
                case UTFEncoding.UTF_32:
                    assert(input_.length % 4 == 0,
                           "UTF-32 buffer size must be a multiple of 4");
                    // No need to decode anything
                    decoded_ = cast(dchar[])input_;
                    break;
            }
-            // XXX This is risky. We rely on the assumption that the scanner only uses 
+            else
            // peek() to detect the end of the buffer. Should this cause any bugs,
            // revert.
            //
            // The buffer must be zero terminated for scanner to detect its end.
            // if(decoded_.empty || decoded_.back() != '\0')
            // {
            //     decoded_ ~= cast(dchar)'\0';
            // }
        }
        /// Get encoding we're decoding from.
        UTFEncoding encoding() const pure @safe nothrow @nogc { return encoding_; }
        /// Get all decoded characters.
        const(dchar[]) decoded() @safe pure nothrow @nogc { return decoded_; }
        /// Get the error message and clear it.
        string getAndClearErrorMessage() @safe pure nothrow @nogc
        {
            const result = errorMessage_;
            errorMessage_ = null;
            return result;
        }
    private:
        // Decode input_ if it's encoded as UTF-8 or UTF-16.
        //
        // On error, errorMessage_ will be set.
        void decode(C)(C[] buffer) @safe pure nothrow
        {
            // End of part of buffer that contains complete characters that can be decoded.
            const size_t end = endOfLastUTFSequence(buffer);
            // If end is 0, there are no full chars.
            // This can happen at the end of file if there is an incomplete UTF sequence.
            if(end < buffer.length)
            {
-                errorMessage_ = "Invalid UTF character at the end of buffer";
+                result.decoded ~= std.utf.decode(input, srcpos);
                return;
            }
            const srclength = buffer.length;
            try for(size_t srcpos = 0; srcpos < srclength;)
            {
                const c = buffer[srcpos];
                if(c < 0x80)
                {
                    decoded_ ~= c;
                    ++srcpos;
                }
                else
                {
                    decoded_ ~= std.utf.decode(buffer, srcpos);
                }
            }
            catch(UTFException e)
            {
                errorMessage_ = e.msg;
                return;
            }
            catch(Exception e)
            {
                assert(false, "Unexpected exception in decode(): " ~ e.msg);
            }
        }
-
+        catch(UTFException e)
        // Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
        size_t endOfLastUTFSequence(C)(const C[] buffer)
            @safe pure nothrow const @nogc
        {
-            static if(is(C == char))
+            result.errorMessage = e.msg;
-            {
+            return;
                for(long end = buffer.length - 1; end >= 0; --end)
                {
                    const stride = utf8Stride[buffer[cast(size_t)end]];
                    if(stride != 0xFF)
                    {
                        // If stride goes beyond end of the buffer, return end.
                        // Otherwise the last sequence ends at buffer.length, so we can
                        // return that. (Unless there is an invalid code point, which is
                        // caught at decoding)
                        return (stride > buffer.length - end) ? cast(size_t)end : buffer.length;
                    }
                }
                return 0;
            }
            else static if(is(C == wchar))
            {
                // TODO this is O(N), which is slow. Find out if we can somehow go
                // from the end backwards with UTF-16.
                size_t end = 0;
                while(end < buffer.length)
                {
                    const s = stride(buffer, end);
                    if(s + end > buffer.length) { break; }
                    end += s;
                }
                return end;
            }
        }
        catch(Exception e)
        {
            assert(false, "Unexpected exception in decode(): " ~ e.msg);
        }
    }
    final switch(encoding)
    {
        case UTFEncoding.UTF_8:  decode(cast(char[])input, result); break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            decode(cast(wchar[])input, result);
            break;
        case UTFEncoding.UTF_32:
            assert(input.length % 4 == 0,
                    "UTF-32 buffer size must be a multiple of 4");
            // No need to decode anything
            result.decoded = cast(dchar[])input;
            break;
    }
    if(result.errorMessage !is null) { return result; }
    // XXX This is risky. We rely on the assumption that the scanner only uses 
    // peek() to detect the end of the buffer. Should this cause any bugs,
    // revert.
    //
    // The buffer must be zero terminated for scanner to detect its end.
    // if(result.decoded.empty || result.decoded.back() != '\0')
    // {
    //     result.decoded ~= cast(dchar)'\0';
    // }
    return result;
 }
 /// Determine if all characters in an array are printable.
 ///
 /// Params:  chars = Characters to check.
@ -443,6 +375,62 @@ bool printable(const dchar[] chars) @safe pure nothrow @nogc
    return true;
 }
 // Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
 size_t endOfLastUTFSequence(C)(const C[] buffer)
    @safe pure nothrow @nogc
 {
    // UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
    static immutable ubyte[256] utf8Stride =
    [
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
        4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
    ];
    static if(is(C == char))
    {
        for(long end = buffer.length - 1; end >= 0; --end)
        {
            const stride = utf8Stride[buffer[cast(size_t)end]];
            if(stride != 0xFF)
            {
                // If stride goes beyond end of the buffer, return end.
                // Otherwise the last sequence ends at buffer.length, so we can
                // return that. (Unless there is an invalid code point, which is
                // caught at decoding)
                return (stride > buffer.length - end) ? cast(size_t)end : buffer.length;
            }
        }
        return 0;
    }
    else static if(is(C == wchar))
    {
        // TODO this is O(N), which is slow. Find out if we can somehow go
        // from the end backwards with UTF-16.
        size_t end = 0;
        while(end < buffer.length)
        {
            const s = stride(buffer, end);
            if(s + end > buffer.length) { break; }
            end += s;
        }
        return end;
    }
 }
 // Unittests.
 import std.stream;