Moved unused, but potentially useful Reader code to dyaml.unused.

2014-07-30 18:38:27 +02:00 · 2014-07-30 18:38:27 +02:00 · 6c15bd95cc
commit 6c15bd95cc
parent e58b092fe1
2 changed files with 162 additions and 157 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@ -604,107 +604,6 @@ private:
 private:


-// Decode an UTF-8/16/32 buffer to UTF-32 (for UTF-32 this does nothing).
-//
-// Params:
-//
-// input    = The UTF-8/16/32 buffer to decode.
-// encoding = Encoding of input.
-//
-// Returns:
-//
-// A struct with the following members:
-//
-// $(D string errorMessage) In case of a decoding error, the error message is stored
-//                          here. If there was no error, errorMessage is NULL. Always
-//                          check this first before using the other members.
-// $(D dchar[] decoded)     A GC-allocated buffer with decoded UTF-32 characters.
-auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
-{
-    // Documented in function ddoc.
-    struct Result
-    {
-        string errorMessage;
-        dchar[] decoded;
-    }
-
-    Result result;
-
-    // Decode input_ if it's encoded as UTF-8 or UTF-16.
-    //
-    // Params:
-    //
-    // buffer = The input buffer to decode.
-    // result = A Result struct to put decoded result and any error messages to.
-    //
-    // On error, result.errorMessage will be set.
-    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
-    {
-        // End of part of input that contains complete characters that can be decoded.
-        const size_t end = endOfLastUTFSequence(input);
-        // If end is 0, there are no full chars.
-        // This can happen at the end of file if there is an incomplete UTF sequence.
-        if(end < input.length)
-        {
-            result.errorMessage = "Invalid UTF character at the end of input";
-            return;
-        }
-
-        const srclength = input.length;
-        try for(size_t srcpos = 0; srcpos < srclength;)
-        {
-            const c = input[srcpos];
-            if(c < 0x80)
-            {
-                result.decoded ~= c;
-                ++srcpos;
-            }
-            else
-            {
-                result.decoded ~= std.utf.decode(input, srcpos);
-            }
-        }
-        catch(UTFException e)
-        {
-            result.errorMessage = e.msg;
-            return;
-        }
-        catch(Exception e)
-        {
-            assert(false, "Unexpected exception in decode(): " ~ e.msg);
-        }
-    }
-
-    final switch(encoding)
-    {
-        case UTFEncoding.UTF_8:  decode(cast(char[])input, result); break;
-        case UTFEncoding.UTF_16:
-            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
-            decode(cast(wchar[])input, result);
-            break;
-        case UTFEncoding.UTF_32:
-            assert(input.length % 4 == 0,
-                    "UTF-32 buffer size must be a multiple of 4");
-            // No need to decode anything
-            result.decoded = cast(dchar[])input;
-            break;
-    }
-
-    if(result.errorMessage !is null) { return result; }
-
-    // XXX This is risky. We rely on the assumption that the scanner only uses
-    // peek() to detect the end of the buffer. Should this cause any bugs, revert.
-    //
-    // The buffer must be zero terminated for scanner to detect its end.
-    // if(result.decoded.empty || result.decoded.back() != '\0')
-    // {
-    //     result.decoded ~= cast(dchar)'\0';
-    // }
-
-    return result;
-}
-
-
 // Convert a UTF-8/16/32 buffer to UTF-8, in-place if possible.
 //
 // Params:
@ -826,62 +725,6 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
    return true;
 }

-// Determine the end of last UTF-8 or UTF-16 sequence in a raw buffer.
-size_t endOfLastUTFSequence(C)(const C[] buffer)
-    @safe pure nothrow @nogc
-{
-    static if(is(C == char))
-    {
-        for(long end = buffer.length - 1; end >= 0; --end)
-        {
-            const stride = utf8Stride[buffer[cast(size_t)end]];
-            if(stride != 0xFF)
-            {
-                // If stride goes beyond end of the buffer, return end.
-                // Otherwise the last sequence ends at buffer.length, so we can
-                // return that. (Unless there is an invalid code unit, which is
-                // caught at decoding)
-                return (stride > buffer.length - end) ? cast(size_t)end : buffer.length;
-            }
-        }
-        return 0;
-    }
-    else static if(is(C == wchar))
-    {
-        // TODO this is O(N), which is slow. Find out if we can somehow go
-        // from the end backwards with UTF-16.
-        size_t end = 0;
-        while(end < buffer.length)
-        {
-            const s = stride(buffer, end);
-            if(s + end > buffer.length) { break; }
-            end += s;
-        }
-        return end;
-    }
-}
-
-// UTF-8 codepoint strides (0xFF are codepoints that can't start a sequence).
-immutable ubyte[256] utf8Stride =
-[
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
-    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
-    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
-];
-
 // Unittests.

 import std.stream;
--- a/source/dyaml/unused.d
+++ b/source/dyaml/unused.d
@ -0,0 +1,162 @@
+//          Copyright Ferdinand Majerech 2014.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE_1_0.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+
+// Code that is currently unused but may be useful for future D:YAML releases
+module dyaml.unused;
+
+
+
+import std.utf;
+
+import tinyendian;
+
+// Decode a UTF-8/16/32 buffer to UTF-32 (UTF-32 input is reinterpreted, not decoded).
+//
+// Params:
+//
+// input    = The UTF-8/16/32 buffer to decode. For UTF-16 its length must be even,
+//            for UTF-32 a multiple of 4 (both are checked by assertions).
+// encoding = Encoding of input.
+//
+// Returns:
+//
+// A struct with the following members:
+//
+// $(D string errorMessage) In case of a decoding error, the error message is stored
+//                          here. If there was no error, errorMessage is NULL. Always
+//                          check this first before using the other members.
+// $(D dchar[] decoded)     The decoded UTF-32 characters. Built by appending for
+//                          UTF-8/16 input; for UTF-32 input this is a cast of input
+//                          itself, not a copy. If errorMessage is set, this may hold
+//                          only the characters decoded before the error.
+auto decodeUTF(ubyte[] input, UTFEncoding encoding) @safe pure nothrow
+{
+    // Documented in the function comment above.
+    struct Result
+    {
+        string errorMessage;
+        dchar[] decoded;
+    }
+
+    Result result;
+
+    // Decode input if it's encoded as UTF-8 or UTF-16.
+    //
+    // Params:
+    //
+    // input  = The input buffer to decode (char[] for UTF-8, wchar[] for UTF-16).
+    // result = A Result struct to put decoded result and any error messages to.
+    //
+    // On error, result.errorMessage will be set.
+    static void decode(C)(C[] input, ref Result result) @safe pure nothrow
+    {
+        // End of the part of input that consists of complete sequences.
+        const size_t end = endOfLastUTFSequence(input);
+        // Anything past end is an incomplete sequence. This can happen at the
+        // end of file if the input was cut off in the middle of a character.
+        if(end < input.length)
+        {
+            result.errorMessage = "Invalid UTF character at the end of input";
+            return;
+        }
+
+        const srclength = input.length;
+        try for(size_t srcpos = 0; srcpos < srclength;)
+        {
+            const c = input[srcpos];
+            // Fast path: a code unit below 0x80 is a complete character by itself.
+            if(c < 0x80)
+            {
+                result.decoded ~= c;
+                ++srcpos;
+            }
+            // Slow path: std.utf.decode advances srcpos past the whole sequence.
+            else
+            {
+                result.decoded ~= std.utf.decode(input, srcpos);
+            }
+        }
+        catch(UTFException e)
+        {
+            result.errorMessage = e.msg;
+            return;
+        }
+        // Needed to keep this function nothrow; no other exception is expected.
+        catch(Exception e)
+        {
+            assert(false, "Unexpected exception in decode(): " ~ e.msg);
+        }
+    }
+
+    final switch(encoding)
+    {
+        case UTFEncoding.UTF_8:  decode(cast(char[])input, result); break;
+        case UTFEncoding.UTF_16:
+            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
+            decode(cast(wchar[])input, result);
+            break;
+        case UTFEncoding.UTF_32:
+            assert(input.length % 4 == 0,
+                    "UTF-32 buffer size must be a multiple of 4");
+            // No need to decode anything
+            result.decoded = cast(dchar[])input;
+            break;
+    }
+
+    // Returned as-is on both success and failure; callers tell the two apart
+    // through result.errorMessage.
+    return result;
+}
+
+
+// Determine the end of the last complete UTF-8 or UTF-16 sequence in a raw buffer.
+//
+// Params:
+//
+// buffer = Code units to examine (char for UTF-8, wchar for UTF-16).
+//
+// Returns:
+//
+// Index one past the last complete sequence. This is buffer.length unless the
+// buffer ends with a truncated sequence, in which case it is the index where that
+// truncated sequence starts. For UTF-8, 0 is returned if no code unit in the buffer
+// can start a sequence (this includes an empty buffer).
+//
+// Only sequence lengths are checked here; invalid code units are left for the
+// decoder to report.
+size_t endOfLastUTFSequence(C)(const C[] buffer)
+    @safe pure nothrow @nogc
+{
+    static if(is(C == char))
+    {
+        // Walk backwards to the last code unit that can start a sequence.
+        // The index is an unsigned size_t counted down from buffer.length, so an
+        // empty buffer simply skips the loop. (Starting a signed counter at
+        // buffer.length - 1 wraps around for an empty buffer and only ends up
+        // negative where size_t is as wide as the counter.)
+        foreach_reverse(end; 0 .. buffer.length)
+        {
+            const seqLength = utf8Stride[buffer[end]];
+            // 0xFF marks a code unit that can't start a sequence; keep looking.
+            if(seqLength == 0xFF) { continue; }
+
+            // If the sequence goes beyond end of the buffer, return end.
+            // Otherwise the last sequence ends at buffer.length, so we can
+            // return that. (Unless there is an invalid code unit, which is
+            // caught at decoding)
+            return (seqLength > buffer.length - end) ? end : buffer.length;
+        }
+        return 0;
+    }
+    else static if(is(C == wchar))
+    {
+        // TODO this is O(N), which is slow. Find out if we can somehow go
+        // from the end backwards with UTF-16.
+        size_t end = 0;
+        while(end < buffer.length)
+        {
+            // Length (in code units) of the sequence starting at end.
+            const s = stride(buffer, end);
+            // The sequence would run past the buffer: it is truncated.
+            if(s + end > buffer.length) { break; }
+            end += s;
+        }
+        return end;
+    }
+    else
+    {
+        static assert(false, "endOfLastUTFSequence only supports char and wchar buffers");
+    }
+}
+
+// UTF-8 sequence lengths, indexed by the value of the first byte of a sequence.
+//
+// utf8Stride[b] is the number of bytes in a sequence that starts with byte b, or
+// 0xFF if b can't start a sequence. Each row below covers 16 byte values:
+//
+// 0x00-0x7F: 1
+// 0x80-0xBF: 0xFF
+// 0xC0-0xDF: 2
+// 0xE0-0xEF: 3
+// 0xF0-0xF7: 4
+// 0xF8-0xFB: 5
+// 0xFC-0xFD: 6
+// 0xFE-0xFF: 0xFF
+//
+// NOTE(review): lengths 5 and 6 come from the original UTF-8 definition and are not
+// valid under RFC 3629; this table only gives lengths and does not validate.
+immutable ubyte[256] utf8Stride =
+[
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,0xFF,0xFF,
+];