UTF-8 validation now reuses the UTF-8 decoding code (decodeUTF8NoGC) instead of a separate hand-written validator.

2014-08-02 01:37:16 +02:00 · 2014-08-02 01:37:16 +02:00 · 7b699c5903
parent b5da695d6b
commit 7b699c5903
2 changed files with 10 additions and 93 deletions
--- a/source/dyaml/nogcutil.d
+++ b/source/dyaml/nogcutil.d
@ -233,20 +233,12 @@ struct ValidateResult
    /// Is the validated string valid?
    bool   valid;
    /// Number of characters in the string.
+    ///
+    /// If the string is not valid, this is the number of valid characters before
+    /// hitting the first invalid sequence.
    size_t characterCount;
    /// If the string is not valid, error message with details is here.
    string msg;
-    /// If the string is not valid, the first invalid sequence of bytes is here.
-    const(uint)[] sequence() @safe pure nothrow const @nogc
-    {
-        return sequenceBuffer[0 .. sequenceLength];
-    }
-
-private:
-    // Buffer for the invalid sequence of bytes if valid == false.
-    uint[4] sequenceBuffer;
-    // Number of used bytes in sequenceBuffer.
-    size_t  sequenceLength;
 }

 /// Validate a UTF-8 string, checking if it is well-formed Unicode.
@ -265,88 +257,12 @@ ValidateResult validateUTF8NoGC(const(char[]) str) @trusted pure nothrow @nogc
            continue;
        }

-        // The following encodings are valid, except for the 5 and 6 byte combinations:
-        //  0xxxxxxx
-        //  110xxxxx 10xxxxxx
-        //  1110xxxx 10xxxxxx 10xxxxxx
-        //  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-        //  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-        //  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-
-        // Dchar bitmask for different numbers of UTF-8 code units.
-        import std.typecons;
-        enum bitMask     = tuple((1 << 7) - 1, (1 << 11) - 1, (1 << 16) - 1, (1 << 21) - 1);
-        auto pstr        = str.ptr + index;
-        immutable length = str.length - index;
-        ubyte fst        = pstr[0];
-
-        static ValidateResult error(const(char[]) str, string msg) @safe pure nothrow @nogc
+        auto decoded = decodeUTF8NoGC!(No.validated)(str, index);
+        if(decoded.errorMessage !is null)
        {
-            ValidateResult result;
-            size_t i;
-
-            do
-            {
-                result.sequenceBuffer[i] = str[i];
-            } while (++i < str.length && i < 4 && (str[i] & 0xC0) == 0x80);
-
-            result.valid          = false;
-            result.msg            = msg;
-            result.sequenceLength = i;
-            return result;
+            return ValidateResult(false, characterCount, decoded.errorMessage);
        }
-
-        static ValidateResult invalidUTF(const(char[]) str) @safe pure nothrow @nogc
-        {
-            return error(str, "Invalid UTF-8 sequence");
-        }
-        static ValidateResult outOfBounds(const(char[]) str) @safe pure nothrow @nogc
-        {
-            return error(str, "Attempted to decode past the end of a string");
-        }
-
-        assert(fst & 0x80);
-        // starter must have at least 2 first bits set
-        if((fst & 0b1100_0000) != 0b1100_0000) { return invalidUTF(pstr[0 .. length]); }
-        ubyte tmp = void;
-        dchar d = fst; // upper control bits are masked out later
-        fst <<= 1;
-
-        foreach (i; TypeTuple!(1, 2, 3))
-        {
-            if(i == length) { return outOfBounds(pstr[0 .. length]); }
-
-            tmp = pstr[i];
-
-            if ((tmp & 0xC0) != 0x80) { return invalidUTF(pstr[0 .. length]); }
-
-            d = (d << 6) | (tmp & 0x3F);
-            fst <<= 1;
-
-            if (!(fst & 0x80)) // no more bytes
-            {
-                d &= bitMask[i]; // mask out control bits
-
-                // overlong, could have been encoded with i bytes
-                if ((d & ~bitMask[i - 1]) == 0) { return invalidUTF(pstr[0 .. length]); }
-
-                // check for surrogates only needed for 3 bytes
-                static if(i == 2)
-                {
-                    if (!isValidDchar(d)) { return invalidUTF(pstr[0 .. length]); }
-                }
-
-                ++characterCount;
-                index += i + 1;
-                static if(i == 3)
-                {
-                    if (d > dchar.max) { return invalidUTF(pstr[0 .. length]); }
-                }
-                continue outer;
-            }
-        }
-
-        return invalidUTF(pstr[0 .. length]);
+        ++characterCount;
    }

    return ValidateResult(true, characterCount);
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@ -682,8 +682,9 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
            const validateResult = result.utf8.validateUTF8NoGC();
            if(!validateResult.valid)
            {
-                result.errorMessage = "UTF-8 validation error: " ~ validateResult.msg ~
-                                      validateResult.sequence.to!string;
+                result.errorMessage = "UTF-8 validation error after character #" ~
+                                      validateResult.characterCount.to!string ~ ": " ~
+                                      validateResult.msg;
            }
            result.characterCount = validateResult.characterCount;
            break;