Only validate UTF-8 if we get UTF-8 input (UTF-16/32 validated at conversion)

2014-07-30 22:29:40 +02:00 · 2014-07-30 22:29:40 +02:00 · 4e2c3e6093
commit 4e2c3e6093
parent 6c15bd95cc
1 changed file with 52 additions and 48 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@ -121,14 +121,10 @@ final class Reader

            buffer8_ = utf8Result.utf8;

-            const validateResult = buffer8_.validateUTF8NoGC;
-            enforce(validateResult.valid,
-                    new ReaderException(validateResult.msg ~
-                                        validateResult.sequence.to!string));
+            characterCount_ = utf8Result.characterCount;
            // Check that all characters in buffer are printable.
            enforce(isPrintableValidUTF8(buffer8_),
                    new ReaderException("Special unicode characters are not allowed"));
-            characterCount_ = validateResult.characterCount;

            this.sliceBuilder = SliceBuilder(this);
        }
@ -617,9 +613,10 @@ private:
 // A struct with the following members:
 //
 // $(D string errorMessage)   In case of an error, the error message is stored here. If
-//                          there was no error, errorMessage is NULL. Always check this
-//                          first.
+//                            there was no error, errorMessage is null. Always check
+//                            this first.
 // $(D char[] utf8)           input converted to UTF-8. May be a slice of input.
+// $(D size_t characterCount) Number of characters (code points) in input.
 auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
 {
    // Documented in function ddoc.
@ -627,6 +624,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
    {
        string errorMessage;
        char[] utf8;
+        size_t characterCount;
    }

    Result result;
@ -639,9 +637,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
    // result = A Result struct to put encoded result and any error messages to.
    //
    // On error, result.errorMessage will be set.
-    static void encode(C)(C[] input, ref Result result) @safe pure nothrow
-    {
-        try
+    static void encode(C)(C[] input, ref Result result) @safe pure
    {
        // We can do UTF-32->UTF-8 in place because all UTF-8 sequences are 4 or
        // less bytes.
@ -652,6 +648,7 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
            auto length = 0;
            foreach(dchar c; input)
            {
+                ++result.characterCount;
                // ASCII
                if(c < 0x80)
                {
@ -674,20 +671,23 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
        // Unfortunately we can't do UTF-16 in place so we just use std.conv.to
        else
        {
+            result.characterCount = std.utf.count(input);
            result.utf8 = input.to!(char[]);
        }
    }
-        catch(ConvException e) { result.errorMessage = e.msg; }
-        catch(UTFException e)  { result.errorMessage = e.msg; }
-        catch(Exception e)
-        {
-            assert(false, "Unexpected exception in encode(): " ~ e.msg);
-        }
-    }

-    final switch(encoding)
+    try final switch(encoding)
    {
-        case UTFEncoding.UTF_8:  result.utf8 = cast(char[])input; break;
+        case UTFEncoding.UTF_8:
+            result.utf8 = cast(char[])input;
+            const validateResult = result.utf8.validateUTF8NoGC();
+            if(!validateResult.valid)
+            {
+                result.errorMessage = "UTF-8 validation error: " ~ validateResult.msg ~
+                                      validateResult.sequence.to!string;
+            }
+            result.characterCount = validateResult.characterCount;
+            break;
        case UTFEncoding.UTF_16:
            assert(input.length % 2 == 0, "UTF-16 buffer size must be even");
            encode(cast(wchar[])input, result);
@ -697,8 +697,12 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
            encode(cast(dchar[])input, result);
            break;
    }
-
-    if(result.errorMessage !is null) { return result; }
+    catch(ConvException e) { result.errorMessage = e.msg; }
+    catch(UTFException e)  { result.errorMessage = e.msg; }
+    catch(Exception e)
+    {
+        assert(false, "Unexpected exception in encode(): " ~ e.msg);
+    }

    return result;
 }