Merge pull request #235 from Herringway/phobos-utf-validation

reimplement isPrintableValidUTF8 using phobos functions

merged-on-behalf-of: Cameron Ross <elpenguino@gmail.com>
2019-03-18 08:32:17 +01:00 · 2019-03-18 08:32:17 +01:00 · 6834338736
commit 6834338736
parent 8a31826124 ff38f20b09
1 changed files with 3 additions and 102 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@@ -800,109 +800,10 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
 /// Determine if all characters (code points, not bytes) in a string are printable.
 bool isPrintableValidUTF8(const char[] chars) @safe pure
 {
-    // This is oversized (only 128 entries are necessary) simply because having 256
-    // entries improves performance... for some reason (alignment?)
-    bool[256] printable = [false, false, false, false, false, false, false, false,
-                           false, true,  true,  false, false, true,  false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-                           true,  true,  true,  true, true,  true,  true,  true,
-
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false,
-                           false, false, false, false, false, false, false, false];
-
-    for(size_t index; index < chars.length;)
+    import std.uni : isControl, isWhite;
+    foreach (dchar chr; chars)
    {
-        // Fast path for ASCII.
-        // Both this while() block and the if() block below it are optimized, unrolled
-        // versions of the for() block below them; the while()/if() block could be
-        // removed without affecting logic, but both help increase performance.
-        size_t asciiCount = countASCII(chars[index .. $]);
-        // 8 ASCII iterations unrolled, looping while there are at most 8 ASCII chars.
-        while(asciiCount > 8)
-        {
-            const dchar b0 = chars[index];
-            const dchar b1 = chars[index + 1];
-            const dchar b2 = chars[index + 2];
-            const dchar b3 = chars[index + 3];
-            const dchar b4 = chars[index + 4];
-            const dchar b5 = chars[index + 5];
-            const dchar b6 = chars[index + 6];
-            const dchar b7 = chars[index + 7];
-
-            index += 8;
-            asciiCount -= 8;
-
-            const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
-                        printable[b4] & printable[b5] & printable[b6] & printable[b1];
-            if(!all)
-            {
-                return false;
-            }
-        }
-        // 4 ASCII iterations unrolled
-        if(asciiCount > 4)
-        {
-            const char b0 = chars[index];
-            const char b1 = chars[index + 1];
-            const char b2 = chars[index + 2];
-            const char b3 = chars[index + 3];
-
-            index += 4;
-            asciiCount -= 4;
-
-            if(!printable[b0]) { return false; }
-            if(!printable[b1]) { return false; }
-            if(!printable[b2]) { return false; }
-            if(!printable[b3]) { return false; }
-        }
-        // Any remaining ASCII chars. This is really the only code needed to handle
-        // ASCII, the above if() and while() blocks are just an optimization.
-        for(; asciiCount > 0; --asciiCount)
-        {
-            const char b = chars[index];
-            ++index;
-            if(b >= 0x20)    { continue; }
-            if(printable[b]) { continue; }
-            return false;
-        }
-
-        if(index == chars.length) { break; }
-
-        // Not ASCII, need to decode.
-        const dchar c = decode(chars, index);
-        // We now c is not ASCII, so only check for printable non-ASCII chars.
-        if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
-            (c >= '\uE000' && c <= '\uFFFD') ||
-            (c >= '\U00010000' && c <= '\U0010FFFF')))
+        if (!chr.isValidDchar || (chr.isControl && !chr.isWhite))
        {
            return false;
        }