Merge pull request #235 from Herringway/phobos-utf-validation
reimplement isPrintableValidUTF8 using phobos functions
merged-on-behalf-of: Cameron Ross <elpenguino@gmail.com>
This commit is contained in:
commit
6834338736
|
@ -800,109 +800,10 @@ auto toUTF8(ubyte[] input, const UTFEncoding encoding) @safe pure nothrow
|
||||||
/// Determine if all characters (code points, not bytes) in a string are printable.
|
/// Determine if all characters (code points, not bytes) in a string are printable.
|
||||||
bool isPrintableValidUTF8(const char[] chars) @safe pure
|
bool isPrintableValidUTF8(const char[] chars) @safe pure
|
||||||
{
|
{
|
||||||
// This is oversized (only 128 entries are necessary) simply because having 256
|
import std.uni : isControl, isWhite;
|
||||||
// entries improves performance... for some reason (alignment?)
|
foreach (dchar chr; chars)
|
||||||
bool[256] printable = [false, false, false, false, false, false, false, false,
|
|
||||||
false, true, true, false, false, true, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
true, true, true, true, true, true, true, true,
|
|
||||||
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false,
|
|
||||||
false, false, false, false, false, false, false, false];
|
|
||||||
|
|
||||||
for(size_t index; index < chars.length;)
|
|
||||||
{
|
{
|
||||||
// Fast path for ASCII.
|
if (!chr.isValidDchar || (chr.isControl && !chr.isWhite))
|
||||||
// Both this while() block and the if() block below it are optimized, unrolled
|
|
||||||
// versions of the for() block below them; the while()/if() block could be
|
|
||||||
// removed without affecting logic, but both help increase performance.
|
|
||||||
size_t asciiCount = countASCII(chars[index .. $]);
|
|
||||||
// 8 ASCII iterations unrolled, looping while there are more than 8 ASCII chars left.
|
|
||||||
while(asciiCount > 8)
|
|
||||||
{
|
|
||||||
const dchar b0 = chars[index];
|
|
||||||
const dchar b1 = chars[index + 1];
|
|
||||||
const dchar b2 = chars[index + 2];
|
|
||||||
const dchar b3 = chars[index + 3];
|
|
||||||
const dchar b4 = chars[index + 4];
|
|
||||||
const dchar b5 = chars[index + 5];
|
|
||||||
const dchar b6 = chars[index + 6];
|
|
||||||
const dchar b7 = chars[index + 7];
|
|
||||||
|
|
||||||
index += 8;
|
|
||||||
asciiCount -= 8;
|
|
||||||
|
|
||||||
const all = printable[b0] & printable[b1] & printable[b2] & printable[b3] &
|
|
||||||
printable[b4] & printable[b5] & printable[b6] & printable[b1];
|
|
||||||
if(!all)
|
|
||||||
{
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// 4 ASCII iterations unrolled
|
|
||||||
if(asciiCount > 4)
|
|
||||||
{
|
|
||||||
const char b0 = chars[index];
|
|
||||||
const char b1 = chars[index + 1];
|
|
||||||
const char b2 = chars[index + 2];
|
|
||||||
const char b3 = chars[index + 3];
|
|
||||||
|
|
||||||
index += 4;
|
|
||||||
asciiCount -= 4;
|
|
||||||
|
|
||||||
if(!printable[b0]) { return false; }
|
|
||||||
if(!printable[b1]) { return false; }
|
|
||||||
if(!printable[b2]) { return false; }
|
|
||||||
if(!printable[b3]) { return false; }
|
|
||||||
}
|
|
||||||
// Any remaining ASCII chars. This is really the only code needed to handle
|
|
||||||
// ASCII, the above if() and while() blocks are just an optimization.
|
|
||||||
for(; asciiCount > 0; --asciiCount)
|
|
||||||
{
|
|
||||||
const char b = chars[index];
|
|
||||||
++index;
|
|
||||||
if(b >= 0x20) { continue; }
|
|
||||||
if(printable[b]) { continue; }
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if(index == chars.length) { break; }
|
|
||||||
|
|
||||||
// Not ASCII, need to decode.
|
|
||||||
const dchar c = decode(chars, index);
|
|
||||||
// We know c is not ASCII, so only check for printable non-ASCII chars.
|
|
||||||
if(!(c == 0x85 || (c >= 0xA0 && c <= '\uD7FF') ||
|
|
||||||
(c >= '\uE000' && c <= '\uFFFD') ||
|
|
||||||
(c >= '\U00010000' && c <= '\U0010FFFF')))
|
|
||||||
{
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue