Add a function to count the number of ASCII characters in a string before the first non-ASCII (multi-byte) UTF-8 sequence

2014-08-05 19:12:54 +02:00 · 2014-08-05 19:12:54 +02:00 · 1c0702f3cd
commit 1c0702f3cd
parent 8902ea8806
1 changed files with 40 additions and 0 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@@ -758,6 +758,46 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
    return true;
 }
 /// Counts the ASCII characters at the start of buffer, stopping at the first
 /// byte that is not ASCII (a byte with its top bit set, i.e. part of a
 /// multi-byte UTF-8 sequence).
 ///
 /// Used to determine how many characters we can process without decoding.
 ///
 /// Params:  buffer = UTF-8 text to scan. May be empty.
 ///
 /// Returns: Number of leading code units in buffer that are below 0x80.
 ///          This equals buffer.length when the whole buffer is ASCII.
 size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
 {
    // ASCII bytes never have the topmost bit set; every byte belonging to a
    // multi-byte UTF-8 sequence does.
    enum uint nonASCIIBit = 0x80;
    // Number of bytes examined per iteration of the fast path.
    enum size_t chunkSize = 8;

    size_t count = 0;

    // Fast path: examine chunkSize bytes per iteration. OR-ing the bytes
    // together leaves the top bit set iff any byte in the chunk is non-ASCII.
    // The bytes are read individually instead of reinterpreting buffer.ptr as
    // a wider integer, so no misaligned word load is ever performed.
    // count never exceeds buffer.length, so the subtraction cannot wrap.
    while(buffer.length - count >= chunkSize)
    {
        uint merged = 0;
        foreach(char c; buffer[count .. count + chunkSize]) { merged |= c; }
        if(merged & nonASCIIBit) { break; }
        count += chunkSize;
    }

    // Slow path: fewer than chunkSize bytes remain, or the current chunk
    // contains a non-ASCII byte. Walk byte by byte up to the end of the buffer
    // or the first non-ASCII byte, whichever comes first.
    while(count < buffer.length && buffer[count] < nonASCIIBit) { ++count; }

    return count;
 }
 // Unittests.
 void testEndian(R)()