Function to count the number of ASCII characters in a string before the first non-ASCII (multi-byte) UTF-8 sequence

2014-08-05 19:12:54 +02:00 · 2014-08-05 19:12:54 +02:00 · 1c0702f3cd
commit 1c0702f3cd
parent 8902ea8806
1 changed file with 40 additions and 0 deletions
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@@ -758,6 +758,46 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
    return true;
 }

+/// Counts the leading ASCII characters in buffer, i.e. the bytes before the first
+/// byte of a non-ASCII (multi-byte) UTF-8 sequence.
+///
+/// Used to determine how many characters we can process without decoding.
+///
+/// Params:  buffer = UTF-8 text to scan; may be empty.
+/// Returns: Number of bytes at the start of buffer that are below 0x80
+///          (buffer.length if all of buffer is ASCII).
+size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
+{
+    // The topmost bit in ASCII characters is always 0, so a word holds only
+    // ASCII bytes exactly when none of these bits is set in it.
+    enum ulong highBits = 0x8080808080808080;
+    enum wordSize = ulong.sizeof;
+    const total = buffer.length;
+    size_t count = 0;
+
+    // Word loads need a runtime pointer cast (not available in CTFE) and are
+    // done on aligned addresses only, as some targets fault on unaligned reads.
+    if(!__ctfe)
+    {
+        // Go byte by byte until buffer.ptr + count is word-aligned.
+        while(count < total && cast(size_t)(buffer.ptr + count) % wordSize != 0)
+        {
+            if(buffer[count] >= 0x80) { return count; }
+            ++count;
+        }
+        // Check aligned 8-byte chunks; the condition keeps each load in bounds.
+        while(total - count >= wordSize)
+        {
+            const word = *cast(const(ulong)*)(buffer.ptr + count);
+            if((word & highBits) != 0) { break; }
+            count += wordSize;
+        }
+    }
+
+    // Test the remaining bytes (or locate the non-ASCII byte in a failed chunk).
+    while(count < total && buffer[count] < 0x80) { ++count; }
+    return count;
+}
 // Unittests.

 void testEndian(R)()