From 1c0702f3cd051fd593aff24e1e132ffd6cafa215 Mon Sep 17 00:00:00 2001
From: Ferdinand Majerech
Date: Tue, 5 Aug 2014 19:12:54 +0200
Subject: [PATCH] Func to count the num of ASCII chars in string before the first UTF-8 sequence

---
 source/dyaml/reader.d | 40 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/source/dyaml/reader.d b/source/dyaml/reader.d
index 00a7db7..a506335 100644
--- a/source/dyaml/reader.d
+++ b/source/dyaml/reader.d
@@ -758,6 +758,46 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
     return true;
 }
 
+/// Counts the leading ASCII characters in buffer, stopping at the first byte that
+/// belongs to a multi-byte UTF-8 sequence (any byte with its top bit set).
+///
+/// Used to determine how many characters can be processed without decoding.
+///
+/// Returns: Number of ASCII characters at the start of buffer; 0 if it is empty.
+size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
+{
+    // ASCII bytes never have the top bit set, so masking it off changes nothing.
+    enum ulong  Mask8 = 0x7f7f7f7f7f7f7f7f;
+    enum uint   Mask4 = 0x7f7f7f7f;
+    enum ushort Mask2 = 0x7f7f;
+
+    // The result is however much of the buffer gets sliced off the front below.
+    const total = buffer.length;
+
+    // Consume whole 8-byte words for as long as every byte in them is ASCII.
+    for(; buffer.length >= Mask8.sizeof; buffer = buffer[Mask8.sizeof .. $])
+    {
+        const word = *cast(typeof(Mask8)*)buffer.ptr;
+        if((word & Mask8) != word) { break; }
+    }
+
+    // Fewer than 8 ASCII bytes remain; narrow down with one 4-byte and one
+    // 2-byte probe. The foreach is unrolled at compile time, once per mask.
+    import std.typetuple;
+    foreach(Mask; TypeTuple!(Mask4, Mask2))
+    {
+        if(buffer.length >= Mask.sizeof)
+        {
+            const chunk = *cast(typeof(Mask)*)buffer.ptr;
+            if((chunk & Mask) == chunk) { buffer = buffer[Mask.sizeof .. $]; }
+        }
+    }
+
+    // At most one more ASCII byte can follow the probes above; check it alone.
+    if(buffer.length > 0 && buffer[0] < 0x80) { buffer = buffer[1 .. $]; }
+
+    return total - buffer.length;
+}
 // Unittests.
 
 void testEndian(R)()