Add a function to count the number of ASCII characters in a string before the first non-ASCII (multi-byte) UTF-8 sequence

This commit is contained in:
Ferdinand Majerech 2014-08-05 19:12:54 +02:00
parent 8902ea8806
commit 1c0702f3cd

View file

@ -758,6 +758,46 @@ bool isPrintableValidUTF8(const char[] chars) @safe pure nothrow @nogc
return true; return true;
} }
/// Counts the number of ASCII characters in buffer before the first non-ASCII
/// code unit (i.e. before the first multi-byte UTF-8 sequence).
///
/// Used to determine how many characters we can process without decoding.
///
/// Params:  buffer = UTF-8 text to scan. May be empty and may start at any
///                   address; no particular alignment is required.
///
/// Returns: The number of leading code units in buffer whose topmost bit is
///          clear. This is buffer.length when every code unit is ASCII.
size_t countASCII(const(char)[] buffer) @trusted pure nothrow @nogc
{
    // The topmost bit in ASCII characters is always 0, so a chunk is pure
    // ASCII exactly when none of these bits are set in it.
    enum ulong HighBits = 0x8080808080808080;
    enum chunkSize = ulong.sizeof;

    size_t count = 0;
    // Fast path: consume 8-byte chunks for as long as they are entirely ASCII.
    while(buffer.length >= chunkSize)
    {
        // Copy the bytes into a local word instead of dereferencing a cast
        // pointer: buffer.ptr may not be suitably aligned for a ulong, and a
        // misaligned load can fault on strict-alignment targets.
        // The per-byte mask makes the check independent of byte order.
        ulong block;
        (cast(char*)&block)[0 .. chunkSize] = buffer[0 .. chunkSize];
        if((block & HighBits) != 0) { break; }
        count += chunkSize;
        buffer = buffer[chunkSize .. $];
    }
    // Here either fewer than 8 bytes remain or a non-ASCII byte lies within
    // the next 8, so this loop runs at most 7 iterations before stopping.
    // Iterating as char visits raw code units; nothing is decoded.
    foreach(const char c; buffer)
    {
        if(c >= 0x80) { break; }
        ++count;
    }
    return count;
}
// Unittests. // Unittests.
void testEndian(R)() void testEndian(R)()